## Slicing data by values from an external object (list, dict, Series, DataFrame)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Demo frame, plus an external list whose elements will be matched against column 'a'.
df = pd.DataFrame(
    {
        'a': [0, 1, 10, 11, 12],
        'b': [5, 4, 3, 2, 1],
        'c': ['X', 'Y', 'Y', 'Y', 'Z'],
    }
)
our_list = [1, 2, 12]
df.head()


Unnamed: 0,a,b,c
0,0,5,X
1,1,4,Y
2,10,3,Y
3,11,2,Y
4,12,1,Z


In [9]:
# Slice: keep the rows whose 'a' value is one of the elements of our_list.
# The @ prefix makes query() resolve the name as a Python variable, not a column.
df.query('a in @our_list')

Unnamed: 0,a,b,c
1,1,4,Y
4,12,1,Z


In [8]:
# Same demo frame as before, rebuilt here so the cell does not rely on earlier state.
df = pd.DataFrame(
    {
        'a': [0, 1, 10, 11, 12],
        'b': [5, 4, 3, 2, 1],
        'c': ['X', 'Y', 'Y', 'Y', 'Z'],
    }
)
our_dict = {0: 10, 1: 11, 2: 12}

# Slice: `in` against a dict matches its keys (0, 1, 2), not its values.
print(df.query('a in @our_dict'))

   a  b  c
0  0  5  X
1  1  4  Y


In [10]:
# With a Series as the lookup, membership is tested against its values (10, 11, 12).
our_series = pd.Series(data=[10, 11, 12])
df.query('a in @our_series')

Unnamed: 0,a,b,c
2,10,3,Y
3,11,2,Y
4,12,1,Z


In [11]:
# Slice: keep the rows whose 'a' value is one of the Series index labels
# (0, 1, or 2 here) rather than one of the Series values.
df.query('a in @our_series.index')

Unnamed: 0,a,b,c
0,0,5,X
1,1,4,Y


In [12]:
# A second frame; the query below uses only its default integer index (0, 1, 2).
our_df = pd.DataFrame(
    {
        'a1': [2, 4, 6],
        'b1': [3, 2, 2],
        'c1': ['A', 'B', 'C'],
    }
)

# Slice: keep the rows whose 'a' value is one of our_df's index labels (0, 1, or 2).
print(df.query('a in @our_df.index'))

   a  b  c
0  0  5  X
1  1  4  Y


In [13]:
# Slice: keep the rows whose 'b' value occurs in column b1 of the our_df DataFrame
# (i.e., 3 or 2).
print(df.query('b in @our_df.b1'))

    a  b  c
2  10  3  Y
3  11  2  Y


In [None]:
# Flag abnormally fast / slow visits, then drop the gas stations where at least
# half of the visits are too fast.
# NOTE(review): `data` is not defined in the visible part of this notebook — it must
# be loaded before this cell can run.
data['too_fast'] = data['time_spent'] < 60
data['too_slow'] = data['time_spent'] > 1000

# The mean of a boolean column is the share of True values, i.e. the fraction of
# too-fast visits per id.
too_fast_stat = data.pivot_table(index='id', values='too_fast', aggfunc='mean')
good_ids = too_fast_stat.query('too_fast < 0.5')
good_data = data.query('id in @good_ids.index')

# Row counts before and after removing the flagged ids.
print(len(data), len(good_data), sep='\n')

In [None]:
# Drop the individual visits that are too fast (< 60) or too slow (> 1000).
# The result has to be assigned back to good_data: the original cell stored it under
# the misspelled name `ood_data`, so the filter was discarded and the length printed
# below was still the unfiltered one.
good_data = good_data.query('time_spent >= 60 and time_spent <= 1000')
print(len(good_data))

In [None]:
# Median time spent per station id: once over all data, once over the filtered data.
station_stat = data.pivot_table(
    index='id',
    values='time_spent',
    aggfunc='median',
)
good_stations_stat = good_data.pivot_table(
    index='id',
    values='time_spent',
    aggfunc='median',
)

# Distribution of the per-station medians after filtering.
good_stations_stat.hist(bins=50)

In [14]:
# Median time spent per 'name', printed from the smallest median to the largest.
# Requires good_data from the filtering cells above; running this cell before them
# raises NameError.
good_stat = good_data.pivot_table(
    index='name',
    values='time_spent',
    aggfunc='median',
)
print(good_stat.sort_values('time_spent'))

NameError: name 'good_data' is not defined

## Adding a column

In [15]:
# Two frames that share the default integer index 0..4.
df1 = pd.DataFrame(
    {
        'a': [1, 2, 3, 3, 3],
        'b': ['Q', 'R', 'S', 'T', 'U'],
    }
)
df2 = pd.DataFrame(
    {
        'c': [3, 4, 5, 6, 7],
        'd': ['V', 'W', 'X', 'Y', 'Z'],
        'e': [3, 3, 3, 3, 3],
    }
)
# Show both frames separated by one blank line.
print(df1, df2, sep='\n\n')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

   c  d  e
0  3  V  3
1  4  W  3
2  5  X  3
3  6  Y  3
4  7  Z  3


In [16]:
# Both frames have the same index (0..4), so column 'd' is copied over row by row.
df1['new'] = df2['d']
# Blank line first, then the updated frame.
print('', df1, sep='\n')


   a  b new
0  1  Q   V
1  2  R   W
2  3  S   X
3  3  T   Y
4  3  U   Z


In [17]:
# This time df1 already contains a column called 'new'.
df1 = pd.DataFrame(
    {
        'a': [1, 2, 3, 3, 3],
        'new': ['Q', 'R', 'S', 'T', 'U'],
    }
)
df2 = pd.DataFrame(
    {
        'c': [3, 4, 5, 6, 7],
        'd': ['V', 'W', 'X', 'Y', 'Z'],
        'e': [3, 3, 3, 3, 3],
    }
)
print(df1, df2, sep='\n\n')

# Assigning to an existing column name replaces its values.
df1['new'] = df2['d']
print('', df1, sep='\n')

   a new
0  1   Q
1  2   R
2  3   S
3  3   T
4  3   U

   c  d  e
0  3  V  3
1  4  W  3
2  5  X  3
3  6  Y  3
4  7  Z  3

   a new
0  1   V
1  2   W
2  3   X
3  3   Y
4  3   Z


In [18]:
df1 = pd.DataFrame(
    {
        'a': [1, 2, 3, 3, 3],
        'b': ['Q', 'R', 'S', 'T', 'U'],
    }
)
# df2 is indexed by column 'c', so its labels are 3..7 instead of 0..4.
df2 = pd.DataFrame(
    {
        'c': [3, 4, 5, 6, 7],
        'd': ['V', 'W', 'X', 'Y', 'Z'],
        'e': [3, 3, 3, 3, 3],
    }
).set_index('c')
print(df1, df2, sep='\n\n')

# Values are matched by index label: only labels 3 and 4 exist in both frames,
# so every other row of df1 gets NaN.
df1['new'] = df2['d']
print('', df1, sep='\n')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

   d  e
c      
3  V  3
4  W  3
5  X  3
6  Y  3
7  Z  3

   a  b  new
0  1  Q  NaN
1  2  R  NaN
2  3  S  NaN
3  3  T    V
4  3  U    W


In [19]:
# Continue with df1 / df2 from the previous cell: index df1 by column 'a',
# which gives it the repeated label 3.
df1 = df1.set_index('a')
print(df1, df2, sep='\n\n')
print()

# Alignment is by label: each df1 row labelled 3 receives df2's value for label 3,
# while labels 1 and 2 have no match in df2 and get NaN.
df1['new'] = df2['d']
print(df1)

   b  new
a        
1  Q  NaN
2  R  NaN
3  S  NaN
3  T    V
3  U    W

   d  e
c      
3  V  3
4  W  3
5  X  3
6  Y  3
7  Z  3

   b  new
a        
1  Q  NaN
2  R  NaN
3  S    V
3  T    V
3  U    V


In [21]:
df1 = pd.DataFrame({'a': [1, 2, 3, 3, 3], 'b': ['Q', 'R', 'S', 'T', 'U']})
df2 = pd.DataFrame({'c': [3, 4, 5, 6, 7], 'd': ['V', 'W', 'X', 'Y', 'Z'], 'e': [3, 3, 3, 3, 3]})
# Index df2 by column 'e', whose values are all 3: every label is a duplicate.
df2.set_index('e', inplace=True)
print(df1)
print()
print(df2)
print()
try:
    # The assignment has to reindex df2['d'] onto df1's index, which is impossible
    # when the source index contains duplicate labels.
    df1['new'] = df2['d']
except ValueError:
    # Catch only the reindexing error pandas raises here; a bare `except:` would also
    # hide unrelated failures (typos, KeyboardInterrupt) behind this message.
    print('There is a duplicate axis.')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

   c  d
e      
3  3  V
3  4  W
3  5  X
3  6  Y
3  7  Z

There is a duplicate axis.


In [22]:
df1 = pd.DataFrame(
    {
        'a': [1, 2, 3, 3, 3],
        'b': ['Q', 'R', 'S', 'T', 'U'],
    }
)
print(df1)

# Index df1 by 'a' (labels 1, 2, 3, 3, 3); the Series keeps its default labels 0..4.
df1 = df1.set_index('a')
series = pd.Series(data=[1, 2, 3, 4, 5])
print('', series, sep='\n')

# A Series is aligned by label, not by position: label 1 -> 2, label 2 -> 3,
# and every row labelled 3 -> 4.
df1['new'] = series
print('', df1, sep='\n')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

0    1
1    2
2    3
3    4
4    5
dtype: int64

   b  new
a        
1  Q    2
2  R    3
3  S    4
3  T    4
3  U    4


In [24]:
df1 = pd.DataFrame(
    {
        'a': [1, 2, 3, 3, 3],
        'b': ['Q', 'R', 'S', 'T', 'U'],
    }
).set_index('a')
print(df1)

# A plain list has no index, so df1's labels are ignored and the values are
# written into 'new' in positional order.
list_values = [1, 2, 3, 4, 5]
df1['new'] = list_values
print('', df1, sep='\n')

   b
a   
1  Q
2  R
3  S
3  T
3  U

   b  new
a        
1  Q    1
2  R    2
3  S    3
3  T    4
3  U    5
