## Slice of data from an external dictionary

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# Values we will look for in column 'a'.
our_list = [1, 2, 12]

# Small demo table: two numeric columns and one label column.
df = pd.DataFrame(dict(
    a=[0, 1, 10, 11, 12],
    b=[5, 4, 3, 2, 1],
    c=['X', 'Y', 'Y', 'Y', 'Z'],
))
df.head()


Unnamed: 0,a,b,c
0,0,5,X
1,1,4,Y
2,10,3,Y
3,11,2,Y
4,12,1,Z


In [9]:
# Keep only the rows whose value in column 'a' is one of the elements of our_list.
# The @ prefix lets the query string refer to a Python variable.
df.query('a in @our_list')

Unnamed: 0,a,b,c
1,1,4,Y
4,12,1,Z


In [3]:
our_dict = {0: 10, 1: 11, 2: 12}
df = pd.DataFrame(dict(
    a=[0, 1, 10, 11, 12],
    b=[5, 4, 3, 2, 1],
    c=['X', 'Y', 'Y', 'Y', 'Z'],
))

# With a dictionary, `in` is tested against the keys (0, 1, 2), not the values.
rows_matching_keys = df.query('a in @our_dict')
print(rows_matching_keys)

    a  b  c
0   0  5  X
1   1  4  Y
2  10  3  Y
3  11  2  Y
4  12  1  Z

   a  b  c
0  0  5  X
1  1  4  Y


In [10]:
# A Series can be referenced from query() as well: `in` is tested against its
# values (10, 11, 12), so the rows where 'a' is 10, 11 or 12 are kept.
our_series = pd.Series([10, 11, 12])
df.query('a in @our_series')

Unnamed: 0,a,b,c
2,10,3,Y
3,11,2,Y
4,12,1,Z


In [11]:
# Keep the rows whose value in column 'a' equals one of the index labels of
# our_series (i.e. 0, 1, or 2); the Series values are not used here.
df.query('a in @our_series.index')

Unnamed: 0,a,b,c
0,0,5,X
1,1,4,Y


In [5]:
our_df = pd.DataFrame(dict(
    a1=[2, 4, 6],
    b1=[3, 2, 2],
    c1=['A', 'B', 'C'],
))

# Show both tables, then the rows of df whose 'a' value is one of the index
# labels of our_df (i.e., 0, 1, or 2).
print(df, our_df, df.query('a in @our_df.index'), sep='\n\n')

    a  b  c
0   0  5  X
1   1  4  Y
2  10  3  Y
3  11  2  Y
4  12  1  Z

   a1  b1 c1
0   2   3  A
1   4   2  B
2   6   2  C

   a  b  c
0  0  5  X
1  1  4  Y


In [13]:
# Keep the rows whose value in column 'b' equals one of the values in column 'b1'
# of the our_df DataFrame (i.e., 3 or 2).
print(df.query('b in @our_df.b1'))

    a  b  c
2  10  3  Y
3  11  2  Y


In [None]:
# filter abnormally fast and slow visits and gas stations
# Flag visits that look abnormally short (< 60) or abnormally long (> 1000).
visit_time = data['time_spent']
data['too_fast'] = visit_time.lt(60)
data['too_slow'] = visit_time.gt(1000)

# pivot_table aggregates with the mean by default, so each row holds the share
# of too-fast visits for one id; keep the ids where that share is below one half.
too_fast_stat = data.pivot_table(index='id', values='too_fast')
good_ids = too_fast_stat.query('too_fast < 0.5')
good_data = data.query('id in @good_ids.index')

# Row counts before and after dropping the suspicious ids.
print(len(data), len(good_data), sep='\n')

In [None]:
# Drop the individual visits that are too fast or too slow.
# The result must be stored back into good_data: the original assigned it to a
# misspelled name (ood_data), so the filter was silently discarded and the
# length printed below was the unfiltered one.
good_data = good_data.query('time_spent >= 60 and time_spent <= 1000')
print(len(good_data))

In [None]:
# consider data by individual gas station and by chains
# Median visit time per id, computed once on all data and once on the
# filtered data, so the two can be compared.
median_by_id = dict(index='id', values='time_spent', aggfunc='median')
station_stat = data.pivot_table(**median_by_id)
good_stations_stat = good_data.pivot_table(**median_by_id)

# Distribution of the per-id medians after filtering.
good_stations_stat.hist(bins=50)

In [14]:
# Median visit time per name, shortest first.
# This cell needs good_data from the filtering cells above; the recorded
# NameError shows it was run in a kernel where those cells had not been executed.
good_stat = good_data.pivot_table(index='name', values='time_spent', aggfunc='median')
print(good_stat.sort_values('time_spent'))

NameError: name 'good_data' is not defined

## Adding a column

In [15]:
# Two small tables that both use the default 0..4 index.
df1 = pd.DataFrame(dict(a=[1, 2, 3, 3, 3], b=list('QRSTU')))
df2 = pd.DataFrame(dict(c=[3, 4, 5, 6, 7], d=list('VWXYZ'), e=[3] * 5))
print(df1, df2, sep='\n\n')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

   c  d  e
0  3  V  3
1  4  W  3
2  5  X  3
3  6  Y  3
4  7  Z  3


In [16]:
# Copy column 'd' of df2 into a new column of df1. Both tables share the
# index 0..4, so every row receives the value with the same index label.
df1['new'] = df2['d']
print('', df1, sep='\n')


   a  b new
0  1  Q   V
1  2  R   W
2  3  S   X
3  3  T   Y
4  3  U   Z


In [17]:
# This time df1 already has a column called 'new'.
df1 = pd.DataFrame(dict(a=[1, 2, 3, 3, 3], new=list('QRSTU')))
df2 = pd.DataFrame(dict(c=[3, 4, 5, 6, 7], d=list('VWXYZ'), e=[3] * 5))
print(df1, df2, sep='\n\n')

# Assigning to an existing column name replaces its contents.
df1['new'] = df2['d']
print('', df1, sep='\n')

   a new
0  1   Q
1  2   R
2  3   S
3  3   T
4  3   U

   c  d  e
0  3  V  3
1  4  W  3
2  5  X  3
3  6  Y  3
4  7  Z  3

   a new
0  1   V
1  2   W
2  3   X
3  3   Y
4  3   Z


In [18]:
df1 = pd.DataFrame(dict(a=[1, 2, 3, 3, 3], b=list('QRSTU')))
# df2 is now indexed by the values of column 'c' (3..7) instead of 0..4.
df2 = pd.DataFrame(
    dict(c=[3, 4, 5, 6, 7], d=list('VWXYZ'), e=[3] * 5)
).set_index('c')
print(df1, df2, sep='\n\n')

# Values are matched by index label: only labels 3 and 4 exist in both
# tables, so the other rows of df1 get NaN.
df1['new'] = df2['d']
print('', df1, sep='\n')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

   d  e
c      
3  V  3
4  W  3
5  X  3
6  Y  3
7  Z  3

   a  b  new
0  1  Q  NaN
1  2  R  NaN
2  3  S  NaN
3  3  T    V
4  3  U    W


In [19]:
# Index df1 by column 'a'; the label 3 now occurs three times.
df1 = df1.set_index('a')
print(df1, df2, sep='\n\n', end='\n\n')

# Every row of df1 labelled 3 receives the df2 value stored under label 3;
# labels 1 and 2 do not exist in df2 and stay NaN.
df1['new'] = df2['d']
print(df1)

   b  new
a        
1  Q  NaN
2  R  NaN
3  S  NaN
3  T    V
3  U    W

   d  e
c      
3  V  3
4  W  3
5  X  3
6  Y  3
7  Z  3

   b  new
a        
1  Q  NaN
2  R  NaN
3  S    V
3  T    V
3  U    V


In [21]:
df1 = pd.DataFrame({'a': [1, 2, 3, 3, 3], 'b': ['Q', 'R', 'S', 'T', 'U']})
df2 = pd.DataFrame({'c': [3, 4, 5, 6, 7], 'd': ['V', 'W', 'X', 'Y', 'Z'], 'e': [3, 3, 3, 3, 3]})
# Index df2 by column 'e', whose values are all 3: every index label is a duplicate.
df2.set_index('e', inplace=True)
print(df1)
print()
print(df2)
print()
try:
    # pandas cannot decide which of the five rows labelled 3 to use, so the
    # assignment is rejected.
    df1['new'] = df2['d']
except ValueError:
    # Catch only the duplicate-label error raised by pandas; a bare `except:`
    # would also hide unrelated failures and swallow KeyboardInterrupt.
    print('There is a duplicate axis.')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

   c  d
e      
3  3  V
3  4  W
3  5  X
3  6  Y
3  7  Z

There is a duplicate axis.


In [22]:
df1 = pd.DataFrame(dict(a=[1, 2, 3, 3, 3], b=list('QRSTU')))
print(df1)
df1 = df1.set_index('a')

# A Series carries its own index (0..4 here).
series = pd.Series([1, 2, 3, 4, 5])
print('', series, sep='\n')

# Assignment matches df1's index labels (1, 2, 3, 3, 3) against the Series
# index, so the values stored under labels 1, 2 and 3 are used.
df1['new'] = series
print('', df1, sep='\n')

   a  b
0  1  Q
1  2  R
2  3  S
3  3  T
4  3  U

0    1
1    2
2    3
3    4
4    5
dtype: int64

   b  new
a        
1  Q    2
2  R    3
3  S    4
3  T    4
3  U    4


In [24]:
df1 = pd.DataFrame(dict(a=[1, 2, 3, 3, 3], b=list('QRSTU'))).set_index('a')
print(df1)

# A plain list has no index, so df1's index labels play no role:
# the values are written into 'new' row by row, in order.
list_values = list(range(1, 6))
df1['new'] = list_values
print('', df1, sep='\n')

   b
a   
1  Q
2  R
3  S
3  T
3  U

   b  new
a        
1  Q    1
2  R    2
3  S    3
3  T    4
3  U    5


## Combining data from two tables

In [1]:
import pandas as pd
# Ten stations: the first has far more check-ins and a much shorter time
# than the other nine.
gas_stations = pd.DataFrame({
    'Name': ['Gas' + suffix for suffix in (
        'One', 'Two', 'Three', 'Four', 'Five',
        'Six', 'Seven', 'Eight', 'Nine', 'Ten',
    )],
    'Num_check_in': [10000, 100, 110, 105, 113, 102, 125, 116, 109, 120],
    'Time': [60] + [180] * 9,
})
print(gas_stations)

       Name  Num_check_in  Time
0    GasOne         10000    60
1    GasTwo           100   180
2  GasThree           110   180
3   GasFour           105   180
4   GasFive           113   180
5    GasSix           102   180
6  GasSeven           125   180
7  GasEight           116   180
8   GasNine           109   180
9    GasTen           120   180


In [2]:
# Compare the mean and the median of the per-station times.
refueling_time = gas_stations['Time']
print('Mean refueling time:', refueling_time.mean())
print('Median refueling time:', refueling_time.median())

Mean refueling time: 168.0
Median refueling time: 180.0


In [3]:
#The pivot_table method groups data, and the aggfunc parameter indicates what needs to be done with it.
import pandas as pd
# One row per (breakfast, ingredient) pair.
df = pd.DataFrame({
    'breakfast': ['omelette'] * 3 + ['sandwich'] * 3,
    'ingredients': ['eggs', 'milk', 'salt', 'bread', 'ham', 'cheese'],
})
print(df)

  breakfast ingredients
0  omelette        eggs
1  omelette        milk
2  omelette        salt
3  sandwich       bread
4  sandwich         ham
5  sandwich      cheese


In [4]:
# Group the rows by 'breakfast' and keep the first 'ingredients' value of each group.
df.pivot_table(index='breakfast', values='ingredients', aggfunc='first')

Unnamed: 0_level_0,ingredients
breakfast,Unnamed: 1_level_1
omelette,eggs
sandwich,bread


In [5]:
# Group the rows by 'breakfast' and keep the last 'ingredients' value of each group.
df.pivot_table(index='breakfast', values='ingredients', aggfunc='last')

Unnamed: 0_level_0,ingredients
breakfast,Unnamed: 1_level_1
omelette,salt
sandwich,cheese


In [None]:
# For every 'id' in good_data, aggregate the 'name' column with two functions:
# 'first' and 'count'. A list in aggfunc applies each listed function.
id_name = good_data.pivot_table(index='id', values='name', aggfunc=['first', 'count'])

## Renaming columns

In [None]:
# Generic renaming pattern: assign a list of new names to the columns attribute.
# NOTE(review): presumably a placeholder example -- the `df` built in the cells
# above has two columns, so a list of three names would not fit it; confirm
# before running this cell as-is.
df.columns = ['column_name_1', 'column_name_2', 'column_name_3']

In [None]:
# Give the two aggregated columns of id_name flat, readable names,
# then preview the first rows.
id_name.columns = ['name', 'count']
print(id_name.head())

## Combining columns using merge() and join()

In [None]:
# Attach the name and the count from id_name to the per-id median table.
# Both tables are indexed by 'id', so the values are matched by id.
# The table built earlier is called good_stations_stat (plural "stations");
# the original spelling good_station_stat does not exist in this notebook.
good_stations_stat['name'] = id_name['name']
good_stations_stat['count'] = id_name['count']

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reading lists of two pupils, one (author, title) record per book.
book_columns = ['author', 'title']
first_pupil_df = pd.DataFrame(
    [
        ('Alcott', 'Little Women'),
        ('Fitzgerald', 'The Great Gatsby'),
        ('Steinbeck', 'Of Mice and Men'),
        ('Twain', 'The Adventures of Tom Sawyer'),
        ('Hemingway', 'The Old Man and the Sea'),
    ],
    columns=book_columns,
)
second_pupil_df = pd.DataFrame(
    [
        ('Steinbeck', 'East of Eden'),
        ('Twain', 'The Adventures of Huckleberry Finn'),
        ('Hemingway', 'For Whom the Bell Tolls'),
        ('Salinger', 'The Catcher in the Rye'),
        ('Hawthorne', 'The Scarlett Letter'),
    ],
    columns=book_columns,
)
print(first_pupil_df, second_pupil_df, sep='\n\n')

       author                         title
0      Alcott                  Little Women
1  Fitzgerald              The Great Gatsby
2   Steinbeck               Of Mice and Men
3       Twain  The Adventures of Tom Sawyer
4   Hemingway       The Old Man and the Sea

      author                               title
0  Steinbeck                        East of Eden
1      Twain  The Adventures of Huckleberry Finn
2  Hemingway             For Whom the Bell Tolls
3   Salinger              The Catcher in the Rye
4  Hawthorne                 The Scarlett Letter


In [3]:
# The on parameter names the column used to match the rows of the two DataFrames.
# The 'title' column exists in both tables, so it comes back twice:
# title_x (from first_pupil_df) and title_y (from second_pupil_df).
first_pupil_df.merge(second_pupil_df, on='author')

Unnamed: 0,author,title_x,title_y
0,Steinbeck,Of Mice and Men,East of Eden
1,Twain,The Adventures of Tom Sawyer,The Adventures of Huckleberry Finn
2,Hemingway,The Old Man and the Sea,For Whom the Bell Tolls


The result contains only those authors who are present in both DataFrames.
This mode of merging is called inner, because the result is the intersection of the two tables (only the records whose key is present in both DataFrames).
inner is the default merging mode in the merge() method.
Its opposite is outer, the union of the two tables (the records whose key is present in either of the two DataFrames). The merging mode is set by the how parameter.

In [4]:
# how='outer' keeps every author found in either DataFrame; a title that is
# absent on one side is left missing in that column.
first_pupil_df.merge(second_pupil_df, on='author', how='outer')

Unnamed: 0,author,title_x,title_y
0,Alcott,Little Women,
1,Fitzgerald,The Great Gatsby,
2,Steinbeck,Of Mice and Men,East of Eden
3,Twain,The Adventures of Tom Sawyer,The Adventures of Huckleberry Finn
4,Hemingway,The Old Man and the Sea,For Whom the Bell Tolls
5,Salinger,,The Catcher in the Rye
6,Hawthorne,,The Scarlett Letter


In [5]:
# how='left' keeps every author of first_pupil_df (the DataFrame merge() is
# called on) and adds the second pupil's title where the author matches.
first_pupil_df.merge(second_pupil_df, on='author', how='left')

Unnamed: 0,author,title_x,title_y
0,Alcott,Little Women,
1,Fitzgerald,The Great Gatsby,
2,Steinbeck,Of Mice and Men,East of Eden
3,Twain,The Adventures of Tom Sawyer,The Adventures of Huckleberry Finn
4,Hemingway,The Old Man and the Sea,For Whom the Bell Tolls


In [6]:
# how='right' keeps every author of second_pupil_df (the DataFrame passed as
# the argument) and adds the first pupil's title where the author matches.
first_pupil_df.merge(second_pupil_df, on='author', how='right')

Unnamed: 0,author,title_x,title_y
0,Steinbeck,Of Mice and Men,East of Eden
1,Twain,The Adventures of Tom Sawyer,The Adventures of Huckleberry Finn
2,Hemingway,The Old Man and the Sea,For Whom the Bell Tolls
3,Salinger,,The Catcher in the Rye
4,Hawthorne,,The Scarlett Letter


In [7]:
# suffixes replaces the default _x / _y endings that are appended to the
# column names shared by both tables (here: 'title').
first_pupil_df.merge(second_pupil_df, on='author', how='left', suffixes=('_wrote the first', '_wrote the second'))

Unnamed: 0,author,title_wrote the first,title_wrote the second
0,Alcott,Little Women,
1,Fitzgerald,The Great Gatsby,
2,Steinbeck,Of Mice and Men,East of Eden
3,Twain,The Adventures of Tom Sawyer,The Adventures of Huckleberry Finn
4,Hemingway,The Old Man and the Sea,For Whom the Bell Tolls


The join() method is similar to the merge() method.
Without the on parameter, join() matches rows by the indices of the first and second DataFrames.
If a column name is passed to the on parameter, join() takes that column from the first DataFrame and compares its values with the index of the second DataFrame.
Unlike merge(), join() uses how='left' by default.
Instead of a single suffixes parameter, join() has two independent ones: lsuffix ("left suffix") and rsuffix ("right suffix").
It's also possible to combine more than two tables using the join() method:
    they are passed as a list in place of the second DataFrame.

In [11]:
df1 = pd.DataFrame(dict(a=[1, 2, 3, 4], b=list('ABCD')))
df2 = pd.DataFrame(dict(a=[2] * 4, c=list('EFGH')))
print(df1, df2, sep='\n\n', end='\n\n')

# join() compares the values of df1['a'] with the index of df2 (0..3), not
# with df2['a']: a=1 picks row 1 of df2, a=2 row 2, a=3 row 3, and a=4 has no
# match. rsuffix renames df2's own 'a' column so the names do not clash.
joined = df1.join(df2, on='a', rsuffix='_y')
print(joined['c'])  # prints the c column

   a  b
0  1  A
1  2  B
2  3  C
3  4  D

   a  c
0  2  E
1  2  F
2  2  G
3  2  H

0      F
1      G
2      H
3    NaN
Name: c, dtype: object


In [None]:
 #consider the chains' results from the gas stations' results,
# but not average visits to all of a chain's gas stations
# Keep only the rows of station_stat_full with count > 30, then compute for
# each name the median of time_spent and the number of rows behind it.
# NOTE(review): station_stat_full and stat are not defined in the visible part
# of this notebook -- confirm they are created before this cell runs.
well_visited_stations = station_stat_full.query('count > 30')
good_stat2 = well_visited_stations.pivot_table(
    index='name',
    values='time_spent',
    aggfunc=['median', 'count'],
)
good_stat2.columns = ['median_time', 'stations']

# Left merge: every row of stat is kept; names without a match in good_stat2
# get missing values in the two new columns.
final_stat = stat.merge(good_stat2, on='name', how='left')
print(final_stat)