## 04 Joining data with pandas

In [9]:
import pandas as pd

# Load the taxi owner and taxi vehicle tables from pickle files (paths are relative).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
taxi_owners = pd.read_pickle('taxi_owners.p')
taxi_veh = pd.read_pickle('taxi_vehicles.p')

# Preview the first two rows of the owners table
print(taxi_owners.head(2))

     rid   vid           owner                 address    zip
0  T6285  6285  AGEAN TAXI LLC     4536 N. ELSTON AVE.  60630
1  T4862  4862    MANGIB CORP.  5717 N. WASHTENAW AVE.  60659


In [8]:
# Preview the first two rows of the vehicles table to compare its columns with taxi_owners
print(taxi_veh.head(2))

    vid    make  model  year fuel_type           owner
0  2767  TOYOTA  CAMRY  2013    HYBRID  SEYED M. BADRI
1  1411  TOYOTA   RAV4  2017    HYBRID     DESZY CORP.


## First inner join

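Both DataFrames contain a `vid` column and an `owner` column. `vid` is the key that links an owner record to its vehicle, so we merge on `vid`. Because `owner` also appears in both tables, the merged result contains two `owner` columns, which pandas distinguishes with suffixes.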

In [11]:
# Join owners (left) to vehicles (right) on the shared 'vid' key.
# No `how` is given, so pandas performs its default inner join.
taxi_own_veh = pd.merge(taxi_owners, taxi_veh, on='vid')

# List the merged columns; the overlapping 'owner' column is
# disambiguated with the default '_x' / '_y' suffixes.
print(taxi_own_veh.columns)


Index(['rid', 'vid', 'owner_x', 'address', 'zip', 'make', 'model', 'year',
       'fuel_type', 'owner_y'],
      dtype='object')


In [12]:
# Same join as before, but name the overlapping 'owner' columns after
# their source table. `suffixes` is a (left, right) pair of strings,
# passed here as a tuple.
taxi_own_veh = pd.merge(
    taxi_owners,
    taxi_veh,
    on='vid',
    suffixes=('_own', '_veh'),
)

# List the merged columns to confirm the custom suffixes were applied
print(taxi_own_veh.columns)


Index(['rid', 'vid', 'owner_own', 'address', 'zip', 'make', 'model', 'year',
       'fuel_type', 'owner_veh'],
      dtype='object')


In [13]:
# Count the merged rows per fuel_type. value_counts sorts in descending
# order by default, so the most popular fuel type is listed first.
print(taxi_own_veh['fuel_type'].value_counts())

HYBRID                    2792
GASOLINE                   611
FLEX FUEL                   89
COMPRESSED NATURAL GAS      27
Name: fuel_type, dtype: int64


## Inner joins and number of rows returned

In [14]:
# Load the ward and census tables from pickle files (paths are relative).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
wards = pd.read_pickle('ward.p')
census = pd.read_pickle('census.p')

In [17]:
# (rows, columns) of the wards table, for comparison with the merged result
print(wards.shape)

(50, 4)


In [18]:
# (rows, columns) of the census table, for comparison with the merged result
print(census.shape)

(50, 6)


In [19]:
# Join wards (left) to census (right) on their common 'ward' key.
# No `how` is given, so pandas performs its default inner join.
wards_census = pd.merge(wards, census, on='ward')

# Report the (rows, columns) of the joined table
print('wards_census table shape:', wards_census.shape)

wards_census table shape: (50, 9)


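Next, alter the first `ward` value to create `wards_altered`, then merge `wards_altered` with `census` on the `ward` column. An inner join keeps only rows whose key appears in both tables, so notice the difference in the number of rows returned.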

In [20]:
# Show the first five 'ward' keys before any value is altered
print(wards['ward'].head())


0    1
1    2
2    3
3    4
4    5
Name: ward, dtype: object


In [21]:
# Alter only the first value in the ward column, leaving `wards` itself untouched.
# DataFrame.replace(to_replace=..., value=...) would swap that value in *every*
# column and row where it occurs, so instead copy the frame and overwrite
# exactly one cell, addressed by position (row 0, column 'ward').
# NOTE(review): the replacement is the integer 61, as in the original cell, while
# the column is dtype object -- confirm whether the string '61' was intended.
wards_altered = wards.copy()
wards_altered.iloc[0, wards_altered.columns.get_loc('ward')] = 61
print(wards_altered['ward'].head())


0    61
1     2
2     3
3     4
4     5
Name: ward, dtype: object


In [22]:
# Join the altered wards (left) to census (right) on the 'ward' key.
# No `how` is given, so pandas performs its default inner join.
wards_altered_census = pd.merge(wards_altered, census, on='ward')

# Report the (rows, columns) of the joined table
print('wards_altered_census table shape:', wards_altered_census.shape)

wards_altered_census table shape: (49, 9)


## One to many relationships