In [1]:
import pandas as pd

## Combine Side by Side

Merge or Join: Merge can do everything and more than Join

In [2]:
df1 = pd.DataFrame(
    {
      'person': ['Person 1', 'Person 2', 'Person 3'],
      'Age': ['22', '22', '23'],
      'city': ['City 1', 'City 2', 'City 2']
    })

display(df1)

Unnamed: 0,person,Age,city
0,Person 1,22,City 1
1,Person 2,22,City 2
2,Person 3,23,City 2


In [3]:
df2 = pd.DataFrame({
                    'city': ['City 1', 'City 2', 'City 3'],
                    'state': ['ST1', 'ST2', 'ST3']
                   })

display(df2)

Unnamed: 0,city,state
0,City 1,ST1
1,City 2,ST2
2,City 3,ST3


### Merge

In [4]:
# The function recognizes that city is the same for both df, and automatically joins using this column as a key.
pd.merge(df1, df2)

Unnamed: 0,person,Age,city,state
0,Person 1,22,City 1,ST1
1,Person 2,22,City 2,ST2
2,Person 3,23,City 2,ST2


In [5]:
pd.merge(df1, df2)

Unnamed: 0,person,Age,city,state
0,Person 1,22,City 1,ST1
1,Person 2,22,City 2,ST2
2,Person 3,23,City 2,ST2


### Merge with two keys

In [6]:
df1 = pd.DataFrame(
    {
      'person': ['Person 1', 'Person 2', 'Person 3'],
      'Age': ['22', '22', '23'],
      'city': ['City 1', 'City 2', 'City 2'],
       'id' : [1, 2, 3]
    })

display(df1)

Unnamed: 0,person,Age,city,id
0,Person 1,22,City 1,1
1,Person 2,22,City 2,2
2,Person 3,23,City 2,3


In [7]:
df2 = pd.DataFrame({
                    'id' : [1, 2, 3],
                    'city': ['City 1', 'City 2', 'City 3'],
                    'state': ['ST1', 'ST2', 'ST3']
                   })

display(df2)

Unnamed: 0,id,city,state
0,1,City 1,ST1
1,2,City 2,ST2
2,3,City 3,ST3


In [8]:
# Row where person 3 is placed is removed, because id == 3 but, city is city 2.
# in df2, the city in this is case is city 3. It does not match.
# in order to be added, it must matches all the shared colluns
pd.merge(df1, df2)

Unnamed: 0,person,Age,city,id,state
0,Person 1,22,City 1,1,ST1
1,Person 2,22,City 2,2,ST2


In [9]:
# To consider only one collunm, we must specify it on a argument
# _x = left df, _y = right df
pd.merge(df1, df2, on='id') # to specify two, ['id', 'city']

Unnamed: 0,person,Age,city_x,id,city_y,state
0,Person 1,22,City 1,1,City 1,ST1
1,Person 2,22,City 2,2,City 2,ST2
2,Person 3,23,City 2,3,City 3,ST3


In [10]:
pd.merge(df1, df2, on='id', suffixes=['_l', '_r']) 

Unnamed: 0,person,Age,city_l,id,city_r,state
0,Person 1,22,City 1,1,City 1,ST1
1,Person 2,22,City 2,2,City 2,ST2
2,Person 3,23,City 2,3,City 3,ST3


In [11]:
pd.merge(df1, df2, on='id', suffixes=['_l', '_r'], left_index=True, right_index=True)

Unnamed: 0,person,Age,city_l,id,city_r,state
0,Person 1,22,City 1,1,City 1,ST1
1,Person 2,22,City 2,2,City 2,ST2
2,Person 3,23,City 2,3,City 3,ST3


In [12]:
df3 = pd.DataFrame(
    {
      'person': ['Person 1', 'Person 2', 'Person 3'],
      'Age': ['22', '22', '23'],
      'city_1': ['City 1', 'City 2', 'City 2'],
       'id' : [1, 2, 3]
    })

display(df3)

Unnamed: 0,person,Age,city_1,id
0,Person 1,22,City 1,1
1,Person 2,22,City 2,2
2,Person 3,23,City 2,3


In [13]:
df4 = pd.DataFrame(
    {
      'city_2': ['City 1', 'City 2', 'City 3'],
       'id' : [1, 2, 3]
    })

display(df4)

Unnamed: 0,city_2,id
0,City 1,1
1,City 2,2
2,City 3,3


In [14]:
pd.merge(df3, df4, suffixes=['_l', '_r'], left_on=['id', 'city_1'], right_on=['id','city_2'])

Unnamed: 0,person,Age,city_1,id,city_2
0,Person 1,22,City 1,1,City 1
1,Person 2,22,City 2,2,City 2


### Outer join

In [15]:
pd.merge(df1, df2, on=['id', 'city'], suffixes=['_l', '_r'], how='outer') 

Unnamed: 0,person,Age,city,id,state
0,Person 1,22.0,City 1,1,ST1
1,Person 2,22.0,City 2,2,ST2
2,Person 3,23.0,City 2,3,
3,,,City 3,3,ST3


### Inner join

In [16]:
pd.merge(df1, df2, on= ['id', 'city'], suffixes=['_l', '_r'], how='inner') 

Unnamed: 0,person,Age,city,id,state
0,Person 1,22,City 1,1,ST1
1,Person 2,22,City 2,2,ST2


### Left join

In [17]:
pd.merge(df1, df2, on= ['id', 'city'], suffixes=['_l', '_r'], how='left') 

Unnamed: 0,person,Age,city,id,state
0,Person 1,22,City 1,1,ST1
1,Person 2,22,City 2,2,ST2
2,Person 3,23,City 2,3,


### Right join

In [18]:
pd.merge(df1, df2, on= ['id', 'city'], suffixes=['_l', '_r'], how='right') 

Unnamed: 0,person,Age,city,id,state
0,Person 1,22.0,City 1,1,ST1
1,Person 2,22.0,City 2,2,ST2
2,,,City 3,3,ST3


## Concatenation (or append)

In [19]:
pd.concat([df1, df3]) # index is not reset, it's 0,1,2, 0,1,2

Unnamed: 0,person,Age,city,id,city_1
0,Person 1,22,City 1,1,
1,Person 2,22,City 2,2,
2,Person 3,23,City 2,3,
0,Person 1,22,,1,City 1
1,Person 2,22,,2,City 2
2,Person 3,23,,3,City 2


In [20]:
pd.concat([df1, df3], ignore_index=True)

Unnamed: 0,person,Age,city,id,city_1
0,Person 1,22,City 1,1,
1,Person 2,22,City 2,2,
2,Person 3,23,City 2,3,
3,Person 1,22,,1,City 1
4,Person 2,22,,2,City 2
5,Person 3,23,,3,City 2


In [21]:
# keep index col
df = pd.concat([df1, df3])
df.reset_index(inplace=True)
display(df)

Unnamed: 0,index,person,Age,city,id,city_1
0,0,Person 1,22,City 1,1,
1,1,Person 2,22,City 2,2,
2,2,Person 3,23,City 2,3,
3,0,Person 1,22,,1,City 1
4,1,Person 2,22,,2,City 2
5,2,Person 3,23,,3,City 2


In [22]:
# discard index col
df = pd.concat([df1, df3])
df.reset_index(inplace=True, drop=True)
display(df)

Unnamed: 0,person,Age,city,id,city_1
0,Person 1,22,City 1,1,
1,Person 2,22,City 2,2,
2,Person 3,23,City 2,3,
3,Person 1,22,,1,City 1
4,Person 2,22,,2,City 2
5,Person 3,23,,3,City 2


In [23]:
df1.append(df3, sort=True)

Unnamed: 0,Age,city,city_1,id,person
0,22,City 1,,1,Person 1
1,22,City 2,,2,Person 2
2,23,City 2,,3,Person 3
0,22,,City 1,1,Person 1
1,22,,City 2,2,Person 2
2,23,,City 2,3,Person 3


## Using a real dataset

In [24]:
data = pd.read_csv('data/train.csv')
data.head(6)

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2
5,2,,3,266,0,2,5,6,0,2,...,1,1,0,41326,22fe332bf9c924d4718005891c63fbed,0,This is a stray kitten that came to my house. ...,d24c30b4b,2.0,2


In [25]:
state_labels = pd.read_csv('data/state_labels.csv')
state_labels.head()

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan


In [26]:
breed_labels = pd.read_csv('data/BreedLabels.csv')
breed_labels.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type           14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Breed1         14993 non-null  int64  
 4   Breed2         14993 non-null  int64  
 5   Gender         14993 non-null  int64  
 6   Color1         14993 non-null  int64  
 7   Color2         14993 non-null  int64  
 8   Color3         14993 non-null  int64  
 9   MaturitySize   14993 non-null  int64  
 10  FurLength      14993 non-null  int64  
 11  Vaccinated     14993 non-null  int64  
 12  Dewormed       14993 non-null  int64  
 13  Sterilized     14993 non-null  int64  
 14  Health         14993 non-null  int64  
 15  Quantity       14993 non-null  int64  
 16  Fee            14993 non-null  int64  
 17  State          14993 non-null  int64  
 18  Rescue

In [28]:
data[['Breed1', 'Breed2']]

Unnamed: 0,Breed1,Breed2
0,299,0
1,265,0
2,307,0
3,307,0
4,307,0
...,...,...
14988,266,0
14989,265,264
14990,265,266
14991,266,0


In [29]:
merge_breed = data.merge(breed_labels, left_on=['Breed1'], right_on=['BreedID'], how='left')
merge_breed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 27 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Breed1         14993 non-null  int64  
 4   Breed2         14993 non-null  int64  
 5   Gender         14993 non-null  int64  
 6   Color1         14993 non-null  int64  
 7   Color2         14993 non-null  int64  
 8   Color3         14993 non-null  int64  
 9   MaturitySize   14993 non-null  int64  
 10  FurLength      14993 non-null  int64  
 11  Vaccinated     14993 non-null  int64  
 12  Dewormed       14993 non-null  int64  
 13  Sterilized     14993 non-null  int64  
 14  Health         14993 non-null  int64  
 15  Quantity       14993 non-null  int64  
 16  Fee            14993 non-null  int64  
 17  State          14993 non-null  int64  
 18  Rescue

In [30]:
display(merge_breed[['Breed1', 'Breed2', 'BreedName', 'Type_x', 'Type_y']])
merge_breed.drop(inplace=True, axis=1, columns=['Type_y', 'Breed1'])

Unnamed: 0,Breed1,Breed2,BreedName,Type_x,Type_y
0,299,0,Tabby,2,2.0
1,265,0,Domestic Medium Hair,2,2.0
2,307,0,Mixed Breed,1,1.0
3,307,0,Mixed Breed,1,1.0
4,307,0,Mixed Breed,1,1.0
...,...,...,...,...,...
14988,266,0,Domestic Short Hair,2,2.0
14989,265,264,Domestic Medium Hair,2,2.0
14990,265,266,Domestic Medium Hair,2,2.0
14991,266,0,Domestic Short Hair,2,2.0


In [31]:
breed_labels.loc[breed_labels['BreedID'] == 307]

Unnamed: 0,BreedID,Type,BreedName
240,307,1,Mixed Breed


In [32]:
merge_breed2 = merge_breed.merge(breed_labels, left_on=['Breed2'], right_on=['BreedID'], how='left') # to keep breed0
merge_breed2.info()
merge_breed2.drop('Type', inplace=True, axis=1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Breed2         14993 non-null  int64  
 4   Gender         14993 non-null  int64  
 5   Color1         14993 non-null  int64  
 6   Color2         14993 non-null  int64  
 7   Color3         14993 non-null  int64  
 8   MaturitySize   14993 non-null  int64  
 9   FurLength      14993 non-null  int64  
 10  Vaccinated     14993 non-null  int64  
 11  Dewormed       14993 non-null  int64  
 12  Sterilized     14993 non-null  int64  
 13  Health         14993 non-null  int64  
 14  Quantity       14993 non-null  int64  
 15  Fee            14993 non-null  int64  
 16  State          14993 non-null  int64  
 17  RescuerID      14993 non-null  object 
 18  VideoA

In [33]:
display(merge_breed2[['Breed2', 'BreedName_x', 'BreedName_y', 'Type_x', 'BreedID_x', 'BreedID_y']])

Unnamed: 0,Breed2,BreedName_x,BreedName_y,Type_x,BreedID_x,BreedID_y
0,0,Tabby,,2,299.0,
1,0,Domestic Medium Hair,,2,265.0,
2,0,Mixed Breed,,1,307.0,
3,0,Mixed Breed,,1,307.0,
4,0,Mixed Breed,,1,307.0,
...,...,...,...,...,...,...
14988,0,Domestic Short Hair,,2,266.0,
14989,264,Domestic Medium Hair,Domestic Long Hair,2,265.0,264.0
14990,266,Domestic Medium Hair,Domestic Short Hair,2,265.0,266.0
14991,0,Domestic Short Hair,,2,266.0,


In [34]:
breed_labels.loc[breed_labels['BreedID'] == 264]

Unnamed: 0,BreedID,Type,BreedName
264,264,2,Domestic Long Hair


In [35]:
merge_breed2['BreedName_y'].isna().sum()

10762

In [36]:
merge_breed2['BreedName_x'].isna().sum()

5

In [37]:
# where breed2 is 0 is the same as breed1
merge_breed2['BreedName_y'] = merge_breed2['BreedName_y'].fillna(merge_breed2['BreedName_x'])

In [38]:
display(merge_breed2[['Breed2', 'BreedName_x', 'BreedName_y', 'Type_x', 'BreedID_x', 'BreedID_y']])

Unnamed: 0,Breed2,BreedName_x,BreedName_y,Type_x,BreedID_x,BreedID_y
0,0,Tabby,Tabby,2,299.0,
1,0,Domestic Medium Hair,Domestic Medium Hair,2,265.0,
2,0,Mixed Breed,Mixed Breed,1,307.0,
3,0,Mixed Breed,Mixed Breed,1,307.0,
4,0,Mixed Breed,Mixed Breed,1,307.0,
...,...,...,...,...,...,...
14988,0,Domestic Short Hair,Domestic Short Hair,2,266.0,
14989,264,Domestic Medium Hair,Domestic Long Hair,2,265.0,264.0
14990,266,Domestic Medium Hair,Domestic Short Hair,2,265.0,266.0
14991,0,Domestic Short Hair,Domestic Short Hair,2,266.0,


In [39]:
merge_breed2['BreedName_y'].isna().sum()

0

In [40]:
merge_breed2['BreedID_y'].isna().sum()

10762

In [41]:
merge_breed2.drop(['Breed2', 'BreedID_y', 'BreedID_x'], inplace=True, axis=1)
merge_breed2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   Color1         14993 non-null  int64  
 5   Color2         14993 non-null  int64  
 6   Color3         14993 non-null  int64  
 7   MaturitySize   14993 non-null  int64  
 8   FurLength      14993 non-null  int64  
 9   Vaccinated     14993 non-null  int64  
 10  Dewormed       14993 non-null  int64  
 11  Sterilized     14993 non-null  int64  
 12  Health         14993 non-null  int64  
 13  Quantity       14993 non-null  int64  
 14  Fee            14993 non-null  int64  
 15  State          14993 non-null  int64  
 16  RescuerID      14993 non-null  object 
 17  VideoAmt       14993 non-null  int64  
 18  Descri

In [42]:
merge_breed2[['State']]

Unnamed: 0,State
0,41326
1,41401
2,41326
3,41401
4,41326
...,...
14988,41326
14989,41326
14990,41326
14991,41336


In [43]:
merged_state = merge_breed2.merge(state_labels, left_on='State', right_on='StateID')
merged_state.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   Color1         14993 non-null  int64  
 5   Color2         14993 non-null  int64  
 6   Color3         14993 non-null  int64  
 7   MaturitySize   14993 non-null  int64  
 8   FurLength      14993 non-null  int64  
 9   Vaccinated     14993 non-null  int64  
 10  Dewormed       14993 non-null  int64  
 11  Sterilized     14993 non-null  int64  
 12  Health         14993 non-null  int64  
 13  Quantity       14993 non-null  int64  
 14  Fee            14993 non-null  int64  
 15  State          14993 non-null  int64  
 16  RescuerID      14993 non-null  object 
 17  VideoAmt       14993 non-null  int64  
 18  Descri

In [44]:
merged_state[['StateID', 'StateName', 'State']]

Unnamed: 0,StateID,StateName,State
0,41326,Selangor,41326
1,41326,Selangor,41326
2,41326,Selangor,41326
3,41326,Selangor,41326
4,41326,Selangor,41326
...,...,...,...
14988,41367,Kelantan,41367
14989,41367,Kelantan,41367
14990,41415,Labuan,41415
14991,41415,Labuan,41415


In [45]:
state_labels[state_labels['StateID'] == 41326] 

Unnamed: 0,StateID,StateName
13,41326,Selangor


In [46]:
final_df = merged_state.drop(['StateID', 'State'], axis=1)

In [47]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   Color1         14993 non-null  int64  
 5   Color2         14993 non-null  int64  
 6   Color3         14993 non-null  int64  
 7   MaturitySize   14993 non-null  int64  
 8   FurLength      14993 non-null  int64  
 9   Vaccinated     14993 non-null  int64  
 10  Dewormed       14993 non-null  int64  
 11  Sterilized     14993 non-null  int64  
 12  Health         14993 non-null  int64  
 13  Quantity       14993 non-null  int64  
 14  Fee            14993 non-null  int64  
 15  RescuerID      14993 non-null  object 
 16  VideoAmt       14993 non-null  int64  
 17  Description    14981 non-null  object 
 18  PetID 

In [48]:
final_df[['Color1', 'Color2', 'Color3']]

Unnamed: 0,Color1,Color2,Color3
0,1,7,0
1,2,7,0
2,1,0,0
3,5,6,0
4,1,0,0
...,...,...,...
14988,1,0,0
14989,7,0,0
14990,5,0,0
14991,5,0,0


In [49]:
color_labels = pd.read_csv('data/ColorLabels.csv')
color_labels

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream
5,6,Gray
6,7,White


In [50]:
final_df = final_df.merge(color_labels, left_on='Color1', right_on='ColorID', how='left') # to keep the 0

In [51]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   Color1         14993 non-null  int64  
 5   Color2         14993 non-null  int64  
 6   Color3         14993 non-null  int64  
 7   MaturitySize   14993 non-null  int64  
 8   FurLength      14993 non-null  int64  
 9   Vaccinated     14993 non-null  int64  
 10  Dewormed       14993 non-null  int64  
 11  Sterilized     14993 non-null  int64  
 12  Health         14993 non-null  int64  
 13  Quantity       14993 non-null  int64  
 14  Fee            14993 non-null  int64  
 15  RescuerID      14993 non-null  object 
 16  VideoAmt       14993 non-null  int64  
 17  Description    14981 non-null  object 
 18  PetID 

In [52]:
final_df = final_df.merge(color_labels, left_on='Color2', right_on='ColorID', how='left') # to keep the 0
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   Color1         14993 non-null  int64  
 5   Color2         14993 non-null  int64  
 6   Color3         14993 non-null  int64  
 7   MaturitySize   14993 non-null  int64  
 8   FurLength      14993 non-null  int64  
 9   Vaccinated     14993 non-null  int64  
 10  Dewormed       14993 non-null  int64  
 11  Sterilized     14993 non-null  int64  
 12  Health         14993 non-null  int64  
 13  Quantity       14993 non-null  int64  
 14  Fee            14993 non-null  int64  
 15  RescuerID      14993 non-null  object 
 16  VideoAmt       14993 non-null  int64  
 17  Description    14981 non-null  object 
 18  PetID 

In [53]:
final_df = final_df.merge(color_labels, left_on='Color3', right_on='ColorID', how='left') # to keep the 0
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 30 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   Color1         14993 non-null  int64  
 5   Color2         14993 non-null  int64  
 6   Color3         14993 non-null  int64  
 7   MaturitySize   14993 non-null  int64  
 8   FurLength      14993 non-null  int64  
 9   Vaccinated     14993 non-null  int64  
 10  Dewormed       14993 non-null  int64  
 11  Sterilized     14993 non-null  int64  
 12  Health         14993 non-null  int64  
 13  Quantity       14993 non-null  int64  
 14  Fee            14993 non-null  int64  
 15  RescuerID      14993 non-null  object 
 16  VideoAmt       14993 non-null  int64  
 17  Description    14981 non-null  object 
 18  PetID 

In [54]:
final_df[['Color1', 'Color2', 'Color3', 'ColorID_x', 'ColorName_x', 'ColorID_y', 'ColorName_y', 'ColorID', 'ColorName']]

Unnamed: 0,Color1,Color2,Color3,ColorID_x,ColorName_x,ColorID_y,ColorName_y,ColorID,ColorName
0,1,7,0,1,Black,7.0,White,,
1,2,7,0,2,Brown,7.0,White,,
2,1,0,0,1,Black,,,,
3,5,6,0,5,Cream,6.0,Gray,,
4,1,0,0,1,Black,,,,
...,...,...,...,...,...,...,...,...,...
14988,1,0,0,1,Black,,,,
14989,7,0,0,7,White,,,,
14990,5,0,0,5,Cream,,,,
14991,5,0,0,5,Cream,,,,


In [55]:
final_df.drop(inplace=True, axis=1, columns=['ColorID_x', 'ColorID_y', 'ColorID', 'Color1', 'Color2', 'Color3'])

In [56]:
final_df['ColorName_y'] = \
    final_df['ColorName_y'].fillna(final_df['ColorName_x']) # Color2 when is 0, it's going to be the same as color 1

In [57]:
final_df[['ColorName_x', 'ColorName_y', 'ColorName']]

Unnamed: 0,ColorName_x,ColorName_y,ColorName
0,Black,White,
1,Brown,White,
2,Black,Black,
3,Cream,Gray,
4,Black,Black,
...,...,...,...
14988,Black,Black,
14989,White,White,
14990,Cream,Cream,
14991,Cream,Cream,


In [58]:
final_df['ColorName'] = \
    final_df['ColorName'].fillna(final_df['ColorName_y']) # Color3 when is 0, it's going to be the same as color 2

In [59]:
final_df[['ColorName_x', 'ColorName_y', 'ColorName']]

Unnamed: 0,ColorName_x,ColorName_y,ColorName
0,Black,White,White
1,Brown,White,White
2,Black,Black,Black
3,Cream,Gray,Gray
4,Black,Black,Black
...,...,...,...
14988,Black,Black,Black
14989,White,White,White
14990,Cream,Cream,Cream
14991,Cream,Cream,Cream


In [60]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type_x         14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   MaturitySize   14993 non-null  int64  
 5   FurLength      14993 non-null  int64  
 6   Vaccinated     14993 non-null  int64  
 7   Dewormed       14993 non-null  int64  
 8   Sterilized     14993 non-null  int64  
 9   Health         14993 non-null  int64  
 10  Quantity       14993 non-null  int64  
 11  Fee            14993 non-null  int64  
 12  RescuerID      14993 non-null  object 
 13  VideoAmt       14993 non-null  int64  
 14  Description    14981 non-null  object 
 15  PetID          14993 non-null  object 
 16  PhotoAmt       14993 non-null  float64
 17  AdoptionSpeed  14993 non-null  int64  
 18  BreedN

In [61]:
final_df.rename(columns={
    'Type_x' : 'Type',
    'BreedName_x' : "Breed1",
    'BreedName_y' : "Breed2",
    'ColorName_x' : "Color1",
    'ColorName_y' : "Color2",
    'ColorName'   : "Color3"
}, inplace=True)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14993 entries, 0 to 14992
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type           14993 non-null  int64  
 1   Name           13736 non-null  object 
 2   Age            14993 non-null  int64  
 3   Gender         14993 non-null  int64  
 4   MaturitySize   14993 non-null  int64  
 5   FurLength      14993 non-null  int64  
 6   Vaccinated     14993 non-null  int64  
 7   Dewormed       14993 non-null  int64  
 8   Sterilized     14993 non-null  int64  
 9   Health         14993 non-null  int64  
 10  Quantity       14993 non-null  int64  
 11  Fee            14993 non-null  int64  
 12  RescuerID      14993 non-null  object 
 13  VideoAmt       14993 non-null  int64  
 14  Description    14981 non-null  object 
 15  PetID          14993 non-null  object 
 16  PhotoAmt       14993 non-null  float64
 17  AdoptionSpeed  14993 non-null  int64  
 18  Breed1

In [62]:
final_df.head()

Unnamed: 0,Type,Name,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,...,Description,PetID,PhotoAmt,AdoptionSpeed,Breed1,Breed2,StateName,Color1,Color2,Color3
0,2,Nibble,3,1,1,1,2,2,2,1,...,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,Tabby,Tabby,Selangor,Black,White,White
1,1,Brisco,1,1,2,2,1,1,2,1,...,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,Mixed Breed,Mixed Breed,Selangor,Brown,White,White
2,1,Hunter,1,1,2,1,2,2,2,1,...,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,Mixed Breed,Mixed Breed,Selangor,Black,Black,Black
3,2,,3,2,2,1,2,2,2,1,...,This is a stray kitten that came to my house. ...,d24c30b4b,2.0,2,Domestic Short Hair,Domestic Short Hair,Selangor,Cream,Gray,Gray
4,2,BULAT,12,1,2,3,2,2,3,1,...,anyone within the area of ipoh or taiping who ...,1caa6fcdb,3.0,1,Domestic Long Hair,Domestic Long Hair,Selangor,Black,Black,Black
