# Pet Adoption Speed - Classification

In [1]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

# convert scientific notation to decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style('whitegrid')

________________________
# Load Data

In [2]:
# Training listings (include the AdoptionSpeed target) and the test listings to predict on.
pets_df = pd.read_csv('data/train.csv')
pets_test_df = pd.read_csv('data/test.csv')

In [3]:
pets_df.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

In [4]:
pets_df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,1,♥♥♥ Lily ♥♥♥,36,307,0,2,2,7,0,2,...,1,1,0,41326,337914b09c2fa5460e195197e994ef98,0,Adorable 3 year old Lily looking for a forever...,3f8824a3b,1.0,4
1,2,Cookie,3,266,0,1,6,7,0,2,...,1,1,0,41327,4bb1ebb92158078ad54a6bb23c10dffc,0,i rescue this stary kitten from market near my...,9238eb7fc,1.0,2
2,2,Favour Speedy Abundance And Courage,7,250,252,1,1,2,0,2,...,1,4,0,41327,99ba8ce53b4d8515e417e7921563d923,0,The mother was a Burmese cross and had since p...,f0a1f2b90,2.0,4
3,1,,3,307,0,1,2,0,0,3,...,1,1,0,41327,3f3ef74c486beba3bc87f6dbaee772bf,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,7d028bdea,4.0,2
4,2,Abandoned Kitty,1,266,0,1,1,6,7,1,...,1,1,0,41401,844f03ab8054007d4be6686f3a9702b9,0,Mother cat gave birth to a litter of 3 and too...,8377bfe97,0.0,2


In [5]:
# Lookup tables that map the numeric state / breed / color IDs to readable names.
state_df = pd.read_csv('data/state_labels.csv')
breed_df = pd.read_csv('data/breed_labels.csv')
color_df = pd.read_csv('data/color_labels.csv')

In [6]:
state_df.head()

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan


In [7]:
breed_df.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [8]:
color_df.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


____________________________
# Data Cleaning

In [9]:
# Replace int numbers with meaningful strings.

def change_values(df):
    """Decode the integer-coded categorical columns of ``df`` in place.

    Each code listed in the tables below is swapped for its label; any value
    that is not listed is left untouched. Nothing is returned -- the frame
    passed in is modified directly.
    """
    yes_no_labels = {1: 'Yes', 2: 'No', 3: 'Not_Sure'}

    # column name -> {integer code: label}
    code_tables = {
        'Vaccinated': yes_no_labels,
        'Dewormed': yes_no_labels,
        'Sterilized': yes_no_labels,
        'Gender': {1: 'Male', 2: 'Female', 3: 'Mixed'},
        'Type': {1: 'Dog', 2: 'Cat'},
        'FurLength': {1: 'Short', 2: 'Medium', 3: 'Long', 0: 'Not_Specified'},
        'MaturitySize': {1: 'Small', 2: 'Medium', 3: 'Large', 4: 'Extra_Large',
                         0: 'Not_Specified'},
        'Health': {1: 'Healthy', 2: 'Minor_Injury', 3: 'Serious_Injury',
                   0: 'Not_Specified'},
    }

    for column, labels in code_tables.items():
        df[column] = df[column].replace(labels)

In [10]:
change_values(pets_df)
pets_df[['Vaccinated', 'Dewormed', 'Sterilized', 'Type', 'Gender', 'Health', 'MaturitySize', 'FurLength']].head(10)

Unnamed: 0,Vaccinated,Dewormed,Sterilized,Type,Gender,Health,MaturitySize,FurLength
0,Yes,No,Yes,Dog,Female,Healthy,Medium,Medium
1,No,Yes,No,Cat,Male,Healthy,Medium,Short
2,Yes,Yes,No,Cat,Male,Healthy,Medium,Short
3,No,No,No,Dog,Male,Healthy,Large,Short
4,No,No,No,Cat,Male,Healthy,Small,Short
5,No,Yes,No,Dog,Male,Healthy,Medium,Short
6,No,Yes,No,Dog,Female,Healthy,Small,Medium
7,Yes,Yes,Yes,Dog,Female,Healthy,Medium,Short
8,No,No,No,Cat,Female,Minor_Injury,Small,Short
9,Yes,Yes,No,Dog,Male,Healthy,Small,Short


In [11]:
change_values(pets_test_df)
pets_test_df[['Vaccinated', 'Dewormed', 'Sterilized', 'Type', 'Gender','Health', 'MaturitySize', 'FurLength']].head(10)

Unnamed: 0,Vaccinated,Dewormed,Sterilized,Type,Gender,Health,MaturitySize,FurLength
0,No,No,No,Cat,Male,Healthy,Medium,Short
1,Yes,Yes,No,Dog,Female,Healthy,Small,Short
2,Not_Sure,Yes,No,Dog,Female,Healthy,Medium,Short
3,Yes,Yes,No,Dog,Male,Healthy,Medium,Medium
4,Yes,Yes,Yes,Dog,Male,Healthy,Medium,Short
5,No,No,No,Cat,Female,Healthy,Small,Short
6,No,Yes,No,Cat,Mixed,Healthy,Small,Short
7,Yes,Yes,No,Dog,Female,Healthy,Medium,Medium
8,No,No,No,Dog,Mixed,Healthy,Medium,Short
9,No,No,No,Cat,Mixed,Healthy,Small,Short


_________________________
# Merging

In [12]:
def merge_colors(pets, colors):
    """Attach the ID and name of each of a pet's three colors.

    For every slot ``Color1`` .. ``Color3`` the ``colors`` table is
    left-joined on ``ColorID`` and the two joined columns are renamed to
    ``Color<n>_ID`` / ``Color<n>_Name``. The raw ``Color<n>`` columns are
    dropped at the end.

    Parameters
    ----------
    pets : pandas.DataFrame
        Must contain the columns ``Color1``, ``Color2`` and ``Color3``.
    colors : pandas.DataFrame
        Must contain the columns ``ColorID`` and ``ColorName``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``pets`` itself is not modified. A color code with no
        match in ``colors`` yields NaN in both of its new columns.
    """
    slots = ['Color1', 'Color2', 'Color3']

    # pd.merge always builds a new frame, so no defensive copy of `pets` is needed.
    df_merge = pets
    for slot in slots:
        df_merge = pd.merge(df_merge, colors, left_on=slot, right_on='ColorID', how='left')
        # Rename right away so the next join can add a fresh ColorID / ColorName pair.
        df_merge = df_merge.rename(columns={'ColorID': slot + '_ID',
                                            'ColorName': slot + '_Name'})

    return df_merge.drop(columns=slots)

In [13]:
df = merge_colors(pets_df,color_df)
df_t = merge_colors(pets_test_df, color_df)

In [14]:
def merge_breed(pets, breeds):
    """Attach the ID and name of a pet's main and second breed.

    ``Breed1`` and ``Breed2`` are each left-joined against ``breeds`` on
    ``BreedID``. The joined columns become ``Main_Breed_ID`` /
    ``Main_Breed_Name`` and ``Second_Breed_ID`` / ``Second_Breed_Name``, and
    the raw ``Breed1`` / ``Breed2`` columns are dropped.

    Parameters
    ----------
    pets : pandas.DataFrame
        Must contain the columns ``Breed1`` and ``Breed2``.
    breeds : pandas.DataFrame
        Must contain the columns ``BreedID``, ``Type`` and ``BreedName``.

    Returns
    -------
    pandas.DataFrame
        A new frame; neither input is modified. A breed code with no match
        yields NaN in both of its new columns.
    """
    # Drop the lookup table's Type column before joining so it cannot collide
    # with a Type column in `pets`.
    breed_lookup = breeds.drop(columns='Type')

    # pd.merge always builds a new frame, so no defensive copy of `pets` is needed.
    df_merge = pets
    for source, prefix in (('Breed1', 'Main_Breed'), ('Breed2', 'Second_Breed')):
        df_merge = pd.merge(df_merge, breed_lookup, left_on=source, right_on='BreedID', how='left')
        # Rename right away so the next join can add a fresh BreedID / BreedName pair.
        df_merge = df_merge.rename(columns={'BreedID': prefix + '_ID',
                                            'BreedName': prefix + '_Name'})

    return df_merge.drop(columns=['Breed1', 'Breed2'])

In [15]:
df = merge_breed(df, breed_df)
df_t = merge_breed(df_t, breed_df)

In [16]:
def merge_state(pets, states):
    """Left-join the state lookup table onto ``pets``.

    Parameters
    ----------
    pets : pandas.DataFrame
        Must contain the column ``State``.
    states : pandas.DataFrame
        Must contain the column ``StateID``; its columns are appended to the
        result.

    Returns
    -------
    pandas.DataFrame
        A new frame without the raw ``State`` column; ``pets`` is not modified.
    """
    # pd.merge always builds a new frame, so no defensive copy of `pets` is needed.
    df_merge = pd.merge(pets, states, left_on='State', right_on='StateID', how='left')
    return df_merge.drop(columns=['State'])

In [17]:
merged_df = merge_state(df, state_df)
merged_df_test = merge_state(df_t, state_df)

_______________
# Missing Values

In [18]:
merged_df.isnull().sum()

Type                    0
Name                  842
Age                     0
Gender                  0
MaturitySize            0
FurLength               0
Vaccinated              0
Dewormed                0
Sterilized              0
Health                  0
Quantity                0
Fee                     0
RescuerID               0
VideoAmt                0
Description             8
PetID                   0
PhotoAmt                0
AdoptionSpeed           0
Color1_ID               0
Color1_Name             0
Color2_ID            2960
Color2_Name          2960
Color3_ID            7080
Color3_Name          7080
Main_Breed_ID           2
Main_Breed_Name         2
Second_Breed_ID      7212
Second_Breed_Name    7212
StateID                 0
StateName               0
dtype: int64

In [19]:
merged_df_test.isnull().sum()

Type                    0
Name                  415
Age                     0
Gender                  0
MaturitySize            0
FurLength               0
Vaccinated              0
Dewormed                0
Sterilized              0
Health                  0
Quantity                0
Fee                     0
RescuerID               0
VideoAmt                0
Description             4
PetID                   0
PhotoAmt                0
Color1_ID               0
Color1_Name             0
Color2_ID            1511
Color2_Name          1511
Color3_ID            3524
Color3_Name          3524
Main_Breed_ID           3
Main_Breed_Name         3
Second_Breed_ID      3550
Second_Breed_Name    3550
StateID                 0
StateName               0
dtype: int64

In [20]:
# Fill missing values in colors:

def colors_fill_mv(df):
    """Fill the gaps in the second and third color columns of ``df`` in place.

    Missing IDs become -1 and missing names become 'No_Color'. The same
    (modified) frame is returned.
    """
    placeholders = {
        'Color2_ID': -1,
        'Color3_ID': -1,
        'Color2_Name': 'No_Color',
        'Color3_Name': 'No_Color',
    }

    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)

    return df

In [21]:
merged_df = colors_fill_mv(merged_df)
merged_df_test = colors_fill_mv(merged_df_test)

In [22]:
# Fill missing values in breeds:

def breeds_fill_mv(df):
    """Fill the gaps in the main and second breed columns of ``df`` in place.

    Missing IDs become -1 and missing names become 'No_Breed'. The same
    (modified) frame is returned.
    """
    placeholders = {
        'Main_Breed_ID': -1,
        'Second_Breed_ID': -1,
        'Main_Breed_Name': 'No_Breed',
        'Second_Breed_Name': 'No_Breed',
    }

    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)

    return df

In [23]:
merged_df = breeds_fill_mv(merged_df)
merged_df_test = breeds_fill_mv(merged_df_test)

In [24]:
merged_df.isnull().sum()

Type                   0
Name                 842
Age                    0
Gender                 0
MaturitySize           0
FurLength              0
Vaccinated             0
Dewormed               0
Sterilized             0
Health                 0
Quantity               0
Fee                    0
RescuerID              0
VideoAmt               0
Description            8
PetID                  0
PhotoAmt               0
AdoptionSpeed          0
Color1_ID              0
Color1_Name            0
Color2_ID              0
Color2_Name            0
Color3_ID              0
Color3_Name            0
Main_Breed_ID          0
Main_Breed_Name        0
Second_Breed_ID        0
Second_Breed_Name      0
StateID                0
StateName              0
dtype: int64

__________________
# Feature Engineering

In [25]:
def name_columns(df):
    """Add a boolean ``has_name`` column to ``df`` in place and return ``df``.

    ``has_name`` is True where ``Name`` holds a value and False where it is
    missing. Computed in one vectorised step instead of a per-row loop.
    """
    df['has_name'] = df['Name'].notnull()

    return df

In [26]:
newdf = name_columns(merged_df)
newdf_test = name_columns(merged_df_test)

In [27]:
def description_columns(df):
    """Add a boolean ``has_description`` column to ``df`` in place and return ``df``.

    ``has_description`` is True where ``Description`` holds a value and False
    where it is missing. Computed in one vectorised step instead of a per-row
    loop.
    """
    df['has_description'] = df['Description'].notnull()

    return df

In [28]:
newdf = description_columns(newdf)
newdf_test = description_columns(newdf_test)

In [29]:
def name_letters(df):
    """Add a boolean ``letters_morethan2`` column to ``df`` in place and return ``df``.

    The flag is False only for rows that are marked ``has_name`` and whose
    ``Name`` is at most two characters long. Every other row -- including
    rows without a name -- is True.

    Requires the ``Name`` and ``has_name`` columns.
    """
    # Length of each name; missing names are skipped and stay NaN.
    name_length = df['Name'].map(len, na_action='ignore')

    # NaN compares as False, so a missing name is never counted as "short".
    is_short = name_length.le(2).fillna(False).astype(bool)

    df['letters_morethan2'] = ~(df['has_name'].eq(True) & is_short)

    return df

In [30]:
newdf = name_letters(newdf)
newdf_test = name_letters(newdf_test)

In [31]:
newdf[newdf['letters_morethan2'] == False].head()

Unnamed: 0,Type,Name,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,...,Color3_Name,Main_Breed_ID,Main_Breed_Name,Second_Breed_ID,Second_Breed_Name,StateID,StateName,has_name,has_description,letters_morethan2
82,Cat,Am,3,Male,Small,Long,No,Yes,No,Healthy,...,No_Color,285.0,Persian,265.0,Domestic Medium Hair,41326,Selangor,True,True,False
120,Dog,QQ,36,Female,Large,Short,Not_Sure,Not_Sure,Not_Sure,Healthy,...,No_Color,189.0,Rottweiler,307.0,Mixed Breed,41326,Selangor,True,True,False
126,Dog,JJ,8,Male,Small,Short,Yes,Yes,Yes,Healthy,...,White,128.0,Jack Russell Terrier,-1.0,No_Breed,41401,Kuala Lumpur,True,True,False
199,Dog,Rt,7,Male,Medium,Short,Yes,Yes,Not_Sure,Healthy,...,Yellow,307.0,Mixed Breed,307.0,Mixed Breed,41336,Johor,True,True,False
234,Dog,B2,2,Female,Medium,Medium,No,Yes,No,Healthy,...,No_Color,307.0,Mixed Breed,307.0,Mixed Breed,41326,Selangor,True,True,False


In [32]:
# get_dummies

def get_dum(df):
    """One-hot encode the categorical columns of ``df``.

    Returns a pair ``(combined, encoded)``: ``encoded`` holds only the dummy
    columns, and ``combined`` is ``encoded`` followed by every original
    column of ``df``. ``df`` itself is not modified.
    """
    pet_traits = ['Gender', 'Sterilized', 'Vaccinated', 'Type', 'Dewormed',
                  'FurLength', 'MaturitySize', 'Health']
    lookup_names = ['Color1_Name', 'Color2_Name', 'Color3_Name',
                    'Main_Breed_Name', 'Second_Breed_Name', 'StateName']

    encoded = pd.get_dummies(df.loc[:, pet_traits + lookup_names])
    combined = pd.concat([encoded, df], axis=1)

    return combined, encoded

In [33]:
newdf_dum, dummies = get_dum(newdf)
dummies.columns

Index(['Gender_Female', 'Gender_Male', 'Gender_Mixed', 'Sterilized_No',
       'Sterilized_Not_Sure', 'Sterilized_Yes', 'Vaccinated_No',
       'Vaccinated_Not_Sure', 'Vaccinated_Yes', 'Type_Cat',
       ...
       'StateName_Labuan', 'StateName_Melaka', 'StateName_Negeri Sembilan',
       'StateName_Pahang', 'StateName_Perak', 'StateName_Pulau Pinang',
       'StateName_Sabah', 'StateName_Sarawak', 'StateName_Selangor',
       'StateName_Terengganu'],
      dtype='object', length=341)

In [34]:
newdf_test_dum, test_dummies = get_dum(newdf_test)
test_dummies.columns

Index(['Gender_Female', 'Gender_Male', 'Gender_Mixed', 'Sterilized_No',
       'Sterilized_Not_Sure', 'Sterilized_Yes', 'Vaccinated_No',
       'Vaccinated_Not_Sure', 'Vaccinated_Yes', 'Type_Cat',
       ...
       'StateName_Labuan', 'StateName_Melaka', 'StateName_Negeri Sembilan',
       'StateName_Pahang', 'StateName_Perak', 'StateName_Pulau Pinang',
       'StateName_Sabah', 'StateName_Sarawak', 'StateName_Selangor',
       'StateName_Terengganu'],
      dtype='object', length=288)

_____________
# Train Models

In [122]:
# select the columns

# Keep only the dummy columns that exist in BOTH the train and the test frame,
# so the model is never fitted on a feature the test frame lacks.
# Index.intersection is used because the `&` operator no longer performs a set
# intersection on Index objects in pandas >= 2.0.
unique_dum = list(dummies.columns.intersection(test_dummies.columns))

X_columns = ['Age', 'Fee', 'Quantity', 'PhotoAmt','has_name','has_description','letters_morethan2'] + unique_dum

y_column = ['AdoptionSpeed']

In [123]:
len(X_columns)

267

In [124]:
# split the data using sklearn

df_train = newdf_dum.copy()

# share of the rows used for fitting; the remainder is the hold-out set
threshold = 0.8
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1.0-threshold, shuffle=True, random_state=5)

# report the shape of each of the four pieces
for label, part in (('X_train', X_train), ('y_train', y_train),
                    ('X_test', X_test), ('y_test', y_test)):
    print(label, part.shape)

X_train (8000, 267)
y_train (8000, 1)
X_test (2000, 267)
y_test (2000, 1)


In [None]:
# K-nearest-neighbours classifier with default settings, scored on the hold-out rows

knn_model = KNeighborsClassifier().fit(X_train, np.ravel(y_train))
knn_pred = knn_model.predict(X_test)

In [114]:

# Gaussian naive Bayes classifier with default settings, scored on the hold-out rows
gnb_model = GaussianNB().fit(X_train, np.ravel(y_train))
gnb_pred = gnb_model.predict(X_test)


In [112]:

# Random forest with 150 trees. The seed is fixed (same value as the
# train/test split) so a re-run reproduces the same forest and predictions.
rf_model = RandomForestClassifier(n_estimators=150, random_state=5)
rf_model.fit(X_train, y_train.values.ravel())
rf_pred = rf_model.predict(X_test)


In [113]:

# Gradient boosting with 150 stages. The seed is fixed (same value as the
# train/test split) so a re-run reproduces the same model and predictions.
gb_model = GradientBoostingClassifier(n_estimators=150, random_state=5)
gb_model.fit(X_train, y_train.values.ravel())
gb_pred = gb_model.predict(X_test)


_____________
# Model Evaluation

In [115]:
# KNN on the hold-out set: quadratic-weighted Cohen's kappa and the confusion matrix.
knn_kappa = cohen_kappa_score(y_test, knn_pred, weights ='quadratic')
print('kappa', round(knn_kappa, 4))
print(confusion_matrix(y_test, knn_pred))

kappa 0.1966
[[  4  17   4   7  13]
 [  5 118 151  57  71]
 [  7 136 197  98  91]
 [  3  93 134 124 104]
 [  5  90 170  78 223]]


In [116]:
# Gaussian naive Bayes on the hold-out set: quadratic-weighted Cohen's kappa and the confusion matrix.
gnb_kappa = cohen_kappa_score(y_test, gnb_pred, weights ='quadratic')
print('kappa', round(gnb_kappa, 4))
print(confusion_matrix(y_test, gnb_pred))

kappa 0.0029
[[ 38   1   1   1   4]
 [365   6   8  14   9]
 [482  18   5  16   8]
 [412  19   4  13  10]
 [511   8   7  14  26]]


In [117]:
# Random forest on the hold-out set: quadratic-weighted Cohen's kappa and the confusion matrix.
rf_kappa = cohen_kappa_score(y_test, rf_pred, weights ='quadratic')
print('kappa', round(rf_kappa, 4))
print(confusion_matrix(y_test, rf_pred))

kappa 0.307
[[  3  12   9   7  14]
 [  2 135 117  64  84]
 [  1 135 189  71 133]
 [  1  67 117 112 161]
 [  0  65 103  57 341]]


In [118]:
# Gradient boosting on the hold-out set: quadratic-weighted Cohen's kappa and the confusion matrix.
gb_kappa = cohen_kappa_score(y_test, gb_pred, weights ='quadratic')
print('kappa', round(gb_kappa, 4))
print(confusion_matrix(y_test, gb_pred))

kappa 0.3396
[[  3  18  15   2   7]
 [  4 102 168  30  98]
 [  2  90 239  56 142]
 [  2  69 148  93 146]
 [  1  43 112  35 375]]


In [50]:
# Cross Validation

def cv(model, k=7):
    """Print the quadratic-weighted kappa of ``model`` under k-fold cross-validation.

    The folds are cut from the notebook-level ``X`` and ``y`` frames, in row
    order (no shuffling). A fresh, unfitted clone of ``model`` is trained on
    every fold, so the estimator passed in keeps whatever fit it already had
    and can still be inspected afterwards.

    Parameters
    ----------
    model : scikit-learn estimator
        Supplies the estimator type and its hyper-parameters.
    k : int, default 7
        Number of folds.

    Returns
    -------
    None
        The per-fold scores, their mean and their standard deviation are
        printed.
    """
    # Imported locally so the function does not depend on an edit to the import cell.
    from sklearn.base import clone

    # Convert once instead of on every fold.
    features = X.values
    target = y.values.ravel()

    results = []
    for train_index, test_index in KFold(n_splits=k).split(features):
        fold_model = clone(model)
        fold_model.fit(features[train_index], target[train_index])
        y_pred = fold_model.predict(features[test_index])
        kappa = cohen_kappa_score(target[test_index], y_pred, weights='quadratic')
        results.append(round(kappa, 4))

    print('Kappa for each fold:', results)
    print('AVG(kappa)', round(np.mean(results), 4))
    print('STD(kappa)', round(np.std(results), 4))

In [None]:
cv(knn_model)
cv(rf_model)
cv(gnb_model)
cv(gb_model)

In [75]:

model = gb_model

# pair each feature name with the importance the fitted model assigned to it
fi = [[col, importance]
      for col, importance in zip(X_test.columns, model.feature_importances_)]

# column 0 = feature name, column 1 = importance; most important first
feature_imp = pd.DataFrame(fi).sort_values(1, ascending=False)
feature_imp

Unnamed: 0,0,1
0,Age,0.19
1,PhotoAmt,0.12
2,Main_Breed_Name_Mixed Breed,0.09
3,Quantity,0.06
4,Sterilized_No,0.04
5,Fee,0.03
6,Sterilized_Yes,0.02
7,Type_Cat,0.02
8,MaturitySize_Small,0.01
10,StateName_Selangor,0.01


In [76]:
# Keep only the rows whose importance (column 1) is at least 0.002.
feature_imp = feature_imp[feature_imp[1] >= 0.002]
feature_imp

Unnamed: 0,0,1
0,Age,0.19
1,PhotoAmt,0.12
2,Main_Breed_Name_Mixed Breed,0.09
3,Quantity,0.06
4,Sterilized_No,0.04
5,Fee,0.03
6,Sterilized_Yes,0.02
7,Type_Cat,0.02
8,MaturitySize_Small,0.01
10,StateName_Selangor,0.01


__________________
# Submission

In [119]:
# Rebind X_train / y_train to every row of df_train (no hold-out) for the final fit.
X_train = df_train[X_columns]
y_train = df_train[y_column]

# The same feature columns, taken from the test frame the predictions are made for.
df_prediction = newdf_test_dum[X_columns]

In [None]:

# Random forest (150 trees) fitted on X_train / y_train, with a fixed seed so
# the submission can be reproduced.
# NOTE: the gradient-boosting cell below writes to the same 'AdoptionSpeed'
# column, so these predictions are replaced if that cell is run afterwards.
rf2_model = RandomForestClassifier(n_estimators=150, random_state=5)
rf2_model.fit(X_train, y_train.values.ravel())
newdf_test['AdoptionSpeed'] = rf2_model.predict(df_prediction)


In [120]:

# Gradient boosting (200 stages) fitted on X_train / y_train, with a fixed
# seed so the submission can be reproduced.
gb2_model = GradientBoostingClassifier(n_estimators=200, random_state=5)
gb2_model.fit(X_train, y_train.values.ravel())
newdf_test['AdoptionSpeed'] = gb2_model.predict(df_prediction)


In [121]:
newdf_test[['PetID', 'AdoptionSpeed']].to_csv('submission_v8.csv', index=False)