In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

sns.set_style('whitegrid')

# Problem definition

Predict when a pet will be adopted

# Load the data

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print(df_train.shape, df_test.shape)

(10000, 24) (4993, 23)


In [3]:
#Setting number of training lines
training_lines = len(df_train)

In [4]:
# create a new dataframe (df_dataset) by concatenating training and test
df_dataset = pd.concat(objs=[df_train, df_test], axis=0)

In [5]:
df_dataset.dtypes

AdoptionSpeed    float64
Age                int64
Breed1             int64
Breed2             int64
Color1             int64
Color2             int64
Color3             int64
Description       object
Dewormed           int64
Fee                int64
FurLength          int64
Gender             int64
Health             int64
MaturitySize       int64
Name              object
PetID             object
PhotoAmt         float64
Quantity           int64
RescuerID         object
State              int64
Sterilized         int64
Type               int64
Vaccinated         int64
VideoAmt           int64
dtype: object

# Feature Engineering

In [6]:
print(df_train.columns)
df_dataset.head()

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')


Unnamed: 0,AdoptionSpeed,Age,Breed1,Breed2,Color1,Color2,Color3,Description,Dewormed,Fee,...,Name,PetID,PhotoAmt,Quantity,RescuerID,State,Sterilized,Type,Vaccinated,VideoAmt
0,4.0,36,307,0,2,7,0,Adorable 3 year old Lily looking for a forever...,2,0,...,â¥â¥â¥ Lily â¥â¥â¥,3f8824a3b,1.0,1,337914b09c2fa5460e195197e994ef98,41326,1,1,1,0
1,2.0,3,266,0,6,7,0,i rescue this stary kitten from market near my...,1,0,...,Cookie,9238eb7fc,1.0,1,4bb1ebb92158078ad54a6bb23c10dffc,41327,2,2,2,0
2,4.0,7,250,252,1,2,0,The mother was a Burmese cross and had since p...,1,0,...,Favour Speedy Abundance And Courage,f0a1f2b90,2.0,4,99ba8ce53b4d8515e417e7921563d923,41327,2,2,1,0
3,2.0,3,307,0,2,0,0,This puppy is: 1. Male 2. 3 months old 3. Brow...,2,0,...,,7d028bdea,4.0,1,3f3ef74c486beba3bc87f6dbaee772bf,41327,2,1,2,0
4,2.0,1,266,0,1,6,7,Mother cat gave birth to a litter of 3 and too...,2,0,...,Abandoned Kitty,8377bfe97,0.0,1,844f03ab8054007d4be6686f3a9702b9,41401,2,2,2,0


In [7]:
# Check for missing values
df_dataset.isnull().sum(axis = 0)

AdoptionSpeed    4993
Age                 0
Breed1              0
Breed2              0
Color1              0
Color2              0
Color3              0
Description        12
Dewormed            0
Fee                 0
FurLength           0
Gender              0
Health              0
MaturitySize        0
Name             1257
PetID               0
PhotoAmt            0
Quantity            0
RescuerID           0
State               0
Sterilized          0
Type                0
Vaccinated          0
VideoAmt            0
dtype: int64

In [8]:
df_dataset.dtypes

AdoptionSpeed    float64
Age                int64
Breed1             int64
Breed2             int64
Color1             int64
Color2             int64
Color3             int64
Description       object
Dewormed           int64
Fee                int64
FurLength          int64
Gender             int64
Health             int64
MaturitySize       int64
Name              object
PetID             object
PhotoAmt         float64
Quantity           int64
RescuerID         object
State              int64
Sterilized         int64
Type               int64
Vaccinated         int64
VideoAmt           int64
dtype: object

In [9]:
# df_dataset['PhotoAmt']=df_dataset['PhotoAmt'].astype(int)
# df_dataset['VideoAmt']=df_dataset['VideoAmt'].astype(int)

In [10]:
# apply dummies on the training set
for col in ['Health','Breed1','Sterilized','Vaccinated','Dewormed','MaturitySize','Type','Breed2','Color1','Color2','Color3','State']:
    df_dummies = pd.get_dummies(df_dataset[col])
    df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
    df_dataset = pd.concat([df_dataset, df_dummies], axis=1)
    del df_dataset[col]

# # apply the same dummies on the test set
# for col in ['Health','Breed1']:
#     df_dummies = pd.get_dummies(df_test[col])
#     df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
#     df_test = pd.concat([df_test, df_dummies], axis=1)

In [11]:
pd.options.display.max_seq_items = 2000


df_dataset.columns

Index(['AdoptionSpeed', 'Age', 'Description', 'Fee', 'FurLength', 'Gender',
       'Name', 'PetID', 'PhotoAmt', 'Quantity', 'RescuerID', 'VideoAmt',
       'Health_1', 'Health_2', 'Health_3', 'Breed1_0', 'Breed1_1', 'Breed1_3',
       'Breed1_5', 'Breed1_7', 'Breed1_10', 'Breed1_11', 'Breed1_15',
       'Breed1_16', 'Breed1_17', 'Breed1_18', 'Breed1_19', 'Breed1_20',
       'Breed1_21', 'Breed1_23', 'Breed1_24', 'Breed1_25', 'Breed1_26',
       'Breed1_31', 'Breed1_32', 'Breed1_39', 'Breed1_42', 'Breed1_44',
       'Breed1_49', 'Breed1_50', 'Breed1_56', 'Breed1_58', 'Breed1_60',
       'Breed1_61', 'Breed1_64', 'Breed1_65', 'Breed1_69', 'Breed1_70',
       'Breed1_71', 'Breed1_72', 'Breed1_75', 'Breed1_76', 'Breed1_78',
       'Breed1_81', 'Breed1_82', 'Breed1_83', 'Breed1_85', 'Breed1_88',
       'Breed1_93', 'Breed1_97', 'Breed1_98', 'Breed1_99', 'Breed1_100',
       'Breed1_102', 'Breed1_103', 'Breed1_105', 'Breed1_108', 'Breed1_109',
       'Breed1_111', 'Breed1_114', 'Breed1_117',

In [12]:
df_train = df_dataset[:training_lines]
df_test = df_dataset[training_lines:]

In [13]:
# select the columns
# X_columns = ['Age', 'Fee', 'Health_1','Dewormed_1','Vaccinated_1','FurLength','Sterilized_1']+ list(df_dataset.loc[:, df_train.columns.str.startswith('Breed1')])
X_columns = [x for x in df_dataset.columns if (x != 'AdoptionSpeed') & (x != 'PetID')& (x != 'RescuerID') & (x != 'Name') & (x != 'Description')]

# X_columns = [ x for x in df_train.columns.values if(x != "log_price") & (x != "id") & (x not in exclude_columns)]
y_column = ['AdoptionSpeed']

In [14]:
X_columns

['Age',
 'Fee',
 'FurLength',
 'Gender',
 'PhotoAmt',
 'Quantity',
 'VideoAmt',
 'Health_1',
 'Health_2',
 'Health_3',
 'Breed1_0',
 'Breed1_1',
 'Breed1_3',
 'Breed1_5',
 'Breed1_7',
 'Breed1_10',
 'Breed1_11',
 'Breed1_15',
 'Breed1_16',
 'Breed1_17',
 'Breed1_18',
 'Breed1_19',
 'Breed1_20',
 'Breed1_21',
 'Breed1_23',
 'Breed1_24',
 'Breed1_25',
 'Breed1_26',
 'Breed1_31',
 'Breed1_32',
 'Breed1_39',
 'Breed1_42',
 'Breed1_44',
 'Breed1_49',
 'Breed1_50',
 'Breed1_56',
 'Breed1_58',
 'Breed1_60',
 'Breed1_61',
 'Breed1_64',
 'Breed1_65',
 'Breed1_69',
 'Breed1_70',
 'Breed1_71',
 'Breed1_72',
 'Breed1_75',
 'Breed1_76',
 'Breed1_78',
 'Breed1_81',
 'Breed1_82',
 'Breed1_83',
 'Breed1_85',
 'Breed1_88',
 'Breed1_93',
 'Breed1_97',
 'Breed1_98',
 'Breed1_99',
 'Breed1_100',
 'Breed1_102',
 'Breed1_103',
 'Breed1_105',
 'Breed1_108',
 'Breed1_109',
 'Breed1_111',
 'Breed1_114',
 'Breed1_117',
 'Breed1_119',
 'Breed1_122',
 'Breed1_123',
 'Breed1_125',
 'Breed1_128',
 'Breed1_129',
 'B

# Model Training

In [15]:
# split the data using sklearn

threshold = 0.8
X = df_train[X_columns]
y = df_train[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (8000, 370)
y_train (8000, 1)
X_test (2000, 370)
y_test (2000, 1)


In [16]:
# train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=150)
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

# Model Evaluation

In [17]:
kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
print('kappa', round(kappa, 4))
print(confusion_matrix(y_test, y_pred))

kappa 0.2994
[[  1  18  14   9  14]
 [  5 145 111  40  84]
 [  5 116 165 101 152]
 [  3  68 117 124 132]
 [  3  69  93  75 336]]


Using Cross Validation

In [18]:
k = 10
results = []
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred, weights ='quadratic')
    results.append(round(kappa, 4))

print('Kappa for each fold:', results)
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

Kappa for each fold: [0.3152, 0.3613, 0.3102, 0.2893, 0.324, 0.304, 0.2764, 0.2881, 0.3466, 0.2909]
AVG(kappa) 0.3106
STD(kappa) 0.0257


# Prepare submission

In [19]:
df_prediction = df_test[X_columns]
df_test['AdoptionSpeed'] = model.predict(df_prediction).astype(int)
df_test[['PetID', 'AdoptionSpeed']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,2
3,5335bfb38,2
4,ff2cf88a0,3
5,1d13441b9,3
6,7d835cf7c,2
7,577d15fea,4
8,91736f444,4
9,db194aec8,1


In [20]:
df_test[['PetID','AdoptionSpeed']].to_csv('Alka_submission_16.csv', index=False)