In [458]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score,confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [459]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [460]:
print(train_df.shape)
print(test_df.shape)

(18834, 11)
(8072, 9)


In [461]:
train_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [462]:
train_df.isnull().sum().sort_values(ascending=False).head(5)

condition         1477
pet_category         0
breed_category       0
X2                   0
X1                   0
dtype: int64

In [463]:
test_df.isnull().sum().sort_values(ascending=False).head(5)

condition     619
X2              0
X1              0
height(cm)      0
length(m)       0
dtype: int64

In [464]:
train_df.describe()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category
count,17357.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0
mean,0.88339,0.502636,27.448832,5.369598,4.577307,0.600563,1.709143
std,0.770434,0.288705,13.019781,6.572366,3.517763,0.629883,0.717919
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0
25%,0.0,0.25,16.1725,0.0,1.0,0.0,1.0
50%,1.0,0.5,27.34,0.0,4.0,1.0,2.0
75%,1.0,0.76,38.89,13.0,9.0,1.0,2.0
max,2.0,1.0,50.0,19.0,9.0,2.0,4.0


In [465]:
print(len(train_df["color_type"].value_counts()))
print(len(test_df["color_type"].value_counts()))

56
54


In [466]:
train_df['condition'].value_counts()

1.0    6819
0.0    6281
2.0    4257
Name: condition, dtype: int64

In [467]:
test_df['condition'].value_counts()

1.0    2928
0.0    2685
2.0    1840
Name: condition, dtype: int64

In [468]:
train_df['condition']=train_df['condition'].fillna(train_df['condition'].mode()[0])
test_df['condition']=test_df['condition'].fillna(test_df['condition'].mode()[0])

In [469]:
train_df['condition'].value_counts()

1.0    8296
0.0    6281
2.0    4257
Name: condition, dtype: int64

In [470]:
color_data=train_df['color_type'].value_counts().to_dict()

In [471]:
train_df['color_type']=train_df['color_type'].map(color_data)
test_df['color_type']=test_df['color_type'].map(color_data)

In [472]:
train_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,1687,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,2453,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,1.0,1791,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,2453,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,4620,0.5,11.06,18,4,0.0,1


In [473]:
train_df.shape

(18834, 11)

In [474]:
train_df=train_df.drop(['pet_id','issue_date','listing_date'],axis=1)
test_df = test_df.drop(['pet_id','issue_date','listing_date'],axis=1)
# Y_type = train_df[['pet_category']]
# Y_breed = train_df[['breed_category']]

In [475]:
train_df['breed_category'].value_counts()

0.0    9000
1.0    8357
2.0    1477
Name: breed_category, dtype: int64

In [476]:
train_df['pet_category'].value_counts()

2    10621
1     7184
4      941
0       88
Name: pet_category, dtype: int64

In [477]:
X_train=train_df[['condition','color_type','length(m)','height(cm)','X1','X2']]
y_train=train_df[['breed_category','pet_category']]

In [478]:
X_train.describe()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2
count,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0
mean,0.892535,2025.921525,0.502636,27.448832,5.369598,4.577307
std,0.74027,1642.236246,0.288705,13.019781,6.572366,3.517763
min,0.0,1.0,0.0,5.0,0.0,0.0
25%,0.0,496.0,0.25,16.1725,0.0,1.0
50%,1.0,1687.0,0.5,27.34,0.0,4.0
75%,1.0,2453.0,0.76,38.89,13.0,9.0
max,2.0,4620.0,1.0,50.0,19.0,9.0


In [479]:
X_train

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2
0,2.0,1687,0.80,7.78,13,9
1,1.0,2453,0.72,14.19,13,9
2,1.0,1791,0.15,40.90,15,4
3,1.0,2453,0.62,17.82,0,1
4,2.0,4620,0.50,11.06,18,4
...,...,...,...,...,...,...
18829,2.0,469,0.44,27.36,0,1
18830,1.0,1791,0.73,14.25,15,4
18831,0.0,12,0.99,28.13,13,9
18832,0.0,1349,0.55,44.82,13,9


In [480]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(test_df)

In [481]:
type(X_train)

numpy.ndarray

In [482]:
train_X,test_X,train_y,test_y=train_test_split(X_train,y_train,test_size=0.2,random_state=42)

In [483]:
test_y = np.array(test_y)
test_y.shape

(3767, 2)

### Random Forest Classifier

In [485]:
rf_model=RandomForestClassifier()

In [486]:
rf_model.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [487]:
pred_y = rf_model.predict(test_X)

In [488]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_breed_category = confusion_matrix(test_y[:,0], pred_y[:,0])
acc_breed_category = accuracy_score(test_y[:,0], pred_y[:,0])
print(cm_breed_category)
print(acc_breed_category)

[[1574  166   54]
 [ 235 1438    0]
 [  99    0  201]]
0.8529333687284311


In [489]:
cm_pet_category = confusion_matrix(test_y[:,1], pred_y[:,1])
acc_pet_category = accuracy_score(test_y[:,1], pred_y[:,1])
print(cm_pet_category)
print(acc_pet_category)

[[   1    6    8    7]
 [   0 1100  294    7]
 [   4  315 1823    8]
 [   0   23   33  138]]
0.8128484204937616


### XGBoost Classifier 

In [502]:
train_y

Unnamed: 0,breed_category,pet_category
4501,0.0,2
12204,0.0,1
10675,0.0,1
11146,2.0,1
1664,0.0,2
...,...,...
11284,0.0,1
11964,0.0,2
5390,1.0,2
860,1.0,1


In [503]:
clf_breed = XGBClassifier()
clf_breed.fit(train_X, train_y.iloc[:,0].values)
y_pred_breed = clf_breed.predict(test_X)

clf_type = XGBClassifier()
clf_type.fit(train_X, train_y.iloc[:,1].values)
y_pred_type = clf_type.predict(test_X)

In [504]:
cm_breed_category = confusion_matrix(test_y[:,0], y_pred_breed)
acc_breed_category = accuracy_score(test_y[:,0], y_pred_breed)
print(cm_breed_category)
print(acc_breed_category)

[[1565  200   29]
 [ 211 1462    0]
 [ 102    0  198]]
0.8561189275285372


In [505]:
cm_pet_category = confusion_matrix(test_y[:,1], y_pred_type)
acc_pet_category = accuracy_score(test_y[:,1], y_pred_type)
print(cm_pet_category)
print(acc_pet_category)

[[   0    6    8    8]
 [   0 1131  267    3]
 [   1  168 1975    6]
 [   1   12   40  141]]
0.8619591186620653


Saving output file

In [506]:
pred_breed_category = clf_breed.predict(X_test)
pred_pet_category = clf_type.predict(X_test)

test_data = pd.read_csv("test.csv")
ind =pd.DataFrame(test_data.pet_id)
Y_breed_category = pd.DataFrame(pred_breed_category, columns = ['breed_category'])
Y_pet_category = pd.DataFrame(pred_pet_category, columns = ['pet_category']) 
test_df_col = pd.concat([ind,Y_breed_category,Y_pet_category], axis=1)
test_df_col.to_csv("pred_file.csv", index=False)