## Mount the drive


In [1]:
# Mount Google Drive inside the Colab VM; the dataset paths used below live under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install CatBoost

In [2]:
# CatBoost is installed at runtime; the recorded output below shows catboost 0.24.1 being installed.
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/90/86/c3dcb600b4f9e7584ed90ea9d30a717fb5c0111574675f442c3e7bc19535/catboost-0.24.1-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.1MB 46kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.1


This is a multi-output classification problem: two labels have to be predicted, `breed_category` and `pet_category`.
The approach used here is to train two models in sequence:
1. Build one classification model and predict the first label.
2. Use the predicted output of the 1st model as an input feature of the 2nd model.

## Import the libraries

In [3]:
import numpy as np 
import pandas as pd 
from sklearn.metrics  import f1_score, confusion_matrix

In [4]:
# Locations of the competition train/test CSV files on the mounted Google Drive.
train_path = '/content/drive/My Drive/Datasets/Hacker_Earth_Challenges/Adopt_a_pet/Dataset/train.csv'
test_path = '/content/drive/My Drive/Datasets/Hacker_Earth_Challenges/Adopt_a_pet/Dataset/test.csv'

## Exploring the data

In [148]:
# Load the raw train and test tables from the paths configured above.
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [149]:
# Preview the first five training rows (features plus both target columns).
train.head(n=5)

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [150]:
# Preview the first five test rows (same features, no target columns).
test.head(n=5)

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,Black,0.24,41.21,0,7
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,Black,0.29,8.46,7,1
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,Brown,0.71,30.92,0,7


In [151]:
# (rows, columns) of the train and test tables.
(train.shape, test.shape)

((18834, 11), (8072, 9))

In [152]:
# Pairwise correlation between the numeric columns only. The object columns
# (pet_id, issue_date, listing_date, color_type) are excluded explicitly:
# recent pandas releases raise on non-numeric columns in DataFrame.corr()
# instead of silently dropping them, so relying on the implicit drop is fragile.
# Author's note: condition, X1 and X2 look like the important features.
train.select_dtypes(include='number').corr()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category
condition,1.0,-0.011219,-0.010793,0.338843,0.381696,-0.483503,-0.04166
length(m),-0.011219,1.0,-0.004464,-0.002893,-0.011175,0.007229,-0.003999
height(cm),-0.010793,-0.004464,1.0,-0.003801,-0.008216,0.011647,0.001976
X1,0.338843,-0.002893,-0.003801,1.0,0.584396,0.240729,-0.032594
X2,0.381696,-0.011175,-0.008216,0.584396,1.0,0.05253,-0.032116
breed_category,-0.483503,0.007229,0.011647,0.240729,0.05253,1.0,0.20923
pet_category,-0.04166,-0.003999,0.001976,-0.032594,-0.032116,0.20923,1.0


### Checking  missing values

In [153]:
train.info() # 'condition' has 17357 non-null values out of 18834 rows, i.e. 1477 are missing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_id          18834 non-null  object 
 1   issue_date      18834 non-null  object 
 2   listing_date    18834 non-null  object 
 3   condition       17357 non-null  float64
 4   color_type      18834 non-null  object 
 5   length(m)       18834 non-null  float64
 6   height(cm)      18834 non-null  float64
 7   X1              18834 non-null  int64  
 8   X2              18834 non-null  int64  
 9   breed_category  18834 non-null  float64
 10  pet_category    18834 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 1.6+ MB


In [154]:
test.info() # 'condition' has 7453 non-null values out of 8072 rows, i.e. 619 are missing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8072 entries, 0 to 8071
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pet_id        8072 non-null   object 
 1   issue_date    8072 non-null   object 
 2   listing_date  8072 non-null   object 
 3   condition     7453 non-null   float64
 4   color_type    8072 non-null   object 
 5   length(m)     8072 non-null   float64
 6   height(cm)    8072 non-null   float64
 7   X1            8072 non-null   int64  
 8   X2            8072 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 567.7+ KB


Missing values occur only in the `condition` column (in both train and test).

In [155]:
# Class distribution of the breed_category target.
train['breed_category'].value_counts()

0.0    9000
1.0    8357
2.0    1477
Name: breed_category, dtype: int64

In [156]:
# breed_category of the rows whose 'condition' is missing. The counts show that all
# of them belong to a single label, so the gaps can be filled with a unique value like -1.
a = train.loc[train['condition'].isna(), 'breed_category']
a.value_counts()

2.0    1477
Name: breed_category, dtype: int64

## combine train and test data for preprocessing 

In [157]:
# Remember how many training rows there are (used to split the combined frame
# back apart) and keep the test ids for the submission file.
td_shape = len(train)
test_id = test['pet_id']

In [158]:
# Save the target variables: y1 is breed_category, y2 is pet_category.
y1, y2 = train['breed_category'], train['pet_category']

In [159]:
# Stack train on top of test so both get identical preprocessing, renumber the
# rows from 0, and discard the two target columns (already saved as y1 / y2).
comb_data = (
    pd.concat([train, test])
    .reset_index(drop=True)
    .drop(columns=['breed_category', 'pet_category'])
)

# Feature Engineering

### All missing values in the `condition` column are filled with `-1`. Filling with the mean or with a new label `3` was also tried, but both gave lower accuracy than this.

In [160]:
# Fill the missing 'condition' values with the sentinel -1.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on comb_data['condition']: the chained in-place form
# acts on an intermediate object, which pandas warns about and which leaves
# comb_data unchanged when Copy-on-Write is enabled.
comb_data['condition'] = comb_data['condition'].fillna(-1)
assert comb_data['condition'].notna().all()
comb_data.info() # can see no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26906 entries, 0 to 26905
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pet_id        26906 non-null  object 
 1   issue_date    26906 non-null  object 
 2   listing_date  26906 non-null  object 
 3   condition     26906 non-null  float64
 4   color_type    26906 non-null  object 
 5   length(m)     26906 non-null  float64
 6   height(cm)    26906 non-null  float64
 7   X1            26906 non-null  int64  
 8   X2            26906 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 1.8+ MB


### New feature: the time difference between issue date and listing date, i.e. the maturity of the pet to be adopted

In [161]:
# Parse both date columns into datetime64 values, then preview the first five issue dates.
for date_col in ('issue_date', 'listing_date'):
    comb_data[date_col] = pd.to_datetime(comb_data[date_col])
comb_data['issue_date'].head(5)


0   2016-07-10
1   2013-11-21
2   2014-09-28
3   2016-12-31
4   2017-09-28
Name: issue_date, dtype: datetime64[ns]

In [162]:
# Encode listing_date as an approximate fractional year: year + month/12 + day/365.
listing_parts = comb_data['listing_date'].dt
comb_data['modified_listing_date'] = (
    listing_parts.year + (listing_parts.month / 12.0) + (listing_parts.day / 365.0)
)

In [163]:
# Inspect the fractional-year encoding of listing_date.
comb_data['modified_listing_date']

0        2016.807534
1        2019.073973
2        2016.885388
3        2019.151826
4        2017.968721
            ...     
26901    2017.277397
26902    2018.199543
26903    2017.094292
26904    2017.638128
26905    2017.243379
Name: modified_listing_date, Length: 26906, dtype: float64

In [164]:
# Encode issue_date as an approximate fractional year: year + month/12 + day/365.
issue_parts = comb_data['issue_date'].dt
comb_data['modified_issue_date'] = (
    issue_parts.year + (issue_parts.month / 12.0) + (issue_parts.day / 365.0)
)

In [165]:
# Absolute gap, in fractional years, between the listing date and the issue date.
comb_data['took_time'] = (
    comb_data['modified_listing_date'] - comb_data['modified_issue_date']
).abs()

In [166]:
# First five values of the new took_time feature.
comb_data['took_time'].head(5)

0    0.196804
1    5.099772
2    2.058676
3    2.066895
4    0.142009
Name: took_time, dtype: float64

## Pattern in  pet_id

* When the dataset was created, ids may have been assigned with a special prefix for a particular kind of animal.
* For example, with an id like ANSL_69903, all ids of the form ANSL_6**** (i.e. the numeric part starts with 6) may belong to one kind of animal; the ids are clearly not assigned serially.
* So extracting the first digit, and the first two digits, of the id may give useful features.

In [167]:
# Show two sample ids and the length of an id string.
print(*comb_data['pet_id'][:2])
len(comb_data['pet_id'].iloc[0])

ANSL_69903 ANSL_66892


10

In [168]:

# pet_id values look like 'ANSL_69903'. The slices keep the 'ANSL_' prefix, so
# '1stnum' is the prefix plus the first digit (e.g. 'ANSL_6') and '1st2num' is the
# prefix plus the first two digits (e.g. 'ANSL_69').
comb_data['1stnum'] = comb_data['pet_id'].str.slice(stop=6)
comb_data['1st2num'] = comb_data['pet_id'].str.slice(stop=7)

## split back to the train and test data

In [169]:
# The first td_shape rows of the combined frame are the training rows; the rest are test rows.
train = comb_data.iloc[:td_shape]
test = comb_data.iloc[td_shape:]

In [170]:
train.info() # pet_id, issue_date and listing_date are dropped in a later cell; the modified_* date columns are kept as features

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   pet_id                 18834 non-null  object        
 1   issue_date             18834 non-null  datetime64[ns]
 2   listing_date           18834 non-null  datetime64[ns]
 3   condition              18834 non-null  float64       
 4   color_type             18834 non-null  object        
 5   length(m)              18834 non-null  float64       
 6   height(cm)             18834 non-null  float64       
 7   X1                     18834 non-null  int64         
 8   X2                     18834 non-null  int64         
 9   modified_listing_date  18834 non-null  float64       
 10  modified_issue_date    18834 non-null  float64       
 11  took_time              18834 non-null  float64       
 12  1stnum                 18834 non-null  object        
 13  1

In [171]:
# Same check for the test rows (their index continues from td_shape after the split).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8072 entries, 18834 to 26905
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   pet_id                 8072 non-null   object        
 1   issue_date             8072 non-null   datetime64[ns]
 2   listing_date           8072 non-null   datetime64[ns]
 3   condition              8072 non-null   float64       
 4   color_type             8072 non-null   object        
 5   length(m)              8072 non-null   float64       
 6   height(cm)             8072 non-null   float64       
 7   X1                     8072 non-null   int64         
 8   X2                     8072 non-null   int64         
 9   modified_listing_date  8072 non-null   float64       
 10  modified_issue_date    8072 non-null   float64       
 11  took_time              8072 non-null   float64       
 12  1stnum                 8072 non-null   object        
 13

In [172]:
# Drop the identifier and the raw datetime columns from both frames.
# 'modified_issue_date' and 'modified_listing_date' are deliberately kept as features.
x = train.drop(columns=['pet_id', 'issue_date', 'listing_date'])
test = test.drop(columns=['pet_id', 'issue_date', 'listing_date'])


In [173]:
x.info() # the object-dtype columns (positions 1, 9 and 10) are the categorical features passed to CatBoost

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   condition              18834 non-null  float64
 1   color_type             18834 non-null  object 
 2   length(m)              18834 non-null  float64
 3   height(cm)             18834 non-null  float64
 4   X1                     18834 non-null  int64  
 5   X2                     18834 non-null  int64  
 6   modified_listing_date  18834 non-null  float64
 7   modified_issue_date    18834 non-null  float64
 8   took_time              18834 non-null  float64
 9   1stnum                 18834 non-null  object 
 10  1st2num                18834 non-null  object 
dtypes: float64(6), int64(2), object(3)
memory usage: 1.6+ MB


In [174]:
# Positional indices of the non-numeric columns of x, passed to CatBoost as cat_features.
# They are derived from the dtypes instead of being hard-coded, so they stay correct if
# the column layout changes; for the current 11-column frame this evaluates to [1, 9, 10]
# (color_type, 1stnum, 1st2num).
categorical_fea = [
    x.columns.get_loc(col) for col in x.select_dtypes(exclude='number').columns
]

* **handle categorical variable**

In [175]:
# Names of the non-numeric (categorical) columns.
list(x.select_dtypes(exclude='number').columns)

['color_type', '1stnum', '1st2num']



# Catboost handles the categorical values, no need encode

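Check that train and test have the same set of columns. (This note appears to date from an earlier one-hot-encoding version of the notebook, where train had 97 columns and test had 95, i.e. train contained 2 extra columns after one-hot encoding that had to be removed. With CatBoost handling the categorical columns natively, both frames have the same 11 columns, as the output below shows.)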

In [176]:
# Print the column sets of the train and test feature frames so they can be compared.
for frame in (x, test):
    print(set(frame.columns))

{'took_time', 'color_type', 'height(cm)', 'X2', 'length(m)', '1stnum', '1st2num', 'modified_issue_date', 'condition', 'modified_listing_date', 'X1'}
{'took_time', 'color_type', 'height(cm)', 'X2', 'length(m)', '1stnum', '1st2num', 'modified_issue_date', 'condition', 'modified_listing_date', 'X1'}


In [177]:
# Combine the cleaned train and test feature frames again, renumbering the rows from 0.
comb_data = pd.concat([x, test], ignore_index=True)

In [178]:
# Confirm the combined frame has the 11 feature columns and no missing values.
comb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26906 entries, 0 to 26905
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   condition              26906 non-null  float64
 1   color_type             26906 non-null  object 
 2   length(m)              26906 non-null  float64
 3   height(cm)             26906 non-null  float64
 4   X1                     26906 non-null  int64  
 5   X2                     26906 non-null  int64  
 6   modified_listing_date  26906 non-null  float64
 7   modified_issue_date    26906 non-null  float64
 8   took_time              26906 non-null  float64
 9   1stnum                 26906 non-null  object 
 10  1st2num                26906 non-null  object 
dtypes: float64(6), int64(2), object(3)
memory usage: 2.3+ MB


# split data for 1st model i.e., pet_category prediction

In [179]:
# Split the combined features back into the training rows (x) and the test rows (test).
x = comb_data.iloc[:td_shape]
test = comb_data.iloc[td_shape:]

In [180]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validating model 1.
# The target here is y2 (pet_category), even though the split variables are named y1_*.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    x,
    y2,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

# model 1 build i.e pet_category prediction

1. Learning rate set to 0.113996, 1000 iterations, score = 90.48
2. Learning rate set to 0.148553, iterations = 500, 355, score = 90.83
3. Learning rate set to 0.314819, iterations = 70, score = 90.65
4. learning_rate = 0.150553,iterations=386, score = 90.61




In [181]:
from catboost import CatBoostClassifier

# Model 1 predicts pet_category. Other learning rates tried: 0.357999, 0.216294 (187
# iterations), 0.05; the settings below are configuration 2 from the list above.
# The hold-out split doubles as eval_set, and use_best_model=True keeps the iteration
# that scored best on it.
model1_params = dict(
    learning_rate=0.148553,
    iterations=355,
    cat_features=categorical_fea,
    loss_function='MultiClass',
    use_best_model=True,
)
model1 = CatBoostClassifier(**model1_params)
model1.fit(x1_train, y1_train, eval_set=(x1_test, y1_test))

0:	learn: 1.0750403	test: 1.0746283	best: 1.0746283 (0)	total: 62ms	remaining: 22s
1:	learn: 0.9027654	test: 0.9015450	best: 0.9015450 (1)	total: 109ms	remaining: 19.2s
2:	learn: 0.7812986	test: 0.7798992	best: 0.7798992 (2)	total: 160ms	remaining: 18.7s
3:	learn: 0.6844208	test: 0.6819118	best: 0.6819118 (3)	total: 226ms	remaining: 19.8s
4:	learn: 0.6122170	test: 0.6089344	best: 0.6089344 (4)	total: 299ms	remaining: 20.9s
5:	learn: 0.5532420	test: 0.5496501	best: 0.5496501 (5)	total: 367ms	remaining: 21.3s
6:	learn: 0.5102623	test: 0.5066405	best: 0.5066405 (6)	total: 440ms	remaining: 21.9s
7:	learn: 0.4705051	test: 0.4664597	best: 0.4664597 (7)	total: 500ms	remaining: 21.7s
8:	learn: 0.4397278	test: 0.4357808	best: 0.4357808 (8)	total: 562ms	remaining: 21.6s
9:	learn: 0.4137835	test: 0.4095718	best: 0.4095718 (9)	total: 617ms	remaining: 21.3s
10:	learn: 0.3933516	test: 0.3887787	best: 0.3887787 (10)	total: 660ms	remaining: 20.6s
11:	learn: 0.3770387	test: 0.3723696	best: 0.3723696 (1

<catboost.core.CatBoostClassifier at 0x7f198591c518>

In [182]:
from sklearn.metrics import f1_score, confusion_matrix

# Weighted F1 (in percent) of model 1 on the hold-out rows.
y1_pred = model1.predict(x1_test)
score = 100 * f1_score(y1_test, y1_pred, average='weighted')
score

90.83956362722982

# ***Without modified dates columns***

The learning rates below were obtained by running with different iteration counts (CatBoost picks a learning rate automatically); each learning rate was then fixed and the best number of iterations for it was searched.

1. Learning rate set to 0.244274, iterations 136, f1_score = 90.09
  1. iterations = 109, score = 90.09
  2. iterations = 101, score = 90.11
  3. iterations = 100, score = 90.06
2. Learning rate set to 0.24566, iterations 134, f1_score = 90.17
  1. iterations = 175 , score = 90.06
  2. iterations = 139, score = 90.17
  3. iterations = 131, score = 90.1754
3. Learning rate set to 0.24852, iterations 130, f1_score 90.03
  1. iterations = 1000, score = 90.24
  2. iterations = 110, score = 90.14
  3. iterations = 100, score = 90.1718
4. Learning rate set to 0.113996, iterations = 1000, f1_score = 90.21
  1. iterations = 380, score = 90.215
  2. iterations = 213, score = 90.2244
  3. iterations = 212, score = 90.2260
5. Learning rate set to 0.210812, iterations = 200, f1_score = 90.17

6. Learning rate set to 0.21947, iterations = 1000, f1_score = 90.222
  1. iterations = 292, score = 90.2501
  2. iterations = 282, score = 90.27
  3. iterations = 280, score = 90.3572

In [None]:
from sklearn.model_selection import GridSearchCV

# 3-fold grid search over the learning rates / iteration counts found manually above,
# starting from model1's configuration. Note the search is scored on accuracy, whereas
# the other cells report weighted F1. The hold-out split is forwarded to fit() as eval_set.
grid = {
    'learning_rate': [0.113996, 0.24852, 0.24566, 0.244274],
    'iterations': [131, 212, 101],
}
grid_search = GridSearchCV(estimator=model1, param_grid=grid, scoring='accuracy', cv=3)
grid_search.fit(x1_train, y1_train, eval_set=(x1_test, y1_test))
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

# build 2nd model i.e predict breed_category

We will use the output of **MODEL 1** as an input feature of **MODEL 2** .Trust me, it will increase your score.

build new dataset for model 2

In [184]:
# new_feat: model 1's predicted pet_category for every training row in x.
# x1_train is a split of x, so most of these rows were seen by model 1 during fitting.
# NOTE(review): these in-sample predictions are likely more accurate than the test-set
# predictions in output1 — consider out-of-fold predictions for the stacked feature.
new_feat=model1.predict(x)
# output1: model 1's predicted pet_category for the test rows (first submission output).
output1=model1.predict(test)
# vld1: model 1's predictions on the hold-out rows, used later for the combined score.
vld1=model1.predict(x1_test)

In [185]:
# Build the model-2 frames from x / test using the column set of comb_data.
# NOTE(review): depending on the pandas version, pd.DataFrame(frame, columns=...) may
# share underlying data with `frame`, so columns added to x2/test2 later could also
# show up in x/test — confirm before relying on x/test staying at 11 columns.
names = comb_data.columns
x2 = pd.DataFrame(x, columns=names)
test2 = pd.DataFrame(test, columns=names)

In [186]:
# Model 1's predicted pet_category for the training rows becomes the extra input
# feature 'output1' of model 2's training frame.
x2['output1']=new_feat
# Model 1's predicted pet_category for the test rows becomes the same feature in
# model 2's test frame.
test2['output1']=output1

In [187]:
x2.info() # 12 columns now: the object-dtype (categorical) columns are still at positions 1, 9 and 10; 'output1' is appended at position 11

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   condition              18834 non-null  float64
 1   color_type             18834 non-null  object 
 2   length(m)              18834 non-null  float64
 3   height(cm)             18834 non-null  float64
 4   X1                     18834 non-null  int64  
 5   X2                     18834 non-null  int64  
 6   modified_listing_date  18834 non-null  float64
 7   modified_issue_date    18834 non-null  float64
 8   took_time              18834 non-null  float64
 9   1stnum                 18834 non-null  object 
 10  1st2num                18834 non-null  object 
 11  output1                18834 non-null  int64  
dtypes: float64(6), int64(3), object(3)
memory usage: 1.7+ MB


**split data for model 2**

In [188]:
# Hold out 20% of the rows for validating model 2, using the same random_state as the
# model-1 split. The target here is y1 (breed_category), even though the split
# variables are named y2_*.
x2_train, x2_test, y2_train, y2_test = train_test_split(
    x2,
    y1,
    test_size=0.2,
    random_state=0,
)

**Build model 2 and train the model with new data**

1. Learning rate set to 0.148569743, iterations = 154, score = 91.17
2. Learning rate set to 0.148569743, iterations = 200, score = 91.37

In [189]:
# Model 2 predicts breed_category from the original features plus 'output1'.
# No learning rate is passed, so CatBoost chooses one itself (see the first log line).
# Previously tried: learning_rate=0.05, learning_rate=0.106326, iterations=176.
model2_params = dict(
    iterations=150,
    cat_features=[1, 9, 10],
    loss_function='MultiClass',
)
model2 = CatBoostClassifier(**model2_params)
model2.fit(x2_train, y2_train, eval_set=(x2_test, y2_test))

Learning rate set to 0.2353
0:	learn: 0.7609699	test: 0.7561024	best: 0.7561024 (0)	total: 29.9ms	remaining: 4.45s
1:	learn: 0.5858817	test: 0.5790348	best: 0.5790348 (1)	total: 54.6ms	remaining: 4.04s
2:	learn: 0.4740964	test: 0.4659343	best: 0.4659343 (2)	total: 81.2ms	remaining: 3.98s
3:	learn: 0.3959788	test: 0.3871215	best: 0.3871215 (3)	total: 111ms	remaining: 4.04s
4:	learn: 0.3402915	test: 0.3310740	best: 0.3310740 (4)	total: 136ms	remaining: 3.96s
5:	learn: 0.2984748	test: 0.2888590	best: 0.2888590 (5)	total: 161ms	remaining: 3.87s
6:	learn: 0.2669225	test: 0.2570997	best: 0.2570997 (6)	total: 185ms	remaining: 3.79s
7:	learn: 0.2430693	test: 0.2330696	best: 0.2330696 (7)	total: 213ms	remaining: 3.78s
8:	learn: 0.2247735	test: 0.2145805	best: 0.2145805 (8)	total: 245ms	remaining: 3.85s
9:	learn: 0.2095047	test: 0.2000898	best: 0.2000898 (9)	total: 270ms	remaining: 3.79s
10:	learn: 0.1960777	test: 0.1870619	best: 0.1870619 (10)	total: 293ms	remaining: 3.7s
11:	learn: 0.1873391	t

<catboost.core.CatBoostClassifier at 0x7f1992216160>

In [191]:
# output2: model 2's predicted breed_category for the test rows.
# model2 was fitted on x2, whose columns include the stacked 'output1' feature, so it
# must be given test2 (the test frame that carries 'output1'), not `test`, which was
# built without that column.
output2 = model2.predict(test2)
# vld2: model 2's predictions on its hold-out rows, used later for the combined score.
vld2 = model2.predict(x2_test)

In [190]:
# Weighted F1 (in percent) of model 2 on its hold-out rows.
y2_pred = model2.predict(x2_test)
score = 100 * f1_score(y2_test, y2_pred, average='weighted')
score

91.12028270883363

In [192]:
# Confusion matrix for breed_category on the hold-out rows (rows: true label, columns: predicted).
y2_pred, y2_test = np.array(y2_pred), np.array(y2_test)
cm = confusion_matrix(y_true=y2_test, y_pred=y2_pred)
cm

array([[1656,  121,    0],
       [ 213, 1463,    0],
       [   0,    0,  314]])

In [None]:
# Distinct breed_category labels present in the hold-out targets.
set(y2_test)

# Check Accuracy

In [193]:
# Combined validation score: the mean of the two weighted F1 scores (pet_category from
# model 1, breed_category from model 2), expressed in percent.
s1, s2 = (
    f1_score(truth, predicted, average='weighted')
    for truth, predicted in ((y1_test, vld1), (y2_test, vld2))
)
accuracy = 100 * ((s1 + s2) / 2)
accuracy

90.97992316803172

In [None]:
# Confusion matrix for pet_category on the hold-out rows, restricted to labels 0, 1, 2 and 4.
vld1, y1_test = np.array(vld1), np.array(y1_test)
cm = confusion_matrix(y_true=y1_test, y_pred=vld1, labels=[0, 1, 2, 4])
cm

# Create Submission file

In [203]:
# Columns of the submission file. The predictions are flattened to 1-D here because,
# in file order, this cell comes before the cell that reshapes output1/output2
# (it was originally executed after it); building the dict from un-flattened
# prediction arrays would not give valid DataFrame columns. np.ravel leaves
# already-flat arrays unchanged.
d = {
    'pet_id': test_id,
    'breed_category': np.ravel(output2),
    'pet_category': np.ravel(output1),
}

In [200]:
# Flatten both prediction arrays to one dimension.
output1, output2 = (pred.reshape(-1) for pred in (output1, output2))

In [201]:
# Both prediction arrays should have one entry per test row.
print(*(pred.shape for pred in (output2, output1)))

(8072,) (8072,)


In [204]:
# Assemble the submission table from the id / prediction columns.
datafra = pd.DataFrame(data=d)

In [205]:
# Write the submission file without the row index.
datafra.to_csv(path_or_buf='Submission5.csv', index=False)

In [None]:
# Class distribution of breed_category (y1).
y1.value_counts()

In [None]:
# Class distribution of pet_category (y2).
y2.value_counts()

In [None]:
# pet_category == 0 is rare

In [None]:
# submission5.csv got 90.33 marks