In [69]:
import catboost
from catboost import CatBoostClassifier, CatBoostRegressor
import xgboost
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [49]:
hr_data = pd.read_csv('HR_comma_sep.csv')
X = hr_data.drop(columns='left')
y = hr_data.left
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, test_size=0.3, stratify=y)
X

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,0,sales,low
1,0.80,0.86,5,262,6,0,0,sales,medium
2,0.10,0.77,6,247,4,0,0,sales,low
3,0.92,0.85,5,259,5,0,0,sales,low
4,0.89,1.00,5,224,5,0,0,sales,low
...,...,...,...,...,...,...,...,...,...
14990,0.40,0.57,2,151,3,0,0,support,low
14991,0.37,0.48,2,160,3,0,0,support,low
14992,0.37,0.53,2,143,3,0,0,support,low
14993,0.11,0.96,6,280,4,0,0,support,low


In [50]:
le = LabelEncoder()
y = le.fit_transform(y)
dict(zip(list(np.unique(y)),le.classes_))

{0: 0, 1: 1}

### CatBoost without One Hot Encoding

In [51]:
cbc = CatBoostClassifier(random_state=24, cat_features=['Department','salary'])
cbc.fit(X_train,y_train)

Learning rate set to 0.028113
0:	learn: 0.6521748	total: 58.7ms	remaining: 58.6s
1:	learn: 0.6108150	total: 101ms	remaining: 50.5s
2:	learn: 0.5713910	total: 150ms	remaining: 49.8s
3:	learn: 0.5320913	total: 199ms	remaining: 49.5s
4:	learn: 0.5001841	total: 242ms	remaining: 48.1s
5:	learn: 0.4684059	total: 289ms	remaining: 47.9s
6:	learn: 0.4390173	total: 337ms	remaining: 47.8s
7:	learn: 0.4134991	total: 383ms	remaining: 47.5s
8:	learn: 0.3904645	total: 432ms	remaining: 47.5s
9:	learn: 0.3706840	total: 483ms	remaining: 47.8s
10:	learn: 0.3509874	total: 531ms	remaining: 47.8s
11:	learn: 0.3328178	total: 579ms	remaining: 47.7s
12:	learn: 0.3167978	total: 632ms	remaining: 48s
13:	learn: 0.3023680	total: 679ms	remaining: 47.8s
14:	learn: 0.2893689	total: 728ms	remaining: 47.8s
15:	learn: 0.2766897	total: 776ms	remaining: 47.7s
16:	learn: 0.2649461	total: 824ms	remaining: 47.7s
17:	learn: 0.2543803	total: 873ms	remaining: 47.6s
18:	learn: 0.2438560	total: 921ms	remaining: 47.6s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x2887c2b8740>

In [52]:
y_pred = cbc.predict(X_test)
accuracy_score(y_test, y_pred)

0.9822182707268282

In [53]:
cbc = CatBoostClassifier(random_state=24, cat_features=['Department','salary'])

kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=24)
params= {'max_depth':[2,3,4],'n_estimators':[10,50],'learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(cbc, param_grid=params,cv =kfold, scoring='roc_auc', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 0.6925472	total: 9.66ms	remaining: 87ms
1:	learn: 0.6919348	total: 25.5ms	remaining: 102ms
2:	learn: 0.6913254	total: 47ms	remaining: 110ms
3:	learn: 0.6907172	total: 57.8ms	remaining: 86.7ms
4:	learn: 0.6901085	total: 68.2ms	remaining: 68.2ms
5:	learn: 0.6895010	total: 82.6ms	remaining: 55ms
6:	learn: 0.6888947	total: 97.1ms	remaining: 41.6ms
7:	learn: 0.6882913	total: 112ms	remaining: 27.9ms
8:	learn: 0.6876875	total: 125ms	remaining: 13.9ms
9:	learn: 0.6870981	total: 138ms	remaining: 0us
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.911 total time=   0.1s
0:	learn: 0.6925453	total: 18.3ms	remaining: 164ms
1:	learn: 0.6919295	total: 34.1ms	remaining: 136ms
2:	learn: 0.6913171	total: 53ms	remaining: 124ms
3:	learn: 0.6907059	total: 64ms	remaining: 95.9ms
4:	learn: 0.6900937	total: 75.1ms	remaining: 75.1ms
5:	learn: 0.6894828	total: 89.8ms	remaining: 59.9ms
6:	learn: 0.6888731	total: 104m

In [54]:
print("Score :",gcv.best_score_)
print("Params :",gcv.best_params_)

Score : 0.9893002633546979
Params : {'learning_rate': 1.0, 'max_depth': 4, 'n_estimators': 50}


# Medical Cost Insurance Dataset

In [62]:
med = pd.read_csv('insurance.csv')
med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [65]:
X = med.drop(columns='charges')
y = med.charges

In [70]:
cbc = CatBoostRegressor(random_state=24, cat_features = list(X.columns[X.dtypes==object]))

kfold = KFold(shuffle=True, n_splits=5, random_state=24)
params= {'max_depth':[2,3,4],'n_estimators':[10,50],'learning_rate': np.linspace(0.001,1,5)}
gcv = GridSearchCV(cbc, param_grid=params,cv =kfold, scoring='r2', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 12036.2666510	total: 7.52ms	remaining: 67.7ms
1:	learn: 12027.5885074	total: 17.1ms	remaining: 68.5ms
2:	learn: 12018.7579696	total: 32.7ms	remaining: 76.3ms
3:	learn: 12010.0080421	total: 44.3ms	remaining: 66.5ms
4:	learn: 12001.6612206	total: 55.1ms	remaining: 55.1ms
5:	learn: 11992.8131380	total: 67.2ms	remaining: 44.8ms
6:	learn: 11984.5229977	total: 85.6ms	remaining: 36.7ms
7:	learn: 11975.9771850	total: 99ms	remaining: 24.8ms
8:	learn: 11967.4506001	total: 109ms	remaining: 12.1ms
9:	learn: 11958.6367854	total: 119ms	remaining: 0us
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.014 total time=   0.1s
0:	learn: 12050.3139519	total: 6.98ms	remaining: 62.8ms
1:	learn: 12041.5605243	total: 13.8ms	remaining: 55.1ms
2:	learn: 12032.6556923	total: 24.2ms	remaining: 56.4ms
3:	learn: 12023.9006792	total: 40.5ms	remaining: 60.7ms
4:	learn: 12015.5102852	total: 51.9ms	remaining: 51.9ms
5:	learn:

In [71]:
print("Score :",gcv.best_score_)
print("Params :",gcv.best_params_)

Score : 0.8584148378430502
Params : {'learning_rate': 0.5005, 'max_depth': 2, 'n_estimators': 50}
