In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.tree import plot_tree 

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training and test tables; the first CSV column becomes the row index.
train, test = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Features: every column except the target, with categorical columns one-hot
# encoded (the first level of each is dropped as the reference level).
X_train = pd.get_dummies(train.drop(columns='Status'), drop_first=True)

# Target: encode the 'Status' labels as integer codes. le.classes_ lists the
# original labels in the order of their codes.
le = LabelEncoder().fit(train['Status'])
y_train = le.transform(train['Status'])
print(le.classes_)

['C' 'CL' 'D']


In [4]:
# Base learner and a bagging ensemble of 30 such trees, both seeded.
dtc = DecisionTreeClassifier(random_state=24)
bagg = BaggingClassifier(dtc, n_estimators=30, random_state=24)

# Grid over the wrapped tree's hyper-parameters; the 'estimator__' prefix
# routes each setting through the bagging wrapper to the tree.
params = dict(
    estimator__min_samples_split=np.arange(2, 35, 5),
    estimator__min_samples_leaf=np.arange(1, 35, 5),
    estimator__max_depth=[None, 4, 3, 2],
)

# Shuffled, stratified 5-fold cross-validation with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Exhaustive search scored by negative log loss (closer to 0 is better),
# using all available cores.
gcv = GridSearchCV(
    estimator=bagg,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
    n_jobs=-1,
)
gcv.fit(X_train, y_train)

print(gcv.best_params_)
print(gcv.best_score_)

{'estimator__max_depth': None, 'estimator__min_samples_leaf': 26, 'estimator__min_samples_split': 2}
-0.4876000722662841


In [5]:
### Inferencing
# Best bagging ensemble found by the grid search (GridSearchCV refits it on the
# whole training set by default). The name `best_tree` is kept as-is.
best_tree = gcv.best_estimator_

# One-hot encode the test set and project it onto the exact feature columns the
# model was trained on. Encoding test independently with drop_first=True can
# drop a different reference level, or omit a column entirely, when a category
# is absent from the test data. Instead, keep every level here and let reindex
# select the training columns: missing ones are filled with 0, and columns the
# model never saw are discarded.
dum_tst = pd.get_dummies(test).reindex(columns=X_train.columns, fill_value=0)

# Class-membership probabilities, one row per test sample.
y_pred_prob = best_tree.predict_proba(dum_tst)

In [7]:
# Show the predicted probability matrix (one row per test sample, one column
# per class); NumPy abbreviates long arrays with '...'.
print(y_pred_prob)

[[0.74294564 0.0257691  0.23128526]
 [0.75037112 0.09631734 0.15331153]
 [0.08902969 0.09990862 0.81106169]
 ...
 [0.88097891 0.02532051 0.09370058]
 [0.98206032 0.01068063 0.00725905]
 [0.27471392 0.04763809 0.677648  ]]


In [6]:
# Build the submission table: one probability column per class, named after the
# labels the encoder learned. predict_proba orders its columns by encoded class
# (0, 1, 2, ...), which is the order of le.classes_, so deriving the names from
# the encoder keeps each column tied to the right label. A mismatch in the
# number of classes raises an error instead of silently mislabelling columns.
proba_columns = [f"Status_{label}" for label in le.classes_]
submit = pd.DataFrame(y_pred_prob, columns=proba_columns)

# The test index (first CSV column) is the row identifier expected as 'id'.
submit.insert(0, 'id', test.index.to_numpy())

submit.to_csv("sbt_dtc.csv", index=False)