In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:

le = LabelEncoder()

y = le.fit_transform(train['Status'])
X = train.drop('Status', axis=1)

In [5]:
X = pd.get_dummies(X, drop_first=True)

In [6]:
lightxbc = LGBMClassifier(random_state = 24)
xgbc = XGBClassifier(random_state = 24)
catbc = CatBoostClassifier(random_state=24)

In [16]:
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
print(xgbc.get_params())
print(catbc.get_params())
print(lightxbc.get_params())

{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 24, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
{'random_state': 24}
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importa

In [29]:
params = {'learning_rate': np.linspace(0.001, 0.9,10), 
          'max_depth': [None,3,2,4], 
          'n_estimators':[25, 50 ,100],}

xgbc_gcv = GridSearchCV(xgbc, param_grid=params, cv=kfold, scoring='neg_log_loss', n_jobs=-1)

xgbc_gcv.fit(X, y)

print(xgbc_gcv.best_params_)
print(xgbc_gcv.best_score_)

{'learning_rate': 0.30066666666666664, 'max_depth': 2, 'n_estimators': 100}
-0.4487740234341403


In [30]:
test1 = pd.get_dummies(test, drop_first=True)
y_pred_prob = xgbc_gcv.best_estimator_.predict_proba(test1)
test1

Unnamed: 0,id,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Drug_Placebo,Sex_M,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Edema_S,Edema_Y
0,7905,3839,19724,1.2,546.0,3.37,65.0,1636.0,151.90,90.0,430.0,10.6,2.0,False,False,False,True,False,False,False
1,7906,2468,14975,1.1,660.0,4.22,94.0,1257.0,151.90,155.0,227.0,10.0,2.0,False,False,False,False,False,False,False
2,7907,51,13149,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0,True,False,False,True,False,False,True
3,7908,2330,20510,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0,False,False,False,False,False,False,False
4,7909,1615,21904,1.4,277.0,2.97,121.0,1110.0,125.00,126.0,221.0,9.8,1.0,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13171,2870,12279,1.3,302.0,3.43,75.0,1345.0,145.00,44.0,181.0,10.6,3.0,True,False,False,False,False,False,False
5267,13172,1770,24803,0.5,219.0,4.09,121.0,663.0,79.05,94.0,311.0,9.7,3.0,True,False,False,False,False,False,False
5268,13173,3707,16990,0.8,315.0,4.09,13.0,1637.0,170.50,70.0,426.0,10.9,3.0,False,False,False,True,False,False,False
5269,13174,1216,11773,0.7,329.0,3.80,52.0,678.0,57.00,126.0,306.0,10.2,1.0,True,False,False,False,False,False,False


In [31]:
submit = pd.DataFrame({"id":test['id'],"Status_C":y_pred_prob[:,0], 'Status_CL':y_pred_prob[:,1], 'Status_D':y_pred_prob[:,2]})
submit.to_csv("xgboost.csv", index=False)

In [32]:
catbcgcv = GridSearchCV(catbc, param_grid=params, cv=kfold, scoring='neg_log_loss', n_jobs=-1)

catbcgcv.fit(X, y)

print(catbcgcv.best_params_)
print(catbcgcv.best_score_)

0:	learn: 0.7531913	total: 5.64ms	remaining: 558ms
1:	learn: 0.6407350	total: 10.3ms	remaining: 504ms
2:	learn: 0.5766822	total: 14.3ms	remaining: 461ms
3:	learn: 0.5413125	total: 18.2ms	remaining: 437ms
4:	learn: 0.5204410	total: 22.5ms	remaining: 427ms
5:	learn: 0.5064707	total: 27.2ms	remaining: 426ms
6:	learn: 0.4983057	total: 31.8ms	remaining: 422ms
7:	learn: 0.4924942	total: 36ms	remaining: 414ms
8:	learn: 0.4883399	total: 40.2ms	remaining: 406ms
9:	learn: 0.4814491	total: 44.5ms	remaining: 401ms
10:	learn: 0.4775758	total: 48.4ms	remaining: 391ms
11:	learn: 0.4739194	total: 52.1ms	remaining: 382ms
12:	learn: 0.4701048	total: 56.8ms	remaining: 380ms
13:	learn: 0.4674638	total: 61.2ms	remaining: 376ms
14:	learn: 0.4643852	total: 65.2ms	remaining: 370ms
15:	learn: 0.4617020	total: 69.1ms	remaining: 363ms
16:	learn: 0.4593360	total: 73.3ms	remaining: 358ms
17:	learn: 0.4555715	total: 77.4ms	remaining: 352ms
18:	learn: 0.4532281	total: 81.5ms	remaining: 348ms
19:	learn: 0.4515987	tot

In [33]:
y_pred_prob = catbcgcv.best_estimator_.predict_proba(test1)
submit = pd.DataFrame({"id":test['id'],"Status_C":y_pred_prob[:,0], 'Status_CL':y_pred_prob[:,1], 'Status_D':y_pred_prob[:,2]})
submit.to_csv("cat.csv", index=False)

In [34]:
lightxbc_gcv = GridSearchCV(lightxbc, param_grid=params, cv=kfold, scoring='neg_log_loss', n_jobs=-1)

lightxbc_gcv.fit(X, y)

print(lightxbc_gcv.best_params_)
print(lightxbc_gcv.best_score_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2185
[LightGBM] [Info] Number of data points in the train set: 7905, number of used features: 20
[LightGBM] [Info] Start training from score -0.465082
[LightGBM] [Info] Start training from score -3.358480
[LightGBM] [Info] Start training from score -1.087291
{'learning_rate': 0.30066666666666664, 'max_depth': 2, 'n_estimators': 100}
-0.4495427545074241


In [35]:
y_pred_prob = lightxbc_gcv.best_estimator_.predict_proba(test1)
submit = pd.DataFrame({"id":test['id'],"Status_C":y_pred_prob[:,0], 'Status_CL':y_pred_prob[:,1], 'Status_D':y_pred_prob[:,2]})
submit.to_csv("light.csv", index=False)