In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Kyphosis dataset from a path relative to the notebook.
# Forward slashes are used because a backslash is a path separator only on
# Windows; this form resolves on Windows, Linux and macOS alike.
kyp = pd.read_csv('../Cases/Kyphosis/Kyphosis.csv')
# Preview the first rows as the cell output.
kyp.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [3]:
# Features: every column except the 'Kyphosis' target.
X = kyp.drop(columns='Kyphosis')

# Target: the 'Kyphosis' labels encoded as integers by the label encoder.
le = LabelEncoder()
y = le.fit_transform(kyp['Kyphosis'])

print(X)
print(le.classes_)

    Age  Number  Start
0    71       3      5
1   158       3     14
2   128       4      5
3     2       5      1
4     1       4     15
..  ...     ...    ...
76  157       3     13
77   26       7     13
78  120       2     13
79   42       7      6
80   36       4     13

[81 rows x 3 columns]
['absent' 'present']


In [4]:
# Gradient boosting classifier; only the random seed is set explicitly.
gbm = GradientBoostingClassifier(
    random_state=24,
)

In [5]:
# Hold out 30% of the rows for testing, stratifying the split on y with a
# fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=24,
    stratify=y,
)

In [6]:
# Fit on the training split, then predict class labels for the test split.
y_pred = gbm.fit(X_train, y_train).predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1])

In [7]:
# Accuracy of the predicted labels against the held-out test labels.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.68

In [8]:
# Per-class predicted probabilities for the test split.
y_pred_proba = gbm.predict_proba(X_test)
print(y_pred_proba)

# Log loss of those probabilities against the test labels.
print(log_loss(y_true=y_test, y_pred=y_pred_proba))

[[9.99952357e-01 4.76434186e-05]
 [2.43443781e-02 9.75655622e-01]
 [9.88314097e-01 1.16859032e-02]
 [2.83196103e-04 9.99716804e-01]
 [6.60063034e-04 9.99339937e-01]
 [4.42293030e-02 9.55770697e-01]
 [9.98663464e-01 1.33653632e-03]
 [9.98639575e-01 1.36042467e-03]
 [9.99840173e-01 1.59827074e-04]
 [4.11966673e-02 9.58803333e-01]
 [9.36209748e-01 6.37902523e-02]
 [9.99892670e-01 1.07330346e-04]
 [1.04454837e-03 9.98955452e-01]
 [9.99715394e-01 2.84605580e-04]
 [9.98639575e-01 1.36042467e-03]
 [1.25638715e-02 9.87436129e-01]
 [7.48764724e-01 2.51235276e-01]
 [9.98495665e-01 1.50433461e-03]
 [1.19322190e-01 8.80677810e-01]
 [9.99892670e-01 1.07330346e-04]
 [1.39662402e-01 8.60337598e-01]
 [9.99681229e-01 3.18770771e-04]
 [9.97655284e-01 2.34471631e-03]
 [6.15042923e-04 9.99384957e-01]
 [1.25638715e-02 9.87436129e-01]]
1.6160680120237196


In [9]:
# Five shuffled, stratified folds with a fixed seed for repeatable splits.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=24,
)

# Show the estimator's current hyper-parameter settings.
print(gbm.get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 24, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [11]:
# Hyper-parameter grid explored for the gradient boosting model.
params = {
    'learning_rate': np.linspace(0.001, 0.9, 10),
    'max_depth': [None, 3, 2, 4],
    'n_estimators': [25, 50, 100],
}

# Exhaustive grid search scored by negative log loss over the stratified
# folds, using all available cores.
gcv = GridSearchCV(
    estimator=gbm,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
    n_jobs=-1,
)
gcv.fit(X, y)

print(gcv.best_score_)
print(gcv.best_params_)

-0.48414542978355896
{'learning_rate': 0.10088888888888889, 'max_depth': 2, 'n_estimators': 25}


In [12]:
# Tabulate the grid search's cross-validation results, one row per candidate.
pd_cv = pd.DataFrame(data=gcv.cv_results_)
pd_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.086155,0.021436,0.007446,0.006801,0.001,,25,"{'learning_rate': 0.001, 'max_depth': None, 'n...",-0.537613,-0.492105,-0.474406,-0.476208,-0.562283,-0.508523,0.035254,12
1,0.172038,0.037894,0.008626,0.004847,0.001,,50,"{'learning_rate': 0.001, 'max_depth': None, 'n...",-0.527908,-0.499371,-0.465147,-0.468687,-0.556230,-0.503468,0.034860,8
2,0.344314,0.026314,0.009591,0.007172,0.001,,100,"{'learning_rate': 0.001, 'max_depth': None, 'n...",-0.511680,-0.514238,-0.448333,-0.456616,-0.547400,-0.495653,0.037532,3
3,0.070374,0.008464,0.007201,0.007845,0.001,3,25,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",-0.538514,-0.488916,-0.478638,-0.478370,-0.557077,-0.508303,0.032995,11
4,0.113857,0.009846,0.005638,0.000596,0.001,3,50,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",-0.529599,-0.492960,-0.472731,-0.472401,-0.545459,-0.502630,0.029876,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.091044,0.013620,0.007046,0.007672,0.9,2,50,"{'learning_rate': 0.9, 'max_depth': 2, 'n_esti...",-1.683627,-1.355927,-0.737481,-1.324479,-1.490426,-1.318388,0.316827,48
116,0.176401,0.019434,0.007930,0.006466,0.9,2,100,"{'learning_rate': 0.9, 'max_depth': 2, 'n_esti...",-2.688398,-2.338330,-1.428659,-2.462272,-2.314448,-2.246421,0.429824,75
117,0.080579,0.009872,0.009378,0.007657,0.9,4,25,"{'learning_rate': 0.9, 'max_depth': 4, 'n_esti...",-2.269810,-2.234582,-1.043880,-1.854586,-3.750435,-2.230659,0.878765,73
118,0.110232,0.005336,0.000000,0.000000,0.9,4,50,"{'learning_rate': 0.9, 'max_depth': 4, 'n_esti...",-3.114006,-3.056522,-1.609058,-2.800792,-4.450496,-3.006175,0.904914,89
