In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [2]:
kyp = pd.read_csv(r'..\Cases\Kyphosis\Kyphosis.csv')
kyp.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [3]:
le = LabelEncoder()
y = le.fit_transform(kyp['Kyphosis'])
X = kyp.drop('Kyphosis', axis=1)
print(X)
print(le.classes_)

    Age  Number  Start
0    71       3      5
1   158       3     14
2   128       4      5
3     2       5      1
4     1       4     15
..  ...     ...    ...
76  157       3     13
77   26       7     13
78  120       2     13
79   42       7      6
80   36       4     13

[81 rows x 3 columns]
['absent' 'present']


In [4]:
svm_l = SVC(kernel='linear', probability=True, random_state=24)
std_scalar = StandardScaler()
pipe_l = Pipeline([('SCL', std_scalar), ('SVM',svm_l)])

In [5]:
svm_r = SVC(kernel='rbf', probability=True, random_state=24)
std_scalar = StandardScaler()
pipe_r = Pipeline([('SCL', std_scalar), ('SVM',svm_l)])

In [6]:
lr = LogisticRegression()
lda = LinearDiscriminantAnalysis()
dtc = DecisionTreeClassifier(random_state=24)

In [7]:
voting = VotingClassifier([('LR', lr), ('SVM_L', pipe_l),
                           ('SVM_R', pipe_r), ('LDA', lda),
                           ('TREE', dtc)], voting='soft')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=2021,stratify=y)

In [9]:
voting.fit(X_train, y_train)

In [10]:
y_pred = voting.predict(X_test)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0]


In [11]:
accuracy_score(y_test, y_pred)

0.8

In [12]:
y_pred_proba = voting.predict_proba(X_test)
print(y_pred_proba)
print(log_loss(y_test, y_pred_proba))

[[0.95538873 0.04461127]
 [0.95517135 0.04482865]
 [0.97577151 0.02422849]
 [0.96754412 0.03245588]
 [0.95565851 0.04434149]
 [0.87994952 0.12005048]
 [0.61596098 0.38403902]
 [0.65572756 0.34427244]
 [0.83766533 0.16233467]
 [0.92937554 0.07062446]
 [0.960116   0.039884  ]
 [0.98167197 0.01832803]
 [0.5508986  0.4491014 ]
 [0.98058315 0.01941685]
 [0.66259627 0.33740373]
 [0.94463141 0.05536859]
 [0.05797014 0.94202986]
 [0.98089193 0.01910807]
 [0.89954408 0.10045592]
 [0.97166495 0.02833505]
 [0.98478822 0.01521178]
 [0.3241657  0.6758343 ]
 [0.98509823 0.01490177]
 [0.96691971 0.03308029]
 [0.95889136 0.04110864]]
0.46960833856709255


In [13]:
y_pred_proba = voting.predict_proba(X_test)[:,1]
print(y_pred_proba)

print(log_loss(y_test, y_pred_proba))

[0.04461127 0.04482865 0.02422849 0.03245588 0.04434149 0.12005048
 0.38403902 0.34427244 0.16233467 0.07062446 0.039884   0.01832803
 0.4491014  0.01941685 0.33740373 0.05536859 0.94202986 0.01910807
 0.10045592 0.02833505 0.01521178 0.6758343  0.01490177 0.03308029
 0.04110864]
0.46960833856709255


### GridSearch

In [14]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
print(voting.get_params())

{'estimators': [('LR', LogisticRegression()), ('SVM_L', Pipeline(steps=[('SCL', StandardScaler()),
                ('SVM',
                 SVC(kernel='linear', probability=True, random_state=24))])), ('SVM_R', Pipeline(steps=[('SCL', StandardScaler()),
                ('SVM',
                 SVC(kernel='linear', probability=True, random_state=24))])), ('LDA', LinearDiscriminantAnalysis()), ('TREE', DecisionTreeClassifier(random_state=24))], 'flatten_transform': True, 'n_jobs': None, 'verbose': False, 'voting': 'soft', 'weights': None, 'LR': LogisticRegression(), 'SVM_L': Pipeline(steps=[('SCL', StandardScaler()),
                ('SVM',
                 SVC(kernel='linear', probability=True, random_state=24))]), 'SVM_R': Pipeline(steps=[('SCL', StandardScaler()),
                ('SVM',
                 SVC(kernel='linear', probability=True, random_state=24))]), 'LDA': LinearDiscriminantAnalysis(), 'TREE': DecisionTreeClassifier(random_state=24), 'LR__C': 1.0, 'LR__class_weight': Non

In [15]:
# Sir Code
params = {'SVM_L__SVM__C': np.linspace(0.001, 3,5), 
          'SVM_R__SVM__C': np.linspace(0.001, 3,5), 
          'SVM_R__SVM__gamma':np.linspace(0.001, 3,5),
          'LR__C': np.linspace(0.001, 3,5), 
          'TREE__max_depth': [None,3,2], 
          }
gcv = GridSearchCV(voting, param_grid=params, cv=kfold, 
                   scoring='neg_log_loss', n_jobs=-1)
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

-0.4160380599620659
{'LR__C': 3.0, 'SVM_L__SVM__C': 0.75075, 'SVM_R__SVM__C': 0.75075, 'SVM_R__SVM__gamma': 0.001, 'TREE__max_depth': 2}


In [None]:
params = {'SVM_L__SVM__C': np.linspace(0.001, 3,10), 
          'SVM_L__SVM__gamma':np.linspace(0.001, 3,10), 
          'SVM_R__SVM__C': np.linspace(0.001, 3,10), 
          'SVM_R__SVM__gamma':np.linspace(0.001, 3,10),
          'LR__C': np.linspace(0.001, 3,10), 
          'TREE__max_depth': [None,4,3,2], 
          'TREE__min_samples_split': [2,4,6,10,20],
          'TREE__min_samples_leaf': [1,5,10,15]
          }
gcv = GridSearchCV(voting, param_grid=params, cv=kfold, 
                   scoring='neg_log_loss', n_jobs=-1)
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

# DO NOT RUN 
# max runtime: 2414 mins