In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Load the Kyphosis dataset from the sibling "Cases" directory.
# Forward slashes are used on purpose: they resolve on Windows, Linux and
# macOS alike, whereas a backslash-separated path is only valid on Windows.
kyp = pd.read_csv('../Cases/Kyphosis/Kyphosis.csv')
kyp.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [14]:
# Integer-encode the 'Kyphosis' target column and keep the remaining
# columns as the feature matrix.
le = LabelEncoder()
y = le.fit_transform(kyp.loc[:, 'Kyphosis'])
X = kyp.drop(columns=['Kyphosis'])
print(X)
# Position in classes_ is the integer code assigned to each label.
print(le.classes_)

    Age  Number  Start
0    71       3      5
1   158       3     14
2   128       4      5
3     2       5      1
4     1       4     15
..  ...     ...    ...
76  157       3     13
77   26       7     13
78  120       2     13
79   42       7      6
80   36       4     13

[81 rows x 3 columns]
['absent' 'present']


In [15]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [16]:
# Base learners that feed the stack.
lr = LogisticRegression()
svm = SVC(probability=True, kernel='linear', random_state=24)
dtc = DecisionTreeClassifier(random_state=24)

# Final estimator that combines the base learners' outputs.
rf = RandomForestClassifier(random_state=24)

stack = StackingClassifier(
    estimators=[('LR', lr), ('SVM', svm), ('TREE', dtc)],
    final_estimator=rf,
)


In [17]:
# Train the stacked model on the training split only.
stack.fit(X=X_train, y=y_train)


In [18]:
# Score the fitted stack on the hold-out split: hard labels feed the
# accuracy, class probabilities feed the log loss.
y_pred = stack.predict(X_test)
y_pred_proba = stack.predict_proba(X_test)

print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(log_loss(y_true=y_test, y_pred=y_pred_proba))

0.72
2.001191391953967


#### With passthrough (`passthrough=True`): the final estimator also receives the original features

In [19]:
# Same base learners and final estimator as before, now with passthrough
# switched on.
stack = StackingClassifier(
    estimators=[('LR', lr), ('SVM', svm), ('TREE', dtc)],
    final_estimator=rf,
    passthrough=True,
)


In [20]:
# Train the passthrough-enabled stack on the training split only.
stack.fit(X=X_train, y=y_train)


In [21]:
# Score the passthrough stack on the hold-out split: hard labels feed the
# accuracy, class probabilities feed the log loss.
y_pred = stack.predict(X_test)
y_pred_proba = stack.predict_proba(X_test)

print(accuracy_score(y_true=y_test, y_pred=y_pred))
print(log_loss(y_true=y_test, y_pred=y_pred_proba))

0.76
1.8078240247569928


In [22]:
# List every tunable parameter name of the stack (nested estimators
# included); these are the keys a parameter grid can address.
print(stack.get_params(deep=True))


{'cv': None, 'estimators': [('LR', LogisticRegression()), ('SVM', SVC(kernel='linear', probability=True, random_state=24)), ('TREE', DecisionTreeClassifier(random_state=24))], 'final_estimator__bootstrap': True, 'final_estimator__ccp_alpha': 0.0, 'final_estimator__class_weight': None, 'final_estimator__criterion': 'gini', 'final_estimator__max_depth': None, 'final_estimator__max_features': 'sqrt', 'final_estimator__max_leaf_nodes': None, 'final_estimator__max_samples': None, 'final_estimator__min_impurity_decrease': 0.0, 'final_estimator__min_samples_leaf': 1, 'final_estimator__min_samples_split': 2, 'final_estimator__min_weight_fraction_leaf': 0.0, 'final_estimator__monotonic_cst': None, 'final_estimator__n_estimators': 100, 'final_estimator__n_jobs': None, 'final_estimator__oob_score': False, 'final_estimator__random_state': 24, 'final_estimator__verbose': 0, 'final_estimator__warm_start': False, 'final_estimator': RandomForestClassifier(random_state=24), 'n_jobs': None, 'passthrough

In [23]:
# Search space for the grid search. Keys follow the
# "<estimator name>__<parameter>" convention of the stack built above.
params = dict(
    LR__C=np.linspace(0.01, 3, num=5),
    SVM__C=np.linspace(0.01, 3, num=5),
    TREE__max_depth=[None, 3, 2, 4],
    final_estimator__max_features=[2, 3],
    passthrough=[False, True],
)

In [24]:
# Shuffled, stratified 5-fold splitter with a fixed seed.
kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)

# Exhaustive search over `params`, scored by negative log loss (higher is
# better) and run on all available cores.
gcv = GridSearchCV(
    estimator=stack,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
    n_jobs=-1,
)
# The search is cross-validated on the full data set (X, y), not on the
# earlier train split.
gcv.fit(X, y)

print(gcv.best_score_)
print(gcv.best_params_)

-0.41213495226387675
{'LR__C': 1.5050000000000001, 'SVM__C': 0.7575000000000001, 'TREE__max_depth': 4, 'final_estimator__max_features': 3, 'passthrough': True}
