# Train Set Operation

In [1]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, roc_auc_score, log_loss
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

In [31]:
cancer = pd.read_csv('BreastCancer.csv',index_col=0)
cancer

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign
...,...,...,...,...,...,...,...,...,...,...
1369821,10,10,10,10,5,10,10,10,7,Malignant
1371026,5,10,10,10,4,10,5,6,3,Malignant
1371920,5,1,1,1,2,1,3,2,1,Benign
8233704,4,1,1,1,1,1,2,1,1,Benign


In [2]:
cancer = pd.read_csv('BreastCancer.csv',index_col=0)
X = cancer.drop(columns='Class')
y = cancer.Class
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, stratify=y,test_size=0.3)

In [3]:
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
nb = GaussianNB()
log = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=10, shuffle=True,random_state=24)
stack = StackingClassifier(estimators=[('dtc',dtc),('svm',svm),('knn',knn),('nb',nb)], 
                          final_estimator=log)

In [4]:
stack.fit(X_train,y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.9714285714285714


In [5]:
y_pred_proba = stack.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test,y_pred_proba))

0.990841384863124


In [6]:
# Using Passthrough

knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
nb = GaussianNB()
log = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=10, shuffle=True,random_state=24)
stack = StackingClassifier(estimators=[('dtc',dtc),('svm',svm),('knn',knn),('nb',nb)], 
                          final_estimator=log, passthrough=True, verbose=0)

In [7]:
stack.fit(X_train,y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.9714285714285714


In [8]:
y_pred_proba = stack.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test,y_pred_proba))

0.9959742351046699


# Glass Dataset

In [9]:
glass = pd.read_csv('Glass.csv')
glass

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,building_windows_float_processed
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,building_windows_float_processed
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,headlamps
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,headlamps
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,headlamps
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,headlamps


In [10]:
X = glass.drop(columns='Type')
y = glass.Type
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, stratify=y,test_size=0.3)

In [11]:
# Without Passthrough
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
nb = GaussianNB()
log = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=10, shuffle=True,random_state=24)

stack = StackingClassifier(estimators=[('dtc',dtc),('svm',svm),('knn',knn),('nb',nb)], 
                          final_estimator=log)

In [12]:
stack.fit(X_train,y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7230769230769231


In [13]:
y_pred_proba = stack.predict_proba(X_test)
print(log_loss(y_test,y_pred_proba))

0.7261085625838181


In [14]:
# With Passthrough
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
nb = GaussianNB()
log = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=10, shuffle=True,random_state=24)
stack = StackingClassifier(estimators=[('dtc',dtc),('svm',svm),('knn',knn),('nb',nb)], 
                          final_estimator=log, passthrough=True)

In [15]:
stack.fit(X_train,y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7076923076923077


In [16]:
y_pred_proba = stack.predict_proba(X_test)
print(log_loss(y_test,y_pred_proba))

0.774782003616886


In [17]:
# With different final estimator

knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
nb = GaussianNB()
log = LogisticRegression(random_state=24)
rf = RandomForestClassifier(n_estimators=150, max_depth=10, min_samples_split=20)
kfold = StratifiedKFold(n_splits=10, shuffle=True,random_state=24)

stack = StackingClassifier(estimators=[('dtc',dtc),('svm',svm),('knn',knn),('nb',nb)], 
                          final_estimator=rf)

In [18]:
stack.fit(X_train,y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.6461538461538462


In [19]:
y_pred_proba = stack.predict_proba(X_test)
print(log_loss(y_test,y_pred_proba))

0.8934664102357298


In [20]:
# Using Passthrough with different(randomforest) final estimator

knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
nb = GaussianNB()
log = LogisticRegression(random_state=24)
rf = RandomForestClassifier()
kfold = StratifiedKFold(n_splits=5, shuffle=True,random_state=24)

stack = StackingClassifier(estimators=[('dtc',dtc),('svm',svm),('knn',knn),('nb',nb)], 
                          final_estimator=rf, passthrough=True, verbose=0)

In [21]:
stack.fit(X_train,y_train)
y_pred = stack.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.7230769230769231


In [22]:
y_pred_proba = stack.predict_proba(X_test)
print(log_loss(y_test,y_pred_proba))

1.2247038155619567


In [23]:
# Using GCV

knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)
nb = GaussianNB()
log = LogisticRegression(random_state=24)
rf = RandomForestClassifier(random_state=24)
kfold = StratifiedKFold(n_splits=5, shuffle=True,random_state=24)

stack = StackingClassifier(estimators=[('dtc',dtc),('svm',svm),('knn',knn),('nb',nb)],final_estimator=rf)
stack.get_params()

{'cv': None,
 'estimators': [('dtc', DecisionTreeClassifier(random_state=24)),
  ('svm', SVC(probability=True, random_state=24)),
  ('knn', KNeighborsClassifier()),
  ('nb', GaussianNB())],
 'final_estimator__bootstrap': True,
 'final_estimator__ccp_alpha': 0.0,
 'final_estimator__class_weight': None,
 'final_estimator__criterion': 'gini',
 'final_estimator__max_depth': None,
 'final_estimator__max_features': 'sqrt',
 'final_estimator__max_leaf_nodes': None,
 'final_estimator__max_samples': None,
 'final_estimator__min_impurity_decrease': 0.0,
 'final_estimator__min_samples_leaf': 1,
 'final_estimator__min_samples_split': 2,
 'final_estimator__min_weight_fraction_leaf': 0.0,
 'final_estimator__monotonic_cst': None,
 'final_estimator__n_estimators': 100,
 'final_estimator__n_jobs': None,
 'final_estimator__oob_score': False,
 'final_estimator__random_state': 24,
 'final_estimator__verbose': 0,
 'final_estimator__warm_start': False,
 'final_estimator': RandomForestClassifier(random_state

In [24]:
params = {'dtc__max_depth':[3,5,None],
          'final_estimator__max_depth': [3,5, None],
          'passthrough':[True,False],
          'svm__C': np.linspace(0.001,3,5)}
gcv = GridSearchCV(stack, param_grid=params, cv=kfold, scoring='neg_log_loss', verbose=3)
gcv.fit(X,y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END dtc__max_depth=3, final_estimator__max_depth=3, passthrough=True, svm__C=0.001;, score=-0.766 total time=   0.2s
[CV 2/5] END dtc__max_depth=3, final_estimator__max_depth=3, passthrough=True, svm__C=0.001;, score=-0.870 total time=   0.3s
[CV 3/5] END dtc__max_depth=3, final_estimator__max_depth=3, passthrough=True, svm__C=0.001;, score=-0.873 total time=   0.2s
[CV 4/5] END dtc__max_depth=3, final_estimator__max_depth=3, passthrough=True, svm__C=0.001;, score=-0.878 total time=   0.2s
[CV 5/5] END dtc__max_depth=3, final_estimator__max_depth=3, passthrough=True, svm__C=0.001;, score=-0.954 total time=   0.2s
[CV 1/5] END dtc__max_depth=3, final_estimator__max_depth=3, passthrough=True, svm__C=0.75075;, score=-0.770 total time=   0.3s
[CV 2/5] END dtc__max_depth=3, final_estimator__max_depth=3, passthrough=True, svm__C=0.75075;, score=-0.842 total time=   0.2s
[CV 3/5] END dtc__max_depth=3, final_estimator__max_

In [25]:
pd_cv = pd.DataFrame(gcv.cv_results_)
pd_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dtc__max_depth,param_final_estimator__max_depth,param_passthrough,param_svm__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.312999,0.024812,0.0134,0.000489,3.0,3.0,True,0.001,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.765846,-0.870271,-0.872911,-0.87828,-0.954437,-0.868349,0.060069,32
1,0.303761,0.022624,0.011799,0.000748,3.0,3.0,True,0.75075,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.769784,-0.841733,-0.891599,-0.886229,-0.920041,-0.861877,0.052434,30
2,0.279123,0.006467,0.0122,0.00098,3.0,3.0,True,1.5005,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.759323,-0.845319,-0.890372,-0.893984,-0.918454,-0.86149,0.056275,29
3,0.276089,0.002661,0.011722,0.00075,3.0,3.0,True,2.25025,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.765791,-0.851171,-0.877625,-0.874732,-0.929391,-0.859742,0.053471,28
4,0.287139,0.013083,0.012199,0.001165,3.0,3.0,True,3.0,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.770461,-0.84923,-0.873051,-0.877874,-0.92647,-0.859417,0.051068,27
5,0.306124,0.011169,0.012203,0.001162,3.0,3.0,False,0.001,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.815773,-0.875957,-0.885334,-0.868056,-0.999799,-0.888984,0.060433,42
6,0.30502,0.016489,0.013251,0.00116,3.0,3.0,False,0.75075,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.839357,-0.87014,-0.901629,-0.876828,-0.967477,-0.891086,0.043046,44
7,0.303408,0.011043,0.013523,0.00045,3.0,3.0,False,1.5005,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.814123,-0.869912,-0.903486,-0.887178,-0.977898,-0.89052,0.053057,43
8,0.325925,0.019873,0.013002,0.000892,3.0,3.0,False,2.25025,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.812801,-0.877768,-0.904137,-0.860258,-0.977342,-0.886461,0.054332,39
9,0.332056,0.014161,0.013204,0.001476,3.0,3.0,False,3.0,"{'dtc__max_depth': 3, 'final_estimator__max_de...",-0.80916,-0.887846,-0.900459,-0.853867,-0.980596,-0.886386,0.05674,38


In [26]:
print("Best score: ",gcv.best_params_)
print("Best Score: ", gcv.best_score_)

Best score:  {'dtc__max_depth': None, 'final_estimator__max_depth': None, 'passthrough': True, 'svm__C': 3.0}
Best Score:  -0.7309328491569549


# Pickle Demo

In [29]:
best_stack = gcv.best_estimator_

In [30]:
import pickle
pkfile = open('stack_gls.pkl','wb')
pickle.dump(best_stack,pkfile)