In [None]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LogisticRegression, ElasticNet, LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, VotingRegressor, BaggingClassifier, BaggingRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor, StackingClassifier


from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [25]:
# Load the breast-cancer data; the first CSV column becomes the row index.
df = pd.read_csv("Breastcancer.csv", index_col=0)
df.head(n=3)

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant


In [27]:
# Target is the "Class" column; every other column is a feature.
y = df["Class"]
x = df.drop(columns="Class")



In [29]:
# 70/30 hold-out split, stratified on the target so both parts keep the
# same class proportions; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)


In [31]:
# Level-0 (base) learners.
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=24)
# probability=True makes SVC expose predict_proba.
svm = SVC(probability=True, random_state=24)

# Level-1 (meta) learner that combines the base learners' predictions.
lr = LogisticRegression(random_state=24)

stack = StackingClassifier(
    estimators=[
        ("KNN", knn),
        ("NB", nb),
        ("TREE", dtc),
        ("SVM", svm),
    ],
    final_estimator=lr,
)



In [33]:
# Fit on the training split (fit returns the estimator itself), then score
# hard class predictions on the held-out split.
y_pred = stack.fit(x_train, y_train).predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9714285714285714


In [35]:
# Column 1 of predict_proba is the probability of stack.classes_[1]
# (presumably "Malignant", since classes_ is sorted -- verify if labels change).
y_pred_prob = stack.predict_proba(x_test)[:, 1]
print(roc_auc_score(y_true=y_test, y_score=y_pred_prob))

0.990841384863124


#### Same stacking model, now with the `passthrough=True` option

In [38]:
# Level-0 (base) learners.
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=24)
# probability=True makes SVC expose predict_proba.
svm = SVC(probability=True, random_state=24)

# Level-1 (meta) learner.
lr = LogisticRegression(random_state=24)

# passthrough=True: the final estimator is trained on the original features
# in addition to the base estimators' predictions.
stack = StackingClassifier(
    estimators=[
        ("KNN", knn),
        ("NB", nb),
        ("TREE", dtc),
        ("SVM", svm),
    ],
    final_estimator=lr,
    passthrough=True,
)


In [40]:
# Fit the passthrough stack and score hard predictions on the held-out split.
y_pred = stack.fit(x_train, y_train).predict(x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9714285714285714


In [42]:
# Probability of stack.classes_[1] for each test row, scored with ROC AUC.
y_pred_prob = stack.predict_proba(x_test)[:, 1]
print(roc_auc_score(y_true=y_test, y_score=y_pred_prob))

0.9959742351046699


# Glass Identification Dataset

In [47]:
# Load the glass data with the default integer row index.
df = pd.read_csv("Glass.csv")
df.head(n=3)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed


In [49]:
# Target is the "Type" column; every other column is a feature.
y = df["Type"]
x = df.drop(columns="Type")

In [51]:
# 70/30 hold-out split, stratified on the glass type, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)


In [255]:


# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------

# One-hot encoder for categorical/object columns; emits a dense pandas frame
# and drops the first level of each encoded column.
ohe = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
).set_output(transform="pandas")

# Non-categorical columns pass through untouched; categorical/object columns
# go through the encoder. Original column names are kept (no step prefix).
trans_ohe = make_column_transformer(
    ("passthrough", make_column_selector(dtype_exclude=["category", object])),
    (ohe, make_column_selector(dtype_include=["category", object])),
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Scalers. Only scl_mm is wired into the pipeline below; scl_std is defined
# as a drop-in alternative.
scl_std = StandardScaler().set_output(transform="pandas")
scl_mm = MinMaxScaler().set_output(transform="pandas")

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------

# Level-0 (base) learners.
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state=24)
svm = SVC(probability=True, random_state=24)

# Level-1 (meta) learner: a 400-tree random forest.
rmf = RandomForestClassifier(n_estimators=400, random_state=24)

# passthrough=True: the meta learner sees the (preprocessed) input features
# alongside the base learners' predictions.
stack = StackingClassifier(
    estimators=[
        ("KNN", knn),
        ("NB", nb),
        ("TREE", dtc),
        ("SVM", svm),
    ],
    final_estimator=rmf,
    passthrough=True,
)

# encode -> min-max scale -> stacked classifier
pipe = Pipeline(
    steps=[
        ("OHE", trans_ohe),
        ("SCL", scl_mm),
        ("STK", stack),
    ]
)

# ---------------------------------------------------------------------------
# Grid search
# ---------------------------------------------------------------------------

# The grid is intentionally empty, so the search evaluates exactly one
# candidate (the pipeline defaults). Candidate entries are kept for reference:
#   "STK__final_estimator__max_depth": [3, 4, 5]
#   "STK__SVM__C": np.linspace(0.001, 3, 5)
#   "STK__TREE__max_depth": [None, 2, 4]
#   "STK__passthrough": [True, False]
params = {}

# Stratified, shuffled 5-fold splitter used by the search below.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Plain (non-stratified) 5-fold splitter; defined but not used by gcv.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)

# Scored by negative log loss (closer to 0 is better); verbose=3 prints
# the score of every fold.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="neg_log_loss",
    cv=kfolds,
    verbose=3,
)

# The search is fitted in the following cell, on the training split only.

In [257]:
# Run the cross-validated search on the training split only, so the held-out
# split stays unseen until final evaluation.
gcv.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END .................................., score=-0.853 total time=   0.7s
[CV 2/5] END .................................., score=-0.688 total time=   0.7s
[CV 3/5] END .................................., score=-0.745 total time=   0.8s
[CV 4/5] END .................................., score=-0.701 total time=   0.7s
[CV 5/5] END .................................., score=-0.746 total time=   0.7s


In [259]:
# Mean cross-validated score (negative log loss) of the best candidate,
# followed by its parameter setting.
print(gcv.best_score_, gcv.best_params_, sep="\n")

-0.7464631025606245
{}


In [261]:
# NOTE: despite its name, y_pred holds per-class probabilities (one column per
# class), not hard labels -- this is what log_loss needs.
y_pred = gcv.predict_proba(X=x_test)

In [263]:
# Log loss on the held-out split. Passing labels=gcv.classes_ ties the
# probability columns to the model's own class order; without it log_loss
# infers the label set from y_test and raises if any class is missing there.
print(log_loss(y_test, y_pred, labels=gcv.classes_))

0.732151197947801


In [265]:
# Keep a handle on the best pipeline found by the search. GridSearchCV refits
# it on the full data passed to gcv.fit (default refit=True).
best_stack = gcv.best_estimator_

In [269]:
import pickle

# Persist the fitted pipeline to disk. The context manager guarantees the
# file is flushed and closed even if pickling fails; the original left the
# handle open, risking a truncated file.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
with open("stack_gls.pkl", "wb") as pkfile:
    pickle.dump(best_stack, pkfile)