In [1]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LogisticRegression, ElasticNet, LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, VotingRegressor, BaggingClassifier

import warnings
warnings.filterwarnings('ignore')

In [41]:
glass.head(3)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed


In [5]:
glass = pd.read_csv("Glass.csv")
x = glass.drop('Type', axis = 1)
y = glass['Type']
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    random_state = 24, 
                                                    test_size = 0.3,
                                                   stratify = y,
                                                   )

In [7]:
lr = LogisticRegression(random_state = 24)
bagg = BaggingClassifier(estimator = lr, random_state = 24)
bagg.fit(x_train, y_train)
y_pred = bagg.predict(x_test)
accuracy_score(y_test, y_pred)

0.6461538461538462

In [9]:
svm = SVC(probability = True, random_state=24)
bagg = BaggingClassifier(estimator = svm, random_state = 24)
bagg.fit(x_train, y_train)
y_pred = bagg.predict(x_test)
accuracy_score(y_test, y_pred)

0.6153846153846154

In [11]:
lr = LogisticRegression(random_state = 24)
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
accuracy_score(y_test, y_pred)

0.6461538461538462

In [13]:
svm = SVC(probability = True, random_state=24)
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)
accuracy_score(y_test, y_pred)


0.35384615384615387

In [15]:
dtc = DecisionTreeClassifier(random_state = 24)
bagg = BaggingClassifier(estimator = dtc, random_state = 24)
bagg.fit(x_train, y_train)
y_pred = bagg.predict(x_test)
accuracy_score(y_test, y_pred)


0.6615384615384615

In [17]:
dtc = DecisionTreeClassifier(random_state = 24)
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)
accuracy_score(y_test, y_pred)

0.676923076923077

In [43]:
# ______________________________________________________________________________________
# Pipeline

# one hot encoder
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=['category', object])),
    (ohe, make_column_selector(dtype_include=['category', object])),
    verbose_feature_names_out=False
).set_output(transform='pandas')


# scaler
scl_std = StandardScaler().set_output(transform = "pandas")
scl_mm = MinMaxScaler().set_output(transform = "pandas")


# Model

dtc = DecisionTreeClassifier()
lr = LogisticRegression()
svm = SVC()

# vor = VotingRegressor([("DTC", dtc), ("LR", lr), ("SVM", svm)])

bagg = BaggingClassifier(random_state = 24)

pipe = Pipeline([("OHE", trans_ohe), ("SCL", scl_mm), ("BAG", bagg)])


# _____________________________________________________________________________________
# GCV

params = {
    "BAG__estimator" : [svm, lr, dtc],
    "BAG__n_estimators" : [10, 50, 100]
}

kfolds = StratifiedKFold(n_splits = 5,
                        random_state = 24,
                        shuffle = True)

kfold = KFold(n_splits = 5,
            random_state = 24,
            shuffle = True)

gcv = GridSearchCV(pipe,
                  param_grid = params,
                  scoring = "neg_log_loss",
                  cv = kfolds,
                  verbose = 3)


# gcv.fit(x, y)

In [45]:
gcv.fit(x_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END BAG__estimator=SVC(), BAG__n_estimators=10;, score=-6.401 total time=   0.0s
[CV 2/5] END BAG__estimator=SVC(), BAG__n_estimators=10;, score=-3.911 total time=   0.0s
[CV 3/5] END BAG__estimator=SVC(), BAG__n_estimators=10;, score=-6.494 total time=   0.0s
[CV 4/5] END BAG__estimator=SVC(), BAG__n_estimators=10;, score=-7.625 total time=   0.0s
[CV 5/5] END BAG__estimator=SVC(), BAG__n_estimators=10;, score=-7.830 total time=   0.0s
[CV 1/5] END BAG__estimator=SVC(), BAG__n_estimators=50;, score=-4.270 total time=   0.1s
[CV 2/5] END BAG__estimator=SVC(), BAG__n_estimators=50;, score=-3.956 total time=   0.1s
[CV 3/5] END BAG__estimator=SVC(), BAG__n_estimators=50;, score=-6.458 total time=   0.1s
[CV 4/5] END BAG__estimator=SVC(), BAG__n_estimators=50;, score=-5.411 total time=   0.1s
[CV 5/5] END BAG__estimator=SVC(), BAG__n_estimators=50;, score=-4.529 total time=   0.1s
[CV 1/5] END BAG__estimator=SVC(), BAG__

In [47]:
gcv.best_params_

{'BAG__estimator': DecisionTreeClassifier(), 'BAG__n_estimators': 100}

In [49]:
gcv.best_score_

-0.7425134701047924

In [77]:
# Trying different parameters of individual models
kfold = StratifiedKFold(n_splits = 5, 
                        shuffle=True, 
                        random_state=24)



dtc = DecisionTreeClassifier()

bagg = BaggingClassifier(estimator = dtc, 
                         random_state = 24)

params = {
    'n_estimators' : [10, 50, 75],
         'estimator__max_depth' : [None, 3],
         'estimator__min_samples_split' : [2, 10],
         'estimator__min_samples_leaf' : [1, 10]}

gcv = GridSearchCV(bagg, 
                   param_grid=params, 
                   cv=kfold, 
                   scoring='neg_log_loss', 
                  verbose = 3)



In [79]:
gcv.fit(x_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END estimator__max_depth=None, estimator__min_samples_leaf=1, estimator__min_samples_split=2, n_estimators=10;, score=-2.933 total time=   0.0s
[CV 2/5] END estimator__max_depth=None, estimator__min_samples_leaf=1, estimator__min_samples_split=2, n_estimators=10;, score=-0.535 total time=   0.0s
[CV 3/5] END estimator__max_depth=None, estimator__min_samples_leaf=1, estimator__min_samples_split=2, n_estimators=10;, score=-1.888 total time=   0.0s
[CV 4/5] END estimator__max_depth=None, estimator__min_samples_leaf=1, estimator__min_samples_split=2, n_estimators=10;, score=-0.743 total time=   0.0s
[CV 5/5] END estimator__max_depth=None, estimator__min_samples_leaf=1, estimator__min_samples_split=2, n_estimators=10;, score=-1.896 total time=   0.0s
[CV 1/5] END estimator__max_depth=None, estimator__min_samples_leaf=1, estimator__min_samples_split=2, n_estimators=50;, score=-2.914 total time=   0.1s
[CV 2/5] END estimat

In [81]:
gcv.best_score_

-0.7312731358992163

In [83]:
gcv.best_params_

{'estimator__max_depth': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'n_estimators': 75}

In [103]:
# Trying different parameters of individual models
kfold = StratifiedKFold(n_splits = 5, 
                        shuffle=True, 
                        random_state=24)



svm = SVC(probability = True)

bagg = BaggingClassifier(estimator = svm, 
                         random_state = 24)

params = {
    'n_estimators' : [10, 50, 100],
         'estimator__C' : np.linspace(0.001, 3, 5)}

gcv = GridSearchCV(bagg, 
                   param_grid=params, 
                   cv=kfold, 
                   scoring='neg_log_loss', 
                  verbose = 3)



In [105]:
gcv.fit(x_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END estimator__C=0.001, n_estimators=10;, score=-1.111 total time=   0.0s
[CV 2/5] END estimator__C=0.001, n_estimators=10;, score=-1.111 total time=   0.0s
[CV 3/5] END estimator__C=0.001, n_estimators=10;, score=-1.134 total time=   0.0s
[CV 4/5] END estimator__C=0.001, n_estimators=10;, score=-1.217 total time=   0.0s
[CV 5/5] END estimator__C=0.001, n_estimators=10;, score=-1.197 total time=   0.0s
[CV 1/5] END .estimator__C=0.001, n_estimators=50;, score=nan total time=   0.2s
[CV 2/5] END estimator__C=0.001, n_estimators=50;, score=-1.130 total time=   0.2s
[CV 3/5] END estimator__C=0.001, n_estimators=50;, score=-1.139 total time=   0.2s
[CV 4/5] END estimator__C=0.001, n_estimators=50;, score=-1.226 total time=   0.2s
[CV 5/5] END .estimator__C=0.001, n_estimators=50;, score=nan total time=   0.2s
[CV 1/5] END estimator__C=0.001, n_estimators=100;, score=nan total time=   0.5s
[CV 2/5] END estimator__C=0.001,

In [107]:
gcv.best_score_

-1.087130220471035

In [109]:
gcv.best_params_

{'estimator__C': 1.5005, 'n_estimators': 10}