In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv(r'D:\Datasets\glass.csv')
data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [3]:
data.shape

(214, 10)

In [4]:
data.size

2140

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
 9   Type    214 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 16.8 KB


In [6]:
data['Type'].value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [7]:
data['Type'] = np.where(data['Type']<3,0,1)
data['Type'].value_counts()

0    146
1     68
Name: Type, dtype: int64

In [8]:
x = data.drop('Type',axis=1)
y = data['Type']

In [9]:
from imblearn.over_sampling import SMOTE
s = SMOTE()
x, y = s.fit_resample(x,y)
print(x.shape, y.shape)

(292, 9) (292,)


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state=101)

In [11]:
from sklearn.ensemble import BaggingClassifier
bgc = BaggingClassifier()
bgc.fit(x_train, y_train)

BaggingClassifier()

In [12]:
y_pred = bgc.predict(x_test)

In [13]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8493150684931506
[[30  4]
 [ 7 32]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.85        34
           1       0.89      0.82      0.85        39

    accuracy                           0.85        73
   macro avg       0.85      0.85      0.85        73
weighted avg       0.85      0.85      0.85        73



In [14]:
bgc.score(x_train, y_train)

0.9908675799086758

# Manual HyperParameter Tuning

### 1 - base_estimtor (default = None (Decision Tree Classifier))

In [15]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [16]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()

In [17]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [18]:
bgc1 = BaggingClassifier(base_estimator=dtc)
bgc1.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier())

In [19]:
bgc1.score(x_test, y_test)

0.8493150684931506

In [20]:
bgc1.score(x_train, y_train)

0.9954337899543378

#### setting base_estimator as KNeighbors Classifier

In [21]:
bgc_knc = BaggingClassifier(base_estimator=knc)
bgc_knc.fit(x_train ,y_train)

BaggingClassifier(base_estimator=KNeighborsClassifier())

In [22]:
y_pred_knc = bgc_knc.predict(x_test)

In [23]:
print(metrics.accuracy_score(y_test, y_pred_knc))
print(metrics.confusion_matrix(y_test, y_pred_knc))
print(metrics.classification_report(y_test, y_pred_knc))

0.9041095890410958
[[31  3]
 [ 4 35]]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90        34
           1       0.92      0.90      0.91        39

    accuracy                           0.90        73
   macro avg       0.90      0.90      0.90        73
weighted avg       0.90      0.90      0.90        73



In [24]:
bgc_knc.score(x_train, y_train)

0.8995433789954338

#### Setting base_estimator as Logistic Regression 

In [25]:
bgc_logreg = BaggingClassifier(base_estimator=logreg)
bgc_logreg.fit(x_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression())

In [26]:
y_pred_logreg = bgc_logreg.predict(x_test)

In [27]:
print(metrics.accuracy_score(y_test, y_pred_logreg))
print(metrics.confusion_matrix(y_test, y_pred_logreg))
print(metrics.classification_report(y_test, y_pred_logreg))

0.7945205479452054
[[31  3]
 [12 27]]
              precision    recall  f1-score   support

           0       0.72      0.91      0.81        34
           1       0.90      0.69      0.78        39

    accuracy                           0.79        73
   macro avg       0.81      0.80      0.79        73
weighted avg       0.82      0.79      0.79        73



In [28]:
bgc_logreg.score(x_train, y_train)

0.817351598173516

### 2 - n_estimators (default=10)

In [29]:
bgc2 = BaggingClassifier(n_estimators=100)
bgc2.fit(x_train, y_train)

BaggingClassifier(n_estimators=100)

In [30]:
y_pred2 = bgc2.predict(x_test)

In [31]:
print(metrics.accuracy_score(y_test, y_pred2))
print(metrics.confusion_matrix(y_test, y_pred2))
print(metrics.classification_report(y_test, y_pred2))

0.863013698630137
[[31  3]
 [ 7 32]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86        34
           1       0.91      0.82      0.86        39

    accuracy                           0.86        73
   macro avg       0.87      0.87      0.86        73
weighted avg       0.87      0.86      0.86        73



In [32]:
bgc2.score(x_train, y_train)

1.0

### 3 - max_samples (default = 1)

In [33]:
bgc3 = BaggingClassifier(max_samples=0.75)
bgc3.fit(x_train, y_train)

BaggingClassifier(max_samples=0.75)

In [34]:
y_pred3 = bgc3.predict(x_test)

In [35]:
print(metrics.accuracy_score(y_test, y_pred3))
print(metrics.confusion_matrix(y_test, y_pred3))
print(metrics.classification_report(y_test, y_pred3))

0.8493150684931506
[[31  3]
 [ 8 31]]
              precision    recall  f1-score   support

           0       0.79      0.91      0.85        34
           1       0.91      0.79      0.85        39

    accuracy                           0.85        73
   macro avg       0.85      0.85      0.85        73
weighted avg       0.86      0.85      0.85        73



In [36]:
bgc3.score(x_train ,y_train)

0.9771689497716894

### 4 - bootstrap (default=True)

## In case of Bagging, the bootstrap parameter should always be set as True, If set to false then it classifier acts as Pasting

In [37]:
bgc4 = BaggingClassifier(bootstrap=True)
bgc4.fit(x_train, y_train)

BaggingClassifier()

In [38]:
y_pred4 = bgc4.predict(x_test)

In [39]:
print(metrics.accuracy_score(y_test, y_pred4))
print(metrics.confusion_matrix(y_test, y_pred4))
print(metrics.classification_report(y_test, y_pred4))

0.8493150684931506
[[30  4]
 [ 7 32]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.85        34
           1       0.89      0.82      0.85        39

    accuracy                           0.85        73
   macro avg       0.85      0.85      0.85        73
weighted avg       0.85      0.85      0.85        73



In [40]:
bgc4.score(x_train, y_train)

0.9954337899543378

### 5 - oob_score (default = False ) -- optional

In [41]:
bgc5 = BaggingClassifier(oob_score=True)
bgc5.fit(x_train, y_train)

BaggingClassifier(oob_score=True)

In [42]:
y_pred5 = bgc5.predict(x_test)

In [43]:
print(metrics.accuracy_score(y_test, y_pred5))
print(metrics.confusion_matrix(y_test, y_pred5))
print(metrics.classification_report(y_test, y_pred5))

0.821917808219178
[[28  6]
 [ 7 32]]
              precision    recall  f1-score   support

           0       0.80      0.82      0.81        34
           1       0.84      0.82      0.83        39

    accuracy                           0.82        73
   macro avg       0.82      0.82      0.82        73
weighted avg       0.82      0.82      0.82        73



In [44]:
bgc5.score(x_train, y_train)

0.9908675799086758

#### Let's check oob_score

OOB means 'Out-Of-Box'. It is referred to as oob samples. It means the samples/ (or just for simplicity) rows that are not considered while training the model. These are the left out samples.

In [45]:
bgc5.oob_score_

0.8447488584474886

# HyperParameter Tuning using Randomized Search CV

In [46]:
from sklearn.model_selection import RandomizedSearchCV

In [47]:
params = {
    'base_estimator' : [dtc, knc, logreg],
    'n_estimators'   : [int(i) for i in range(10,1000,100)],
    'max_samples'    : [float(i) for i in np.linspace(0.4,1.0,50)],
    'bootstrap'      : [True],
    'oob_score'      : [True] 
}

In [48]:
bgc6 = BaggingClassifier()
random_cv_bgcclf = RandomizedSearchCV(estimator=bgc6, param_distributions=params, n_iter=50,cv=5, n_jobs=7, verbose=True)
random_cv_bgcclf.fit(x_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   24.7s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:  1.6min
[Parallel(n_jobs=7)]: Done 250 out of 250 | elapsed:  2.4min finished


RandomizedSearchCV(cv=5, estimator=BaggingClassifier(), n_iter=50, n_jobs=7,
                   param_distributions={'base_estimator': [DecisionTreeClassifier(),
                                                           KNeighborsClassifier(),
                                                           LogisticRegression()],
                                        'bootstrap': [True],
                                        'max_samples': [0.4, 0.4122448979591837,
                                                        0.4244897959183674,
                                                        0.43673469387755104,
                                                        0.4489795918367347,
                                                        0.4612244897959184,
                                                        0.47346938775510206,
                                                        0.485714285714...
                                                        0.5959183673469388,

In [49]:
best_params = random_cv_bgcclf.best_estimator_
best_params

BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  max_samples=0.889795918367347, n_estimators=210,
                  oob_score=True)

In [50]:
bgc_final = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  max_samples=0.889795918367347, n_estimators=210,
                  oob_score=True)
bgc_final.fit(x_train ,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  max_samples=0.889795918367347, n_estimators=210,
                  oob_score=True)

In [51]:
y_pred_final = bgc_final.predict(x_test)

In [52]:
print(metrics.accuracy_score(y_test, y_pred_final))
print(metrics.confusion_matrix(y_test, y_pred_final))
print(metrics.classification_report(y_test, y_pred_final))

0.8493150684931506
[[32  2]
 [ 9 30]]
              precision    recall  f1-score   support

           0       0.78      0.94      0.85        34
           1       0.94      0.77      0.85        39

    accuracy                           0.85        73
   macro avg       0.86      0.86      0.85        73
weighted avg       0.86      0.85      0.85        73



In [53]:
bgc_final.score(x_train ,y_train)

1.0

In [54]:
bgc_final.oob_score_

0.8858447488584474

# Yeah, I know it's an overfitting model, but our agenda was to know more about important hyperparameters. I am damn sure all our friends got the point!