In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, log_loss, accuracy_score
from sklearn.model_selection import train_test_split




In [2]:
df = pd.read_csv('../Datasets/cases/Sonar/Sonar.csv')
X = df.drop('Class', axis=1)
y = df['Class']

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25, stratify=y)

If we do not specify estimator, default is DecisionTreeClassifier

In [3]:
dtc = DecisionTreeClassifier(random_state=6969)
bagging = BaggingClassifier(random_state=6969 ,estimator=dtc, n_estimators=10, max_samples=100)
bagging.fit(X_train, y_train)
y_pred = bagging.predict(X_test)


In [4]:
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(log_loss(y_test, y_pred))
print(classification_report(y_test,y_pred))

0.5777777777777777
0.6984126984126984
10.870308164971838
              precision    recall  f1-score   support

           0       0.66      0.91      0.77        34
           1       0.81      0.45      0.58        29

    accuracy                           0.70        63
   macro avg       0.74      0.68      0.67        63
weighted avg       0.73      0.70      0.68        63



USING LOGISTIC REGRESSION

In [5]:
from sklearn.linear_model import LogisticRegression


In [6]:
lr = LogisticRegression(C=0.5)
bagging = BaggingClassifier(random_state=25, estimator=lr, n_estimators=10)

bagging.fit(X_train,y_train)
y_pred = bagging.predict(X_test)

In [7]:
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(log_loss(y_test, y_pred))
print(classification_report(y_test,y_pred))

0.7169811320754716
0.7619047619047619
8.581822235504085
              precision    recall  f1-score   support

           0       0.74      0.85      0.79        34
           1       0.79      0.66      0.72        29

    accuracy                           0.76        63
   macro avg       0.77      0.75      0.76        63
weighted avg       0.77      0.76      0.76        63



# Using different estimators

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tqdm import tqdm

lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=3)
svc1 = SVC(kernel='linear', C=0.5)
svc2 = SVC(kernel='rbf', gamma=0.5, C=0.5)

est_list = [lr, knn, dtc, svc1, svc2]
n_est = [10, 15, 25, 50]
scores = []

for e in tqdm(est_list):
    for n in n_est:
        bagg = BaggingClassifier(estimator=e, n_estimators=n, random_state=25)
        bagg.fit(X_train, y_train)
        y_pred_proba = bagg.predict_proba(X_test)
        y_pred = bagg.predict(X_test)

        scores.append([e,n,log_loss(y_test,y_pred_proba)])

scores = pd.DataFrame(scores, columns=['Estimator name', 'Sample size', 'Log loss'])
scores.sort_values('Log loss')

100%|██████████| 5/5 [00:09<00:00,  1.89s/it]


Unnamed: 0,Estimator name,Sample size,Log loss
4,KNeighborsClassifier(n_neighbors=3),10,0.404262
6,KNeighborsClassifier(n_neighbors=3),25,0.413694
7,KNeighborsClassifier(n_neighbors=3),50,0.416579
5,KNeighborsClassifier(n_neighbors=3),15,0.420578
19,"SVC(C=0.5, gamma=0.5)",50,0.448073
18,"SVC(C=0.5, gamma=0.5)",25,0.449508
10,DecisionTreeClassifier(random_state=6969),25,0.47575
8,DecisionTreeClassifier(random_state=6969),10,0.477146
11,DecisionTreeClassifier(random_state=6969),50,0.490503
9,DecisionTreeClassifier(random_state=6969),15,0.50212


#### BOOTSTRAP IS FALSE(no repeition) AND IF WE DO NOT USE MAX SAMPLES, THEN WE EXPECT NO DIFFERENCE IN ONE ESTIMATOR REGARDLESS OF n_estimators.
IF we use max_samples, then there will be differences.

In [9]:
# WITHOUT MAX_SAMPLE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tqdm import tqdm

lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=3)
svc1 = SVC(kernel='linear', C=0.5)
svc2 = SVC(kernel='rbf', gamma=0.5, C=0.5)

est_list = [lr, knn, dtc, svc1, svc2]
n_est = [10, 15, 25, 50]
scores = []

for e in tqdm(est_list):
    for n in n_est:
        bagg = BaggingClassifier(estimator=e, n_estimators=n, random_state=25, bootstrap=False)
        bagg.fit(X_train, y_train)
        y_pred_proba = bagg.predict_proba(X_test)
        y_pred = bagg.predict(X_test)

        scores.append([e,n,log_loss(y_test,y_pred_proba)])

scores = pd.DataFrame(scores, columns=['Estimator name', 'Sample size', 'Log loss'])
scores.sort_values('Log loss')

100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Unnamed: 0,Estimator name,Sample size,Log loss
0,LogisticRegression(),10,0.53601
1,LogisticRegression(),15,0.53601
2,LogisticRegression(),25,0.53601
3,LogisticRegression(),50,0.53601
7,KNeighborsClassifier(n_neighbors=3),50,2.537403
4,KNeighborsClassifier(n_neighbors=3),10,2.537403
5,KNeighborsClassifier(n_neighbors=3),15,2.537403
6,KNeighborsClassifier(n_neighbors=3),25,2.537403
8,DecisionTreeClassifier(random_state=6969),10,6.414374
9,DecisionTreeClassifier(random_state=6969),15,6.43731


In [10]:
# WITH MAX_SAMPLE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tqdm import tqdm

lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=3)
svc1 = SVC(kernel='linear', C=0.5)
svc2 = SVC(kernel='rbf', gamma=0.5, C=0.5)

est_list = [lr, knn, dtc, svc1, svc2]
n_est = [10, 15, 25, 50]
scores = []

for e in tqdm(est_list):
    for n in n_est:
        bagg = BaggingClassifier(estimator=e, n_estimators=n, random_state=25, bootstrap=False, max_samples=100)
        bagg.fit(X_train, y_train)
        y_pred_proba = bagg.predict_proba(X_test)
        y_pred = bagg.predict(X_test)

        scores.append([e,n,log_loss(y_test,y_pred_proba)])

scores = pd.DataFrame(scores, columns=['Estimator name', 'Sample size', 'Log loss'])
scores.sort_values('Log loss')

100%|██████████| 5/5 [00:04<00:00,  1.21it/s]


Unnamed: 0,Estimator name,Sample size,Log loss
7,KNeighborsClassifier(n_neighbors=3),50,0.420693
6,KNeighborsClassifier(n_neighbors=3),25,0.428393
5,KNeighborsClassifier(n_neighbors=3),15,0.433164
4,KNeighborsClassifier(n_neighbors=3),10,0.466018
11,DecisionTreeClassifier(random_state=6969),50,0.475995
3,LogisticRegression(),50,0.537733
2,LogisticRegression(),25,0.539222
1,LogisticRegression(),15,0.545824
10,DecisionTreeClassifier(random_state=6969),25,0.54843
0,LogisticRegression(),10,0.548876


best model with bootstrap true and tuning on the best model (KNN)

In [16]:
import numpy as np

neighbors = np.arange(1,11)
scores = []

for n in tqdm(neighbors):

    bagg = BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=n), 
                            n_estimators=10, random_state=25)
    bagg.fit(X_train, y_train)
    y_pred_proba = bagg.predict_proba(X_test)
    y_pred = bagg.predict(X_test)
    scores.append([n, log_loss(y_test, y_pred_proba)])

df_scores = pd.DataFrame(scores, columns=['neighbors', 'log_loss'])
df_scores.sort_values('log_loss')



100%|██████████| 10/10 [00:01<00:00,  8.22it/s]


Unnamed: 0,neighbors,log_loss
3,4,0.399824
2,3,0.404262
4,5,0.422282
1,2,0.435818
5,6,0.457933
6,7,0.482529
7,8,0.504921
8,9,0.516924
9,10,0.524875
0,1,2.518928


### OOB Score
What Does the OOB Score Represent?

The OOB Score is an estimate of the model's generalization accuracy—that is, how well it is expected to perform on unseen data.

Here's the process:

    For each individual tree in the forest, take its OOB samples (the data it never saw during training) and run them through that specific tree to get predictions.

    Aggregate the Results: For each data point in the training set, collect all the predictions from only the trees for which that data point was OOB.

    Calculate the Score:

        For classification (like RandomForestClassifier in scikit-learn), the OOB Score is the mean accuracy of these aggregated OOB predictions.

        For regression (like RandomForestRegressor), the OOB Score is the R² score (the coefficient of determination) of these aggregated OOB predictions.

In [22]:
bagg = BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=n), 
                            n_estimators=10, random_state=25, oob_score=True)
bagg.fit(X_train, y_train)
y_pred_proba = bagg.predict_proba(X_test)
y_pred = bagg.predict(X_test)
print("OOB score = ", bagg.oob_score_)




OOB score =  0.7103448275862069
