In [22]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import f1_score, classification_report, log_loss
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [23]:
df = pd.read_csv('../Datasets/cases/Sonar/Sonar.csv')
X = df.drop('Class', axis=1)
y = df['Class']
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25, stratify=y)

dtc = DecisionTreeClassifier(random_state=23, max_depth=1)
nb = GaussianNB()
lr = LogisticRegression()


## With max depth one, using only one estimator

In [25]:

m_dept = [i for i in range(1,20)]
n_est = [10,15,25,50,75,100]
scores = []

for n in tqdm(n_est):
    for m in m_dept:
        dtc = DecisionTreeClassifier(random_state=23, max_depth=1)
        ada = AdaBoostClassifier(random_state=23, estimator=dtc, n_estimators=n)
        ada.fit(X_train, y_train)
        y_pred_proba = ada.predict_proba(X_test)
        scores.append([n, 1, log_loss(y_test, y_pred_proba)])


scores = pd.DataFrame(scores, columns=['Estimators', 'Max Depth', 'Log loss'])
scores.sort_values('Log loss', ascending=True)


100%|██████████| 6/6 [00:18<00:00,  3.10s/it]


Unnamed: 0,Estimators,Max Depth,Log loss
0,10,1,0.539927
1,10,1,0.539927
2,10,1,0.539927
3,10,1,0.539927
4,10,1,0.539927
...,...,...,...
109,100,1,0.580572
110,100,1,0.580572
111,100,1,0.580572
112,100,1,0.580572


## Using multiple estimators

In [26]:

m_dept = [i for i in range(1,20)]
n_est = [10,15,25,50,75,100,1000]
ests = [dtc,nb,lr]
scores = []

for n in tqdm(n_est):
    for est in ests:
        ada = AdaBoostClassifier(random_state=23, estimator=est, n_estimators=n)
        ada.fit(X_train, y_train)
        y_pred_proba = ada.predict_proba(X_test)
        scores.append([n, est, log_loss(y_test, y_pred_proba)])


scores = pd.DataFrame(scores, columns=['Estimators', 'Estimators', 'Log loss'])
scores.sort_values('Log loss', ascending=True)


100%|██████████| 7/7 [00:06<00:00,  1.05it/s]


Unnamed: 0,Estimators,Estimators.1,Log loss
10,50,GaussianNB(),0.497604
13,75,GaussianNB(),0.501961
16,100,GaussianNB(),0.506158
7,25,GaussianNB(),0.506931
19,1000,GaussianNB(),0.516193
1,10,GaussianNB(),0.52072
11,50,LogisticRegression(),0.537005
14,75,LogisticRegression(),0.537005
20,1000,LogisticRegression(),0.537005
17,100,LogisticRegression(),0.537005
