#### voting

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier            # import for voting ensembling
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import f1_score, classification_report, log_loss
from sklearn.model_selection import train_test_split


In [3]:
df = pd.read_csv('../Datasets/cases/Sonar/Sonar.csv')
X = df.drop('Class', axis=1)
y = df['Class']
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25, stratify=y)

On training data: <br>
DecisionTreeClassifier.fit() <br>
KNeighborsClassifier.fit() <br>
GaussianNB.fit() <br>

but we will not fit 3 times, we will use VotingClassifier and pass the above 3 as estimators (it's like  a pipeline)


In [4]:
dtc1 = DecisionTreeClassifier(random_state=25, max_depth=10)
dtc2 = DecisionTreeClassifier(random_state=250)
knn = KNeighborsClassifier()
nb = GaussianNB()
voting = VotingClassifier(estimators=[
    ('Tree1', dtc1),
    ('Tree2', dtc2),
    ('KNN', knn),
    ('GaussianNB', nb)
]).set_output(transform='pandas')

In [5]:
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.88      0.75        34
           1       0.76      0.45      0.57        29

    accuracy                           0.68        63
   macro avg       0.71      0.67      0.66        63
weighted avg       0.70      0.68      0.66        63



Evaluating the individual estimators

In [6]:
voting.estimators_[0].predict(X_test)

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1])

In [7]:
estimators_df = pd.DataFrame({
    'Tree1': voting.estimators_[0].predict(X_test),
    'Tree2': voting.estimators_[1].predict(X_test),
    'KNN' : voting.estimators_[2].predict(X_test),
    'GaussianNB' : voting.estimators_[3].predict(X_test),
    'final_output':y_pred
})
estimators_df

Unnamed: 0,Tree1,Tree2,KNN,GaussianNB,final_output
0,1,1,1,0,1
1,0,0,0,0,0
2,0,0,1,1,0
3,1,1,1,0,1
4,0,0,1,1,0
...,...,...,...,...,...
58,1,1,1,1,1
59,1,1,1,1,1
60,0,0,1,0,0
61,0,0,0,0,0


In [8]:
from sklearn.metrics import accuracy_score

for i in range(len(voting.estimators_)):
    print('Estimator: ', voting.estimators_[i])
    print('Accuracy Score = ', accuracy_score(y_test, voting.estimators_[i].predict(X_test)))

Estimator:  DecisionTreeClassifier(max_depth=10, random_state=25)
Accuracy Score =  0.6984126984126984
Estimator:  DecisionTreeClassifier(random_state=250)
Accuracy Score =  0.7142857142857143
Estimator:  KNeighborsClassifier()
Accuracy Score =  0.746031746031746
Estimator:  GaussianNB()
Accuracy Score =  0.6349206349206349


#### What we did above is called (bohot) hard voting - used y_pred directly.

#### Now we will do soft voting - used y_pred_proba

# Soft Voting and weights

In [9]:
dt1 = DecisionTreeClassifier(random_state=25)
dt2 = DecisionTreeClassifier(random_state=250)
knn1 = KNeighborsClassifier(n_neighbors=3)
knn2 = KNeighborsClassifier()
gnb = GaussianNB()

# SOFT VOTING
voting = VotingClassifier(estimators=[
    ('tree1', dt1),
    ('tree2',dt2),
    ('knn1', knn1),
    ('knn2', knn2),
    ('gnb', gnb)
], voting='soft', weights=[4,4,9,8,4])

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.97      0.82        34
           1       0.94      0.55      0.70        29

    accuracy                           0.78        63
   macro avg       0.83      0.76      0.76        63
weighted avg       0.82      0.78      0.77        63



In [10]:
voting_df = pd.DataFrame({
    'tree1': voting.estimators_[0].predict(X_test),
    'tree2': voting.estimators_[1].predict(X_test),
    'knn1': voting.estimators_[2].predict(X_test),
    'knn2': voting.estimators_[3].predict(X_test),
    'gnb': voting.estimators_[4].predict(X_test),
    "Final": voting.predict(X_test)
})

voting_df

Unnamed: 0,tree1,tree2,knn1,knn2,gnb,Final
0,1,1,1,1,0,1
1,0,0,0,0,0,0
2,0,0,1,1,1,1
3,1,1,1,1,0,1
4,0,0,0,1,1,0
...,...,...,...,...,...,...
58,1,1,1,1,1,1
59,1,1,1,1,1,1
60,0,0,1,1,0,0
61,0,0,1,0,0,0


In [11]:

for i in range(len(voting.estimators_)):
    print('Estimator: ', voting.estimators_[i])
    print('Accuracy Score = ', accuracy_score(y_test, voting.estimators_[i].predict(X_test)))

Estimator:  DecisionTreeClassifier(random_state=25)
Accuracy Score =  0.6984126984126984
Estimator:  DecisionTreeClassifier(random_state=250)
Accuracy Score =  0.7142857142857143
Estimator:  KNeighborsClassifier(n_neighbors=3)
Accuracy Score =  0.8095238095238095
Estimator:  KNeighborsClassifier()


Accuracy Score =  0.746031746031746
Estimator:  GaussianNB()
Accuracy Score =  0.6349206349206349


### on HR dataset

In [39]:
df = pd.read_csv('../Datasets/cases/Human_Resources_Analytics/HR_comma_sep.csv')
y = df['left']
X = df.drop('left', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25, stratify=y)

# encoding on categorical and scaling on numerical

In [53]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

trf_scaling = make_column_transformer((StandardScaler(), make_column_selector(dtype_exclude=object)), 
                                  remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

trf_encoding = make_column_transformer((OneHotEncoder(sparse_output=False, drop='first'), make_column_selector(dtype_include=object)), 
                                       remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')


dt1 = DecisionTreeClassifier(random_state=25)
dt2 = DecisionTreeClassifier(random_state=250)
knn1 = KNeighborsClassifier(n_neighbors=3)
knn2 = KNeighborsClassifier(n_neighbors=8)
gnb = GaussianNB()

voting = VotingClassifier(estimators=[
    ('Tree1', dt1),
    ('Tree2', dt2),
    ('KNN1', knn1),
    ('KNN2', knn2),
    ('GaussianNB', gnb)
])

pipe = Pipeline(steps=[
    ('scaling', trf_scaling),
    ('encoding', trf_encoding),
    ('model_ensembling', voting)
])


In [54]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3429
           1       0.92      0.96      0.94      1070

    accuracy                           0.97      4499
   macro avg       0.95      0.97      0.96      4499
weighted avg       0.97      0.97      0.97      4499



#### evaluating individual model

In [55]:
trf_encoding.fit(X_train)
X_test_trf = trf_encoding.transform(X_test)

for i in range(len(voting.estimators_)):
    print('Estimator: ', voting.estimators_[i])
    print('Accuracy Score = ', accuracy_score(y_test, voting.estimators_[i].predict(X_test_trf)))

Estimator:  DecisionTreeClassifier(random_state=25)
Accuracy Score =  0.5672371638141809
Estimator:  DecisionTreeClassifier(random_state=250)
Accuracy Score =  0.6450322293843076
Estimator:  KNeighborsClassifier(n_neighbors=3)
Accuracy Score =  0.23783062902867305
Estimator:  KNeighborsClassifier(n_neighbors=8)
Accuracy Score =  0.23783062902867305
Estimator:  GaussianNB()
Accuracy Score =  0.23783062902867305


# SOft and WEights

In [43]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer

In [None]:
std_trf = make_column_transformer((StandardScaler(), make_column_selector(dtype_exclude=object)), 
                                  remainder='passthrough', 
                                  verbose_feature_names_out=False).set_output(transform='pandas')

enc_trf = make_column_transformer((OneHotEncoder(sparse_output=False, drop='first'), 
                                   make_column_selector(dtype_include=object)), 
                                   remainder='passthrough',
                                   verbose_feature_names_out=False).set_output(transform='pandas')

dt1 = DecisionTreeClassifier(random_state=25)
dt2 = DecisionTreeClassifier(random_state=25, max_depth=3)
knn1 = KNeighborsClassifier(n_neighbors=3)
knn2 = KNeighborsClassifier(n_neighbors=5)
gnb = GaussianNB()

voting = VotingClassifier([
    ('t1', dt1),
    ('t2', dt2),
    ('knn1', knn1),
    ('knn2', knn2),
    ('gnb', gnb)
], 
weights=[9,9,9,9,7],
voting='soft'
)

pipe = Pipeline([
#    ('Scaling', None),
    ('Encode', enc_trf),
    ('Ensemble voting', voting)
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))
print(log_loss(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.88      0.77        34
           1       0.79      0.52      0.62        29

    accuracy                           0.71        63
   macro avg       0.74      0.70      0.70        63
weighted avg       0.73      0.71      0.70        63

10.2981866826049


In [35]:
std_trf = make_column_transformer((StandardScaler(), make_column_selector(dtype_exclude=object)), 
                                  remainder='passthrough', 
                                  verbose_feature_names_out=False).set_output(transform='pandas')

enc_trf = make_column_transformer((OneHotEncoder(sparse_output=False, drop='first'), 
                                   make_column_selector(dtype_include=object)), 
                                   remainder='passthrough',
                                   verbose_feature_names_out=False).set_output(transform='pandas')

dt1 = DecisionTreeClassifier(random_state=25)
dt2 = DecisionTreeClassifier(random_state=25, max_depth=3)
knn1 = KNeighborsClassifier(n_neighbors=3)
knn2 = KNeighborsClassifier(n_neighbors=5)
gnb = GaussianNB()

voting = VotingClassifier([
    ('t1', dt1),
    ('t2', dt2),
    ('knn1', knn1),
    ('knn2', knn2),
    ('gnb', gnb)
], 
weights=[9,9,9,9,7],
voting='soft'
)

# pipe = Pipeline([
#     ('Scaling', std_trf),
#     ('Encode', enc_trf),
#     ('Ensemble voting', voting)
# ])

# pipe.fit(X_train, y_train)
# y_pred = pipe.predict(X_test)
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)

print(classification_report(y_test, y_pred))
print(log_loss(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.88      0.77        34
           1       0.79      0.52      0.62        29

    accuracy                           0.71        63
   macro avg       0.74      0.70      0.70        63
weighted avg       0.73      0.71      0.70        63

10.2981866826049
