# **ENSEMBLE METHOD**

In [39]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score, classification_report

# Voting

### Load Dataset

In [40]:
df_wine = pd.read_csv('white_wine.csv')
df_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
515,6.1,0.31,0.26,2.2,0.051,28.0,167.0,0.9926,3.37,0.47,10.4,6.0
516,6.8,0.18,0.37,1.6,0.055,47.0,154.0,0.9934,3.08,0.45,9.1,5.0
517,7.4,0.15,0.42,1.7,0.045,49.0,154.0,0.9920,3.00,0.60,10.4,6.0
518,5.9,0.13,0.28,1.9,0.050,20.0,78.0,0.9918,3.43,0.64,10.8,6.0


In [41]:
# impute missing value
df_wine['alcohol'].fillna(df_wine['alcohol'].median(), inplace=True)

In [42]:
# rename target
df_wine['label'] = np.where(df_wine['quality']>6, 1, 0)

### Data Splitting

In [43]:
# define x y
X = df_wine[['alcohol','density']]
y = df_wine['label']

In [44]:
# data splitting
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, random_state=10, test_size=0.2)

### Modeling (Base Learner)

In [45]:
# base learner
logreg = LogisticRegression(random_state=0)
knn = KNeighborsClassifier(n_neighbors=3)
tree = DecisionTreeClassifier(max_depth=5, random_state=0)

In [46]:
list_model = [logreg, knn, tree]

for i in list_model:

    # fitting
    model = i
    model.fit(X_train, y_train)

    # predict
    y_pred = model.predict(X_test)
    print(i)
    print(f1_score(y_test, y_pred))

LogisticRegression(random_state=0)
0.45161290322580644
KNeighborsClassifier(n_neighbors=3)
0.6111111111111112
DecisionTreeClassifier(max_depth=5, random_state=0)
0.8648648648648648


### Voting Classifier

In [51]:
# base learner
logreg = LogisticRegression(random_state=0)
knn = KNeighborsClassifier(n_neighbors=3)
tree = DecisionTreeClassifier(max_depth=5, random_state=0)

In [52]:
# Voting Classifier
voting_clf = VotingClassifier([
    ('clf1', logreg),
    ('clf2', knn),
    ('clf3', tree)
])

# fitting
voting_clf.fit(X_train, y_train)

# predict
y_pred = voting_clf.predict(X_test)

In [53]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92        84
           1       0.71      0.50      0.59        20

    accuracy                           0.87       104
   macro avg       0.80      0.73      0.75       104
weighted avg       0.86      0.87      0.86       104



### Improve model

In [55]:
# logreg
logreg = LogisticRegression(random_state=0)
poly = PolynomialFeatures(degree=3)
logreg_pipe = Pipeline([
    ('poly', poly),
    ('model', logreg)
])

# knn
knn = KNeighborsClassifier(n_neighbors=3)
scaler = StandardScaler()
knn_pipe = Pipeline([
    ('scaler', scaler),
    ('model', knn)
])

# dec tree
tree = DecisionTreeClassifier(max_depth=5, random_state=0)

In [58]:
list_model = [logreg_pipe, knn_pipe, tree]

for i in list_model:

    # fitting
    model = i
    model.fit(X_train, y_train)

    # predict
    y_pred = model.predict(X_test)
    print(i)
    print(f1_score(y_test, y_pred))

Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                ('model', LogisticRegression(random_state=0))])
0.8571428571428571
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', KNeighborsClassifier(n_neighbors=3))])
0.9268292682926829
DecisionTreeClassifier(max_depth=5, random_state=0)
0.8648648648648648


In [56]:
# Voting Classifier
voting_clf = VotingClassifier([
    ('clf1', logreg_pipe),
    ('clf2', knn_pipe),
    ('clf3', tree)
])

# fitting
voting_clf.fit(X_train, y_train)

# predict
y_pred = voting_clf.predict(X_test)

In [57]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97        84
           1       0.94      0.80      0.86        20

    accuracy                           0.95       104
   macro avg       0.95      0.89      0.92       104
weighted avg       0.95      0.95      0.95       104



## Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

In [60]:
# base learner
logreg = LogisticRegression(random_state=0)
knn = KNeighborsClassifier(n_neighbors=3)
tree = DecisionTreeClassifier(max_depth=5, random_state=0)
rf = RandomForestClassifier(random_state=0)

In [62]:
list_model = [logreg_pipe, knn_pipe, tree, rf]

for i in list_model:

    # fitting
    model = i
    model.fit(X_train, y_train)

    # predict
    y_pred = model.predict(X_test)
    print(i)
    print(f1_score(y_test, y_pred))

Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                ('model', LogisticRegression(random_state=0))])
0.8571428571428571
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', KNeighborsClassifier(n_neighbors=3))])
0.9268292682926829
DecisionTreeClassifier(max_depth=5, random_state=0)
0.8648648648648648
RandomForestClassifier(random_state=0)
0.9268292682926829


Dari 4 kandidat model, RandomForest adalah model terbaik saat diprediksi ke Test set

#### Cross Validation (mencari model terbaik)

In [65]:
list_model = [logreg_pipe, knn_pipe, tree, rf]

f1_mean = []
f1_std = []

for i in list_model:

    model_cv = cross_val_score(
        estimator=i,
        X=X_train,
        y=y_train,
        scoring='f1',
        cv=5
    )

    print(model_cv, i)

    f1_mean.append(model_cv.mean())
    f1_std.append(model_cv.std())


[0.81481481 0.58333333 0.66666667 0.78571429 0.76923077] Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                ('model', LogisticRegression(random_state=0))])
[0.90322581 0.93333333 0.90322581 0.96969697 0.90322581] Pipeline(steps=[('scaler', StandardScaler()),
                ('model', KNeighborsClassifier(n_neighbors=3))])
[0.86666667 0.88888889 0.86666667 0.9375     0.81481481] DecisionTreeClassifier(max_depth=5, random_state=0)
[0.90322581 0.93333333 0.90322581 0.9375     0.96774194] RandomForestClassifier(random_state=0)


In [66]:
pd.DataFrame({
    'model': list_model,
    'f1_mean': f1_mean,
    'f1_std': f1_std,
})

Unnamed: 0,model,f1_mean,f1_std
0,"(PolynomialFeatures(degree=3), LogisticRegress...",0.723952,0.086228
1,"(StandardScaler(), KNeighborsClassifier(n_neig...",0.922542,0.026304
2,"DecisionTreeClassifier(max_depth=5, random_sta...",0.874907,0.039649
3,"(DecisionTreeClassifier(max_features='auto', r...",0.929005,0.024169


Dari 4 kandidat model, RandomForest adalah model terbaik berdasarkan cross validation pada training set

### Hyperparameter Tuning

In [None]:
RandomForestClassifier()

In [69]:
# list(range(100, 1000, 1))

In [75]:
# hyperparam space 

hyperparam_space = {
    'n_estimators': list(range(100, 1000, 1)),       # jumlah pohon
    'max_depth':  list(range(1, 100, 1)),                                 # kedalaman pohon
    'max_features': list(range(1, 2, 1)),                               # jumlah feature untuk tiap splitting
    'min_samples_split': list(range(2, 100, 1)),                         # jumlah sample sebelum splitting
    'min_samples_leaf': list(range(1, 100, 1))                         # jumlah sample sesudah splitting
}

In [76]:
# randomized search
radom_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=hyperparam_space,
    n_iter=20,
    scoring='f1',
    cv=5,
    n_jobs=-1
)

radom_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'max_features': [1],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                             17, 18, 19, 20, 21,
                                                             22, 23, 24, 25, 26,
                                    

In [77]:
print('Best f1 score', radom_search.best_score_)
print('Best param', radom_search.best_params_)


Best f1 score 0.8274586333207022
Best param {'n_estimators': 111, 'min_samples_split': 62, 'min_samples_leaf': 10, 'max_features': 1, 'max_depth': 90}


In [81]:
pd.DataFrame(radom_search.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,1.520479,0.126175,0.16646,0.029561,111,62,10,1,90,"{'n_estimators': 111, 'min_samples_split': 62,...",0.857143,0.75,0.769231,0.827586,0.933333,0.827459,0.065545,1
18,3.79341,0.204924,0.335141,0.01504,265,61,17,1,15,"{'n_estimators': 265, 'min_samples_split': 61,...",0.857143,0.695652,0.769231,0.827586,0.857143,0.801351,0.061835,2
8,9.006546,0.282866,0.791511,0.078981,664,45,25,1,77,"{'n_estimators': 664, 'min_samples_split': 45,...",0.814815,0.695652,0.666667,0.785714,0.896552,0.77188,0.083016,3
14,5.053441,0.253177,0.533851,0.089037,333,78,14,1,54,"{'n_estimators': 333, 'min_samples_split': 78,...",0.814815,0.695652,0.72,0.785714,0.814815,0.766199,0.049435,4
16,3.066214,0.053691,0.295393,0.025174,255,82,27,1,74,"{'n_estimators': 255, 'min_samples_split': 82,...",0.769231,0.695652,0.666667,0.740741,0.857143,0.745887,0.065953,5
2,11.795086,0.511736,1.295425,0.181602,657,60,27,1,62,"{'n_estimators': 657, 'min_samples_split': 60,...",0.769231,0.695652,0.666667,0.740741,0.857143,0.745887,0.065953,5
11,12.320641,0.217236,1.094186,0.148616,677,67,28,1,97,"{'n_estimators': 677, 'min_samples_split': 67,...",0.769231,0.695652,0.666667,0.740741,0.769231,0.728304,0.040914,7
13,4.40757,0.14316,0.473554,0.099747,290,90,39,1,44,"{'n_estimators': 290, 'min_samples_split': 90,...",0.608696,0.695652,0.666667,0.740741,0.72,0.686351,0.046032,8
5,7.044366,0.242724,0.695964,0.071888,532,91,39,1,45,"{'n_estimators': 532, 'min_samples_split': 91,...",0.608696,0.695652,0.666667,0.740741,0.47619,0.637589,0.091369,9
0,10.00105,0.527938,0.916692,0.0886,491,3,42,1,9,"{'n_estimators': 491, 'min_samples_split': 3, ...",0.545455,0.695652,0.666667,0.740741,0.4,0.609703,0.123212,10
