# Ensemble Learning

### import requried packages

In [39]:
import numpy as mp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### load the data

In [40]:
df = pd.read_csv('/tmp/hearing_test.csv')
df.head(3)

Unnamed: 0,age,physical_score,test_result
0,33.0,40.7,1
1,50.0,37.2,1
2,52.0,24.7,0


### EDA

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             5000 non-null   float64
 1   physical_score  5000 non-null   float64
 2   test_result     5000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 117.3 KB


In [42]:
df.describe()

Unnamed: 0,age,physical_score,test_result
count,5000.0,5000.0,5000.0
mean,51.609,32.76026,0.6
std,11.287001,8.169802,0.489947
min,18.0,-0.0,0.0
25%,43.0,26.7,0.0
50%,51.0,35.3,1.0
75%,60.0,38.9,1.0
max,90.0,50.0,1.0


In [43]:
df.corr()

Unnamed: 0,age,physical_score,test_result
age,1.0,-0.782146,-0.683171
physical_score,-0.782146,1.0,0.792716
test_result,-0.683171,0.792716,1.0


### Data PreProcessing

In [44]:
# split the data into X and Y
X = df.drop('test_result', axis=1)
Y = df['test_result']

In [45]:
from sklearn.model_selection import train_test_split

# split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=12345)

### model building

In [46]:
def build_dt_model():
    from sklearn.tree import DecisionTreeClassifier
    
    # create a model
    model = DecisionTreeClassifier()
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

In [47]:
def build_lg_model():
    from sklearn.linear_model import LogisticRegressionCV
    
    # create a model
    model = LogisticRegressionCV()
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

In [48]:
def build_knn_model():
    from sklearn.neighbors import KNeighborsClassifier
    
    # create a model
    model = KNeighborsClassifier(n_neighbors=9)
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

In [49]:
def build_nb_model():
    from sklearn.naive_bayes import GaussianNB
    
    # create a model
    model = GaussianNB()
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

In [50]:
def build_svm_model():
    from sklearn.svm import SVC
    
    # create a model
    model = SVC()
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

In [51]:
def build_rf_model():
    from sklearn.ensemble import RandomForestClassifier
    
    # create a model
    model = RandomForestClassifier(n_estimators=100)
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

In [52]:
def build_cb_model():
    from catboost import CatBoostClassifier
    
    # create a model
    model = CatBoostClassifier()
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

In [53]:
def build_xgb_model():
    from xgboost import XGBClassifier
    
    # create a model
    model = XGBClassifier()
    
    # train the model
    model.fit(x_train, y_train)
    
    return model

### model evaluation

In [54]:
def evaluate_model(model):
    y_true = y_test
    y_pred = model.predict(x_test)
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    accuracy = f"{accuracy_score(y_true, y_pred):.2f}"
    precision = f"{precision_score(y_true, y_pred):.2f}"
    recall = f"{recall_score(y_true, y_pred):.2f}"
    f1 = f"{f1_score(y_true, y_pred):.2f}"

    return accuracy, precision, recall, f1

In [55]:
# create the models and evaluate them
evaluation_report = []

model_dt = build_dt_model()
evaluation_report.append({"model": "Decision Tree", "metrics": evaluate_model(model_dt)})

model_knn = build_knn_model()
evaluation_report.append({"model": "KNN", "metrics": evaluate_model(model_knn)})

model_lg = build_lg_model()
evaluation_report.append({"model": "Logistic Regression", "metrics": evaluate_model(model_lg)})

model_svm = build_svm_model()
evaluation_report.append({"model": "SVM", "metrics": evaluate_model(model_svm)})

model_nb = build_nb_model()
evaluation_report.append({"model": "Naive Bayes", "metrics": evaluate_model(model_nb)})

model_rf = build_rf_model()
evaluation_report.append({"model": "Random Forest", "metrics": evaluate_model(model_rf)})

model_cb = build_cb_model()
evaluation_report.append({"model": "CatBoost", "metrics": evaluate_model(model_cb)})

model_xgb = build_xgb_model()
evaluation_report.append({"model": "XGB", "metrics": evaluate_model(model_xgb)})


Learning rate set to 0.017589
0:	learn: 0.6682629	total: 58.4ms	remaining: 58.3s
1:	learn: 0.6454163	total: 59.6ms	remaining: 29.8s
2:	learn: 0.6226585	total: 60.9ms	remaining: 20.2s
3:	learn: 0.5971813	total: 62ms	remaining: 15.4s
4:	learn: 0.5739082	total: 63.1ms	remaining: 12.6s
5:	learn: 0.5559613	total: 64.3ms	remaining: 10.7s
6:	learn: 0.5368406	total: 65.5ms	remaining: 9.29s
7:	learn: 0.5197703	total: 66.5ms	remaining: 8.25s
8:	learn: 0.5045958	total: 67.7ms	remaining: 7.45s
9:	learn: 0.4870376	total: 68.7ms	remaining: 6.8s
10:	learn: 0.4723023	total: 69.7ms	remaining: 6.27s
11:	learn: 0.4600774	total: 70.6ms	remaining: 5.81s
12:	learn: 0.4459284	total: 71.7ms	remaining: 5.45s
13:	learn: 0.4349750	total: 72.7ms	remaining: 5.12s
14:	learn: 0.4228977	total: 73.9ms	remaining: 4.85s
15:	learn: 0.4134495	total: 75ms	remaining: 4.61s
16:	learn: 0.4021565	total: 76.2ms	remaining: 4.41s
17:	learn: 0.3910031	total: 77.3ms	remaining: 4.22s
18:	learn: 0.3814672	total: 78.3ms	remaining: 4.0

In [56]:
print(f"{'Algorithm':<20} => 'Accuracy', 'precision', 'recall', 'f1'")
print()
for info in evaluation_report:
    print(f"{info['model']:<20} => {info['metrics']}")

Algorithm            => 'Accuracy', 'precision', 'recall', 'f1'

Decision Tree        => ('0.88', '0.92', '0.88', '0.90')
KNN                  => ('0.93', '0.92', '0.96', '0.94')
Logistic Regression  => ('0.92', '0.92', '0.95', '0.94')
SVM                  => ('0.93', '0.93', '0.97', '0.95')
Naive Bayes          => ('0.92', '0.93', '0.93', '0.93')
Random Forest        => ('0.91', '0.92', '0.93', '0.93')
CatBoost             => ('0.93', '0.93', '0.96', '0.94')
XGB                  => ('0.93', '0.93', '0.95', '0.94')


### problem

- predict if person having age 35 and physical score 20 will pass hearing test

In [33]:
prediction_dt = model_dt.predict([[35, 20]])
"Pass" if prediction_dt[0] == 1 else "Fail"



'Fail'

In [34]:
prediction_knn = model_knn.predict([[35, 20]])
"Pass" if prediction_knn[0] == 1 else "Fail"



'Fail'

In [35]:
prediction_lg = model_lg.predict([[35, 20]])
"Pass" if prediction_lg[0] == 1 else "Fail"



'Fail'

In [36]:
prediction_nb = model_nb.predict([[35, 20]])
"Pass" if prediction_nb[0] == 1 else "Fail"



'Fail'

In [37]:
prediction_svm = model_svm.predict([[35, 20]])
"Pass" if prediction_svm[0] == 1 else "Fail"



'Fail'