# Natural Computing - Assignment 5 - Ensemble Learning
## Exercise 5
#### Submission by group 25 (Chihab Amghane, Max Driessen, Jordy Naus)

This file contains our code for exercise 5 of the "Ensemble Learning" assignment of the Natural Computing course.

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,plot_confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from tqdm import tqdm

import os

## Importing data

In [2]:
# onehot encoding
def onehotencode(string,encoder,possibilities):
    """Return the one-hot vector of ``string`` among ``possibilities``.

    Args:
        string: The category value to encode.
        encoder: Not used. The argument is never read; a new
            ``OneHotEncoder`` is created and fitted on every call.
        possibilities: Iterable of all category values the encoder is
            fitted on.

    Returns:
        The dense one-hot row for ``string`` (first row of the transformed
        matrix).
    """
    # Each category becomes its own single-feature sample for fitting.
    category_samples = [[option] for option in possibilities]
    fitted_encoder = OneHotEncoder().fit(category_samples)
    encoded = fitted_encoder.transform([[string]])
    return encoded.toarray()[0]


In [3]:
def remove_zeros(row,column):
    """Replace an integer entry ``row[column]`` with the placeholder ``'N'``.

    Used on rows whose missing values were filled with the integer ``0``, so
    that a categorical column ends up with a string placeholder instead of
    a number. Both built-in ``int`` values and NumPy integer scalars are
    treated as fillers; booleans and all other types are left untouched.

    Args:
        row: Mutable record supporting ``row[column]`` get/set (for example
            a ``pandas.Series`` or a ``dict``). It is modified in place.
        column: Key of the entry to inspect.

    Returns:
        The same ``row`` object that was passed in.
    """
    value = row[column]
    # bool is a subclass of int, but a boolean is real data, not a filler.
    is_integer = isinstance(value, (int, np.integer)) and not isinstance(value, bool)
    if is_integer:
        row[column] = 'N'
    return row

In [4]:
# Read the Titanic training set; missing values in every column become 0.
data_loc = '../../../ensemble_learning/titanic'
train_csv_path = os.path.join(data_loc, 'train.csv')
dataframe_train = pd.read_csv(train_csv_path).fillna(0)

# One-hot encode 'Sex' into separate 'male' and 'female' indicator columns.
onehotencode_sex = pd.get_dummies(dataframe_train['Sex'])
for sex_column in ('male', 'female'):
    dataframe_train[sex_column] = onehotencode_sex[sex_column]

# One-hot encode 'Embarked'. remove_zeros first turns the integer 0 that
# fillna left behind into 'N', so filled-in entries get their own category.
dataframe_train = dataframe_train.apply(remove_zeros, axis=1, column='Embarked')
onehotencode_embarked = pd.get_dummies(dataframe_train['Embarked'])
for col, indicator in onehotencode_embarked.items():
    dataframe_train[col] = indicator

# Target vector, then the feature matrix without identifiers, free-text
# columns and the categorical columns that were one-hot encoded above.
y = dataframe_train['Survived'].to_numpy()
non_feature_columns = ['PassengerId','Survived','Name', 'Ticket','Cabin','Sex','Embarked']
dataframe_train = dataframe_train.drop(columns=non_feature_columns)
X = dataframe_train.to_numpy()

# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [5]:
# Shared random-forest classifier; later cells re-configure it via set_params.
# oob_score=True is what makes clf.oob_score_ available after fitting.
clf = RandomForestClassifier(
    oob_score=True,
    random_state=0,
    max_depth=5,
)


In [None]:
## Sweep over the number of trees in the forest
min_forest_size = 100
max_forest_size = 2500
# Forest sizes that are actually trained. NOTE: the stop value is
# max_forest_size+100, so the largest forest has max_forest_size+90 trees.
forest_sizes = range(min_forest_size, max_forest_size + 100, 10)

# Results keyed by forest size. Only trained sizes get an entry, so the
# dictionaries never contain empty placeholders for sizes that were skipped.
forest_data = {}
forest_predictions = {}
for forest_size in tqdm(forest_sizes):
    clf.set_params(n_estimators=forest_size)
    clf.fit(X_train, y_train)

    ## We are interested in accuracy and oob scores.
    # Predict once and derive the accuracy from those predictions;
    # clf.score would run the same prediction a second time.
    predictions = clf.predict(X_test)
    forest_predictions[forest_size] = predictions
    forest_data[forest_size] = {
        'accuracy': accuracy_score(y_test, predictions),
        'oob_error': 1 - clf.oob_score_,
    }

 24%|██▍       | 60/250 [00:48<04:00,  1.27s/it]

In [None]:
# Flatten the per-size results into lists ordered by increasing forest size.
evaluated_forest_sizes = range(min_forest_size, max_forest_size + 100, 10)
accuracies = [forest_data[size]['accuracy'] for size in evaluated_forest_sizes]
oob_errors = [forest_data[size]['oob_error'] for size in evaluated_forest_sizes]

In [None]:
# One figure per metric of the forest-size sweep: test accuracy first, then
# the out-of-bag error (drawn in orange).
tree_counts = range(min_forest_size, max_forest_size + 100, 10)
forest_size_plots = (
    ((accuracies,),
     'Overview of the effect of the number of estimators on the accuracy',
     'Accuracy'),
    ((oob_errors, 'tab:orange'),
     'Overview of the effect of the number of estimators on the OOB error',
     'OOB error'),
)
for series_args, plot_title, metric_label in forest_size_plots:
    plt.plot(tree_counts, *series_args)
    plt.title(plot_title)
    plt.xlabel('number of trees')
    plt.ylabel(metric_label)
    plt.show()

In [None]:
# Summarise the forest-size sweep. The positions of the best results are
# derived from the data instead of being hard-coded, so the report stays
# correct (and cannot raise IndexError) when the sweep range changes.
forest_sizes = range(min_forest_size, max_forest_size + 100, 10)
best_accuracy_index = int(np.argmax(np.array(accuracies)))
lowest_oob_index = int(np.argmin(np.array(oob_errors)))
print(f"The index of the highest accuracy {best_accuracy_index}")
print(f"The index of the lowest oob error {lowest_oob_index}")

# For both optima, report the forest size together with both metrics.
for criterion, index in (
    ("highest accuracy", best_accuracy_index),
    ("lowest oob error", lowest_oob_index),
):
    print(f"{criterion}: the number of trees {forest_sizes[index]}")
    print(f"{criterion}: oob error {oob_errors[index]}")
    print(f"{criterion}: accuracy {accuracies[index]}")

In [None]:
## Sweep over the number of features considered at each split
min_nr_features = 1
max_nr_features = 7
# NOTE: range() excludes its stop value, so max_nr_features itself is not
# evaluated; the cells that read these results iterate over the same range.
feature_counts = range(min_nr_features, max_nr_features)

# Results keyed by max_features. Only evaluated settings get an entry, so
# there is no empty placeholder for a value that was never trained.
features_forest_data = {}
features_forest_predictions = {}
for nr_of_features in tqdm(feature_counts):
    clf.set_params(n_estimators=610, max_features=nr_of_features)
    clf.fit(X_train, y_train)

    ## We are interested in accuracy and oob scores.
    # Predict once and derive the accuracy from those predictions;
    # clf.score would run the same prediction a second time.
    predictions = clf.predict(X_test)
    features_forest_predictions[nr_of_features] = predictions
    features_forest_data[nr_of_features] = {
        'accuracy': accuracy_score(y_test, predictions),
        'oob_error': 1 - clf.oob_score_,
    }




In [None]:
# Flatten the per-setting results into lists ordered by increasing
# number of features.
evaluated_feature_counts = range(min_nr_features, max_nr_features)
feature_accuracies = [
    features_forest_data[count]['accuracy'] for count in evaluated_feature_counts
]
feature_oob_errors = [
    features_forest_data[count]['oob_error'] for count in evaluated_feature_counts
]

In [None]:
# One figure per metric of the feature-count sweep: test accuracy first,
# then the out-of-bag error (drawn in orange).
feature_axis = range(min_nr_features, max_nr_features)
feature_count_plots = (
    ((feature_accuracies,),
     'Overview of the effect of the number of features on the accuracy',
     'Accuracy'),
    ((feature_oob_errors, 'tab:orange'),
     'Overview of the effect of the number of features on the OOB error',
     'OOB error'),
)
for series_args, plot_title, metric_label in feature_count_plots:
    plt.plot(feature_axis, *series_args)
    plt.title(plot_title)
    plt.xlabel('number of features')
    plt.ylabel(metric_label)
    plt.show()

## Best Model

In [None]:
# Refit the shared classifier with the configuration used as the best model.
best_model_params = {'n_estimators': 610, 'max_features': 2}
clf.set_params(**best_model_params)
clf.fit(X_train, y_train)

# Test-set predictions, test accuracy and out-of-bag error of that model.
predictions = clf.predict(X_test)
oob_error = 1 - clf.oob_score_
accuracy = clf.score(X_test, y_test)


In [None]:
# Confusion matrix of the fitted classifier on the test set.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2; building the matrix explicitly and handing it to
# ConfusionMatrixDisplay draws the same figure on old and new versions.
from sklearn.metrics import ConfusionMatrixDisplay

test_confusion = confusion_matrix(y_test, clf.predict(X_test))
ConfusionMatrixDisplay(
    test_confusion,
    display_labels=['Dead','Survived'],
).plot(values_format='0.5g', cmap=plt.cm.Blues)


In [None]:
# Per-class precision, recall and F1 of the test-set predictions.
class_names = ["Dead", "Survived"]
report = classification_report(y_test, predictions, target_names=class_names)
print(report)

In [None]:
## class imbalance
# Number of samples per class in the full data set, printed as the shape
# tuple of each class's slice of y (label 0 first, then label 1).
for class_label in (0, 1):
    class_members = y[y == class_label]
    print(class_members.shape)

## Feature importance

In [None]:
# Horizontal bar chart of the forest's impurity-based feature importances,
# sorted ascending so the most important feature ends up at the top.
feature_importances = clf.feature_importances_
sorted_ids = np.argsort(feature_importances)
feature_names = dataframe_train.columns
fig,ax = plt.subplots()
y_ticks = range(0, len(feature_importances))
ax.barh(y_ticks,feature_importances[sorted_ids])
# Fix the tick positions before labelling them: labels set while the axis
# still uses its automatic locator are not tied to the bars and make
# matplotlib emit a warning.
ax.set_yticks(y_ticks)
ax.set_yticklabels(feature_names[sorted_ids])
ax.set_ylabel('Feature')
ax.set_xlabel('Importance')
fig.suptitle('Feature Importance')
plt.show()

In [None]:
# Permutation importance of every feature on the held-out test set, shown
# as one box per feature over the repeated shuffles.
from sklearn.inspection import permutation_importance

# Seeded like every other random step in this notebook so that reruns
# produce the same importances and the same plot.
result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=0)
permutation_importances = result.importances_mean
sorted_ids = np.argsort(permutation_importances)
fig,ax = plt.subplots()
# result.importances has one row per feature; transpose so that each box
# summarises the repeats of one feature.
# NOTE(review): Matplotlib 3.9 renamed the `labels` keyword of boxplot to
# `tick_labels`; switch once the minimum supported version allows it.
ax.boxplot(result.importances[sorted_ids].T, vert=False, labels=feature_names[sorted_ids])
ax.set_title('Permutation Importances on the Test set')
ax.set_ylabel('Feature')
ax.set_xlabel('Permutation Importance')
plt.show()