# Imports

In [17]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

# Dataset

In [18]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('dataset.csv')
print('Data set shape:', df.shape)

Data set shape: (1477, 28)


In [19]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,Body_Level,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Smoking,Meal_Count,...,Food_Between_Meals_Always,Food_Between_Meals_Frequently,Food_Between_Meals_Sometimes,Food_Between_Meals_no,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking,BMI
0,3,1,0.646666,0.690578,1.016135,1,-0.199318,0.236558,0,0.402155,...,0,0,1,0,1,0,0,0,0,0.823278
1,3,1,-0.649582,-0.003364,0.686578,1,0.428856,-1.026715,0,-1.792121,...,0,0,1,0,0,0,0,1,0,0.806112
2,3,0,-0.537725,0.493657,1.790354,1,1.069487,1.401741,0,0.402155,...,0,0,1,0,0,0,0,1,0,1.751663
3,3,0,-1.291128,-0.545353,-0.036499,1,0.986227,-1.620907,0,0.402155,...,0,0,1,0,0,0,0,1,0,0.219426
4,3,0,0.270463,-0.818715,0.951256,1,1.069487,0.97415,0,0.402155,...,0,0,1,0,0,0,0,1,0,1.574457


In [20]:
#Label percentages
df['Body_Level'].value_counts(normalize=True)

3    0.460393
2    0.274882
1    0.136087
0    0.128639
Name: Body_Level, dtype: float64

In [21]:
# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(df.drop('Body_Level', axis=1), df['Body_Level'], test_size=0.2, random_state=42)
# Print the shapes of the training and test sets
print('Training set shape:', X_train.shape)
print('Test set shape:', X_test.shape)

Training set shape: (1181, 27)
Test set shape: (296, 27)


In [22]:
#Label percentages
Y_train.value_counts(normalize=True)

3    0.468247
2    0.271804
1    0.133785
0    0.126164
Name: Body_Level, dtype: float64

In [23]:
#Label percentages
Y_test.value_counts(normalize=True)

3    0.429054
2    0.287162
1    0.145270
0    0.138514
Name: Body_Level, dtype: float64

# Common Functions

In [24]:
def apply_crossvalidation(model, X_train, Y_train, k=10):

    # create a k-fold cross-validation iterator
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # perform k-fold cross-validation and compute accuracy
    scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='accuracy')
    # print the average accuracy score and its standard deviation
    print('Accuracy: {} +/- {}'.format(scores.mean(), scores.std()))

    # perform k-fold cross-validation and compute F1-score
    scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='f1_weighted')
    # print the average F1-score and its standard deviation
    print('F1-score: {} +/- {}'.format(scores.mean(), scores.std()))

    # # perform k-fold cross-validation and compute AUC
    # scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='roc_auc')
    # # print the average AUC score and its standard deviation
    # print('AUC: {} +/- {}'.format(scores.mean(), scores.std()))

In [25]:
def Evaluate(model, X_test, Y_test):
    
    # predict the class labels for the test set
    y_pred = model.predict(X_test)

    # calculate the accuracy
    accuracy = accuracy_score(Y_test, y_pred)

    # calculate the precision
    precision = precision_score(Y_test, y_pred, average='weighted')

    # calculate the recall
    recall = recall_score(Y_test, y_pred, average='weighted')

    # calculate the F1 score
    f1 = f1_score(Y_test, y_pred, average='weighted')

    # calculate the confusion matrix
    cm = confusion_matrix(Y_test, y_pred)

    # print the results
    print('Accuracy: {}'.format(accuracy))
    print('weighted F1 score: {}'.format(f1))
    print('weighted Precision: {}'.format(precision))
    print('weighted Recall: {}'.format(recall))
    print('Confusion matrix:\n', cm)

# Random Forest

simple fit and evaluate

In [26]:
# Create a random forest object
rf_base = RandomForestClassifier(random_state=42)

# Fit the model to the training data
rf_base.fit(X_train, Y_train)

In [27]:
# Predict the labels of the test set
y_pred = rf_base.predict(X_test)

# Evaluate the model
Evaluate(rf_base, X_test, Y_test)

Accuracy: 0.9966216216216216
weighted F1 score: 0.9966182506906488
weighted Precision: 0.9966480152027027
weighted Recall: 0.9966216216216216
Confusion matrix:
 [[ 41   0   0   0]
 [  0  43   0   0]
 [  0   0  84   1]
 [  0   0   0 127]]


In [28]:
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf_base.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


Hyperparameter Tuning

- normally we would do a random search to narrow down the range of hyperparameters and then do a grid search to find the best hyperparameters
- since the model already has a good performance, we will just do a grid search

In [29]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None],
    'max_features': ['sqrt', 'auto', 'log2'], #
    'min_samples_leaf': [1],
    'min_samples_split': [2],
    'n_estimators': [100, 200, 400, 800, 1000]  #
}
# Create a based model
rf = RandomForestClassifier(random_state=42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 2)

In [30]:
# Fit the grid search to the data
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [31]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [32]:
best_grid = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_grid.fit(X_train, Y_train)
Evaluate(best_grid, X_test, Y_test)

Accuracy: 0.9966216216216216
weighted F1 score: 0.9966182506906488
weighted Precision: 0.9966480152027027
weighted Recall: 0.9966216216216216
Confusion matrix:
 [[ 41   0   0   0]
 [  0  43   0   0]
 [  0   0  84   1]
 [  0   0   0 127]]


In [None]:
"""
We splitted the data into 80% training and 20% test sets.

BMI feature is one of the most important features for Random Forest model.

If we included the BMI feature the simple random forest model with default paramters 
achieves a weighted F1-score of 99.66% on the test set.

However, if we excluded the BMI feature achieves a weighted F1-score of 94.64% on the test set.

Since the random forest model using BMI with default paramters already achieves a very high accuracy and F1-score,
we tried to tune the hyperparameters around the default values using grid search and cross-validation,
but the hyperparameter tuning had insignificant effect on the accuracy and F1-score.

However, if we fine-tuned the models without BMI feature using grid search and cross-validation,
the weighted F1-score on the test set increased from 94.64% to 95.33% but still lower than the model with BMI feature.

The main hyperparameters that affect the performance of the random forest model are the number of trees in the forest and 
the max number of features considered for splitting a node.
"""