# Read in data

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [42]:
# Read in data
df = pd.read_csv('../Data/preprocessedData.csv')
df.head()

Unnamed: 0,Age,Height,Weight,Veg_Consump,Water_Consump,Meal_Count,Phys_Act,Time_E_Dev,Gender_Female,Gender_Male,...,Fam_Hist_no,Fam_Hist_yes,H_Cal_Burn_no,H_Cal_Burn_yes,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking,Body_Level
0,22.547298,1.722461,51.881263,2.663421,1.04111,3.0,0.794402,1.391948,1,0,...,0,1,1,0,0,0,0,1,0,Body Level 1
1,19.799054,1.743702,54.927529,2.0,2.847264,3.28926,1.680844,2.0,0,1,...,0,1,1,0,0,0,0,1,0,Body Level 1
2,17.823438,1.708406,50.0,1.642241,1.099231,3.45259,0.418875,1.0,1,0,...,1,0,1,0,0,0,0,1,0,Body Level 1
3,19.007177,1.690727,49.895716,1.212908,1.029703,3.207071,2.0,1.0,1,0,...,1,0,1,0,0,0,0,1,0,Body Level 1
4,19.72925,1.793315,58.19515,2.508835,2.076933,3.435905,2.026668,1.443328,0,1,...,0,1,1,0,1,0,0,0,0,Body Level 1


In [43]:
# the target is Body_Level
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## logisitic regression

In [44]:
# Create an LogisticRegression classifier
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))

# classification report
print(classification_report(y_test, y_pred))


# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy of logistic regression classifier on test set: 0.78
              precision    recall  f1-score   support

Body Level 1       0.84      0.83      0.84       136
Body Level 2       0.67      0.74      0.70       133
Body Level 3       0.75      0.69      0.72       143
Body Level 4       0.85      0.86      0.85       132

    accuracy                           0.78       544
   macro avg       0.78      0.78      0.78       544
weighted avg       0.78      0.78      0.78       544

Confusion Matrix:
[[113  23   0   0]
 [ 21  98  14   0]
 [  0  25  98  20]
 [  0   0  19 113]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
# Define the parameter grid
param_grid = {'C': [0.1, 1, 10],
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'saga']}

# Create the logistic regression classifier
logreg = LogisticRegression()

# Perform grid search with cross-validation
grid_search = GridSearchCV(logreg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Use the best model to make predictions
y_pred = best_model.predict(X_test)

# Print the accuracy and other metrics
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)




Accuracy of logistic regression classifier on test set: 0.88
              precision    recall  f1-score   support

Body Level 1       0.98      1.00      0.99       136
Body Level 2       0.76      0.75      0.76       133
Body Level 3       0.79      0.78      0.78       143
Body Level 4       0.98      0.99      0.99       132

    accuracy                           0.88       544
   macro avg       0.88      0.88      0.88       544
weighted avg       0.88      0.88      0.88       544

Confusion Matrix:
[[136   0   0   0]
 [  3 100  30   0]
 [  0  30 111   2]
 [  0   1   0 131]]


## SVM

In [46]:
# Create an SVM classifier
clf = svm.SVC(kernel='linear',)

# Train the SVM classifier
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy: 0.9338235294117647
              precision    recall  f1-score   support

Body Level 1       0.93      1.00      0.96       136
Body Level 2       0.95      0.89      0.92       133
Body Level 3       0.93      0.89      0.91       143
Body Level 4       0.93      0.96      0.95       132

    accuracy                           0.93       544
   macro avg       0.93      0.93      0.93       544
weighted avg       0.93      0.93      0.93       544

Confusion Matrix:
[[136   0   0   0]
 [ 10 118   5   0]
 [  1   6 127   9]
 [  0   0   5 127]]


In [47]:
# Define the parameter grid for grid search
param_grid = {
    'C': [0.1, 1, 10],  # The regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # The kernel type
    'gamma': ['scale', 'auto'],  # The kernel coefficient
}

# Create an SVM classifier
svm_model = svm.SVC()

# Perform grid search with cross-validation
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Train the SVM classifier with the best parameters
best_clf = svm.SVC(**best_params)
best_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Best Parameters: {'C': 0.1, 'gamma': 'auto', 'kernel': 'poly'}
Best Score: 0.9894305599493831
Accuracy: 0.9926470588235294
              precision    recall  f1-score   support

Body Level 1       0.99      1.00      0.99       136
Body Level 2       1.00      0.99      1.00       133
Body Level 3       0.99      0.99      0.99       143
Body Level 4       0.99      0.99      0.99       132

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

Confusion Matrix:
[[136   0   0   0]
 [  1 132   0   0]
 [  1   0 141   1]
 [  0   0   1 131]]


In [48]:
# save the model to disk
filename = '../Models/best_svm_model.sav'
pickle.dump(best_clf, open(filename, 'wb'))

In [36]:
# load the model from disk
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Make predictions on the test set
y_pred = loaded_model.predict(X_test)

print(f'y_pred: {y_pred}')

# save the predictions in a new csv file
df = pd.DataFrame(y_pred, columns=['Body_Level'])
df.to_csv('../Data/predictions.csv', index=False)


y_pred: ['Body Level 4' 'Body Level 3' 'Body Level 3' 'Body Level 2'
 'Body Level 4' 'Body Level 4' 'Body Level 2' 'Body Level 4'
 'Body Level 2' 'Body Level 2' 'Body Level 3' 'Body Level 4'
 'Body Level 1' 'Body Level 1' 'Body Level 2' 'Body Level 4'
 'Body Level 2' 'Body Level 3' 'Body Level 1' 'Body Level 1'
 'Body Level 4' 'Body Level 2' 'Body Level 4' 'Body Level 3'
 'Body Level 4' 'Body Level 4' 'Body Level 3' 'Body Level 2'
 'Body Level 4' 'Body Level 4' 'Body Level 1' 'Body Level 1'
 'Body Level 2' 'Body Level 4' 'Body Level 4' 'Body Level 1'
 'Body Level 2' 'Body Level 2' 'Body Level 1' 'Body Level 3'
 'Body Level 4' 'Body Level 4' 'Body Level 2' 'Body Level 1'
 'Body Level 2' 'Body Level 4' 'Body Level 4' 'Body Level 3'
 'Body Level 4' 'Body Level 2' 'Body Level 2' 'Body Level 4'
 'Body Level 1' 'Body Level 4' 'Body Level 2' 'Body Level 3'
 'Body Level 4' 'Body Level 1' 'Body Level 3' 'Body Level 2'
 'Body Level 2' 'Body Level 3' 'Body Level 2' 'Body Level 2'
 'Body Level 3' 

# Decision Tree

In [49]:
# Create a DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_classifier.predict(X_test)
print('Accuracy of decision tree classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))

# Classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Accuracy of decision tree classifier on test set: 0.99
              precision    recall  f1-score   support

Body Level 1       1.00      1.00      1.00       136
Body Level 2       0.97      1.00      0.99       133
Body Level 3       0.97      0.97      0.97       143
Body Level 4       1.00      0.97      0.98       132

    accuracy                           0.99       544
   macro avg       0.99      0.99      0.99       544
weighted avg       0.99      0.99      0.99       544

Confusion Matrix:
[[136   0   0   0]
 [  0 133   0   0]
 [  0   4 139   0]
 [  0   0   4 128]]


In [50]:
# Create a DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()

# Define the parameter grid for grid search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)
print('Accuracy of decision tree classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))

# Classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Print the best parameters found by grid search
print("Best Parameters:")
print(best_params)


Accuracy of decision tree classifier on test set: 0.98
              precision    recall  f1-score   support

Body Level 1       1.00      1.00      1.00       136
Body Level 2       0.99      0.97      0.98       133
Body Level 3       0.93      0.99      0.96       143
Body Level 4       1.00      0.95      0.98       132

    accuracy                           0.98       544
   macro avg       0.98      0.98      0.98       544
weighted avg       0.98      0.98      0.98       544

Confusion Matrix:
[[136   0   0   0]
 [  0 129   4   0]
 [  0   1 142   0]
 [  0   0   6 126]]
Best Parameters:
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}


# Random Forest

In [51]:
# Create a Random Forest classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)
print('Accuracy of Random Forest classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))

# Classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Accuracy of Random Forest classifier on test set: 0.98
              precision    recall  f1-score   support

Body Level 1       1.00      1.00      1.00       136
Body Level 2       0.94      0.98      0.96       133
Body Level 3       0.97      0.95      0.96       143
Body Level 4       1.00      0.98      0.99       132

    accuracy                           0.98       544
   macro avg       0.98      0.98      0.98       544
weighted avg       0.98      0.98      0.98       544

Confusion Matrix:
[[136   0   0   0]
 [  0 131   2   0]
 [  0   7 136   0]
 [  0   1   2 129]]


In [52]:
# Create a Random Forest classifier
rf = RandomForestClassifier()

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)
print('Accuracy of Random Forest classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))

# Classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Print the best parameters
print("Best Parameters:")
print(best_params)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Accuracy of Random Forest classifier on test set: 0.96
              precision    recall  f1-score   support

Body Level 1       1.00      0.99      1.00       136
Body Level 2       0.92      0.97      0.95       133
Body Level 3       0.96      0.91      0.93       143
Body Level 4       0.97      0.98      0.97       132

    accuracy                           0.96       544
   macro avg       0.96      0.96      0.96       544
weighted avg       0.96      0.96      0.96       544

Confusion Matrix:
[[135   1   0   0]
 [  0 129   4   0]
 [  0   9 130   4]
 [  0   1   2 129]]
Best Parameters:
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
