# Imports

In [113]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

# Dataset

In [114]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('dataset.csv')
df = df.drop('BMI', axis=1)
print('Data set shape:', df.shape)

Data set shape: (1477, 27)


In [115]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,Body_Level,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Smoking,Meal_Count,...,Alcohol_Consump_no,Food_Between_Meals_Always,Food_Between_Meals_Frequently,Food_Between_Meals_Sometimes,Food_Between_Meals_no,Transport_Automobile,Transport_Bike,Transport_Motorbike,Transport_Public_Transportation,Transport_Walking
0,3,1,0.646666,0.690578,1.016135,1,-0.199318,0.236558,0,0.402155,...,0,0,0,1,0,1,0,0,0,0
1,3,1,-0.649582,-0.003364,0.686578,1,0.428856,-1.026715,0,-1.792121,...,1,0,0,1,0,0,0,0,1,0
2,3,0,-0.537725,0.493657,1.790354,1,1.069487,1.401741,0,0.402155,...,0,0,0,1,0,0,0,0,1,0
3,3,0,-1.291128,-0.545353,-0.036499,1,0.986227,-1.620907,0,0.402155,...,1,0,0,1,0,0,0,0,1,0
4,3,0,0.270463,-0.818715,0.951256,1,1.069487,0.97415,0,0.402155,...,0,0,0,1,0,0,0,0,1,0


In [116]:
#Label percentages
df['Body_Level'].value_counts(normalize=True)

3    0.460393
2    0.274882
1    0.136087
0    0.128639
Name: Body_Level, dtype: float64

In [117]:
# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(df.drop('Body_Level', axis=1), df['Body_Level'], test_size=0.2, random_state=42)
# Print the shapes of the training and test sets
print('Training set shape:', X_train.shape)
print('Test set shape:', X_test.shape)

Training set shape: (1181, 26)
Test set shape: (296, 26)


In [118]:
#Label percentages
Y_train.value_counts(normalize=True)

3    0.468247
2    0.271804
1    0.133785
0    0.126164
Name: Body_Level, dtype: float64

In [119]:
#Label percentages
Y_test.value_counts(normalize=True)

3    0.429054
2    0.287162
1    0.145270
0    0.138514
Name: Body_Level, dtype: float64

# Common Functions

In [120]:
def apply_crossvalidation(model, X_train, Y_train, k=10):

    # create a k-fold cross-validation iterator
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # perform k-fold cross-validation and compute accuracy
    scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='accuracy')
    # print the average accuracy score and its standard deviation
    print('Accuracy: {} +/- {}'.format(scores.mean(), scores.std()))

    # perform k-fold cross-validation and compute F1-score
    scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='f1_weighted')
    # print the average F1-score and its standard deviation
    print('F1-score: {} +/- {}'.format(scores.mean(), scores.std()))

    # # perform k-fold cross-validation and compute AUC
    # scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='roc_auc')
    # # print the average AUC score and its standard deviation
    # print('AUC: {} +/- {}'.format(scores.mean(), scores.std()))

In [121]:
def Evaluate(model, X_test, Y_test):
    
    # predict the class labels for the test set
    y_pred = model.predict(X_test)

    # calculate the accuracy
    accuracy = accuracy_score(Y_test, y_pred)

    # calculate the precision
    precision = precision_score(Y_test, y_pred, average='weighted')

    # calculate the recall
    recall = recall_score(Y_test, y_pred, average='weighted')

    # calculate the F1 score
    f1 = f1_score(Y_test, y_pred, average='weighted')

    # calculate the confusion matrix
    cm = confusion_matrix(Y_test, y_pred)

    # print the results
    print('Accuracy: {}'.format(accuracy))
    print('weighted F1 score: {}'.format(f1))
    print('weighted Precision: {}'.format(precision))
    print('weighted Recall: {}'.format(recall))
    print('Confusion matrix:\n', cm)

# logistic regression

simple fit and evaluate

In [122]:
# Create a logistic regression object
lorg_base = LogisticRegression(random_state=42)

# Fit the model to the training data
lorg_base.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [123]:
# Predict the labels of the test set
y_pred = lorg_base.predict(X_test)

# Evaluate the model
Evaluate(lorg_base, X_test, Y_test)

Accuracy: 0.9324324324324325
weighted F1 score: 0.929892212630901
weighted Precision: 0.9306718509164161
weighted Recall: 0.9324324324324325
Confusion matrix:
 [[ 41   0   0   0]
 [  5  29   9   0]
 [  0   6  79   0]
 [  0   0   0 127]]


In [124]:
# Look at parameters used by our current logistic regression model
print('Parameters currently in use:\n')
pprint(lorg_base.get_params())

Parameters currently in use:

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


Hyperparameter Tuning
- Logistic regression does not really have any critical hyperparameters to tune.
- The main hyperparameters we may tune in logistic regression are: solver, penalty, and regularization strength (sklearn documentation).
- grid search is enough here.

In [125]:
# Create the parameter grid
param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
}
# Create a based model
lorg = LogisticRegression(random_state=42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = lorg, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 2)

In [126]:
# Fit the grid search to the data
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 140 candidates, totalling 700 fits


315 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "d:\programing\programs\anaconda\envs\pytorch\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\programing\programs\anaconda\envs\pytorch\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\programing\programs\anaconda\envs\pytorch\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' o

In [127]:
grid_search.best_params_

{'C': 1000, 'penalty': 'l2', 'solver': 'newton-cg'}

In [128]:
best_grid = LogisticRegression(**grid_search.best_params_, random_state=42)
best_grid.fit(X_train, Y_train)
Evaluate(best_grid, X_test, Y_test)

Accuracy: 0.972972972972973
weighted F1 score: 0.9733717643618115
weighted Precision: 0.9749176259592925
weighted Recall: 0.972972972972973
Confusion matrix:
 [[ 40   1   0   0]
 [  1  42   0   0]
 [  0   5  80   0]
 [  0   0   1 126]]
