In [1]:
import os, psutil
import gc

import numpy as np
import pandas as pd
import scipy as sci
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import sklearn
sklearn.set_config(transform_output="pandas")

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB 

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_validate,cross_val_score,train_test_split, KFold, GridSearchCV, LearningCurveDisplay, ValidationCurveDisplay
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score, confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.metrics import fowlkes_mallows_score, homogeneity_completeness_v_measure, adjusted_mutual_info_score, normalized_mutual_info_score, adjusted_rand_score, rand_score
from sklearn import ensemble,metrics,model_selection,neighbors,preprocessing, svm, tree
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, TargetEncoder, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

import lightgbm as lgb
from lightgbm import LGBMClassifier

from statsmodels.graphics.mosaicplot import mosaic

from time import time
import warnings

import shap

from sklearn.cluster import KMeans, BisectingKMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Modified from HW 5 Q2 Part a.
def train_and_evaluate_classifier(name, clf, X_train, y_train, X_test, y_test, cv=5, debug=False, learning_curve=False, confusion=False, supress_outputs=True):
  print(name)
    
  t0 = time()
  # Fit your classifier on the training set
  ### START CODE ###
  clf.fit(X_train, y_train)
  ### END CODE ###
  if supress_outputs == False:
      print("training time", round(time()-t0, 3), "s")

  t0 = time()
  y_train_pred = clf.predict(X_train)
  y_test_pred = clf.predict(X_test)
  if supress_outputs == False:
      print("predict time", round(time()-t0, 3), "s")

  if supress_outputs == False:
    print("Confusion matrix: ")
  # Print the confusion matrix computed from the test set (1 line of code only)
  ### START CODE ###
  #print(confusion_matrix(y_test, y_pred))
  # ConfusionMatrixDisplay.from_estimator(clf, X_train, y_train)
  if confusion == True:
      ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
      plt.show()
  ### END CODE ###


  ### START CODE ###
  y_train_pred_proba = clf.predict_proba(X_train)
  y_test_pred_proba = clf.predict_proba(X_test)
    
  if debug == True:
      print(y_test_pred_proba)

  train_acc_score = accuracy_score(y_train, y_train_pred)
  test_acc_score = accuracy_score(y_test, y_test_pred)

  train_auc_score_ovo = roc_auc_score(y_train, y_train_pred_proba, multi_class='ovo')  
  train_auc_score_ovr = roc_auc_score(y_train, y_train_pred_proba, multi_class='ovr')
    
  test_auc_score_ovo = roc_auc_score(y_test, y_test_pred_proba, multi_class='ovo')  
  test_auc_score_ovr = roc_auc_score(y_test, y_test_pred_proba, multi_class='ovr')
  ### END CODE ###
    
  if supress_outputs == False:
      print("TRAIN - Accuracy: {}, AUC_ROC_OVO: {}, AUC_ROC_OVR: {}".format(train_acc_score, train_auc_score_ovo, train_auc_score_ovr))
      print("TEST - Accuracy: {}, AUC_ROC_OVO: {}, AUC_ROC_OVR: {}".format(test_acc_score, test_auc_score_ovo, test_auc_score_ovr))

  # Display Learning curve
  if learning_curve == True:
      LearningCurveDisplay.from_estimator(clf, X_train, y_train, scoring='accuracy')
  #RocCurveDisplay.from_estimator(clf, X_test, y_test)

  train_results = [train_acc_score, train_auc_score_ovo, train_auc_score_ovr]
  test_results = [test_acc_score, test_auc_score_ovo, test_auc_score_ovr]
  return train_results, test_results

In [3]:
# Modified from HW 5 Q2 Part a.
def train_and_evaluate_classifierCV(name, clf, X_train, y_train, X_test, y_test, cv=5, debug=False, learning_curve=False, confusion=False, supress_outputs=True):
  print(name)
    
  t0 = time()
  # Fit your classifier on the training set
  ### START CODE ###
  clf.fit(X_train, y_train)
  scores = cross_validate(clf, X_train, y_train, scoring=['accuracy', 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted'], cv=cv, return_estimator=True, return_train_score=True)
  ### END CODE ###
  if supress_outputs == False:
      print("CV time", round(time()-t0, 3), "s")

  estimators = scores['estimator']
  del scores['estimator']
  scores = pd.DataFrame(scores).T
  summary = scores.assign(
        mean = lambda x: x.mean(axis=1),
        std = lambda x: x.std(axis=1),
    )

  if supress_outputs == False:
      display(summary[['mean', 'std']])

  t0 = time()
  y_train_pred = clf.predict(X_train)
  y_test_pred = clf.predict(X_test)
  if supress_outputs == False:
      print("predict time", round(time()-t0, 3), "s")

  if supress_outputs == False:
    print("Confusion matrix: ")
  # Print the confusion matrix computed from the test set (1 line of code only)
  ### START CODE ###
  #print(confusion_matrix(y_test, y_pred))
  # ConfusionMatrixDisplay.from_estimator(clf, X_train, y_train)
  if confusion == True:
      ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
      plt.show()
  ### END CODE ###


  ### START CODE ###
  y_train_pred_proba = clf.predict_proba(X_train)
  y_test_pred_proba = clf.predict_proba(X_test)
    
  if debug == True:
      print(y_test_pred_proba)

  train_acc_score = accuracy_score(y_train, y_train_pred)
  test_acc_score = accuracy_score(y_test, y_test_pred)

  train_auc_score_ovo = roc_auc_score(y_train, y_train_pred_proba, multi_class='ovo')  
  train_auc_score_ovr = roc_auc_score(y_train, y_train_pred_proba, multi_class='ovr')
    
  test_auc_score_ovo = roc_auc_score(y_test, y_test_pred_proba, multi_class='ovo')  
  test_auc_score_ovr = roc_auc_score(y_test, y_test_pred_proba, multi_class='ovr')
  ### END CODE ###
    
  if supress_outputs == False:
      print("TRAIN - Accuracy: {}, AUC_ROC_OVO: {}, AUC_ROC_OVR: {}".format(train_acc_score, train_auc_score_ovo, train_auc_score_ovr))
      print("TEST - Accuracy: {}, AUC_ROC_OVO: {}, AUC_ROC_OVR: {}".format(test_acc_score, test_auc_score_ovo, test_auc_score_ovr))

  # Display Learning curve
  if learning_curve == True:
      LearningCurveDisplay.from_estimator(clf, X_train, y_train, scoring='accuracy')
  #RocCurveDisplay.from_estimator(clf, X_test, y_test)

  train_results = [train_acc_score, train_auc_score_ovo, train_auc_score_ovr]
  test_results = [test_acc_score, test_auc_score_ovo, test_auc_score_ovr]
  return train_results, test_results, summary

In [4]:
df1 = pd.read_csv('dataset/ObesityDataSet.csv')

In [5]:
y1 = df1['NObeyesdad']
X1 = df1.drop(columns='NObeyesdad')

X1 = OrdinalEncoder().fit_transform(X1, y1)

In [6]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=1/3, random_state=42)

In [7]:
model1 = DecisionTreeClassifier()
train_results1, test_results1 = train_and_evaluate_classifier(f"Decision Tree", model1, X1_train, y1_train, X1_test, y1_test, confusion=False, supress_outputs=False)

Decision Tree
training time 0.016 s
predict time 0.002 s
Confusion matrix: 
TRAIN - Accuracy: 1.0, AUC_ROC_OVO: 1.0, AUC_ROC_OVR: 1.0
TEST - Accuracy: 0.9190340909090909, AUC_ROC_OVO: 0.9525085961026324, AUC_ROC_OVR: 0.952578873844009


In [8]:
df2 = pd.read_csv('dataset/train.csv')
df2 = df2.drop(columns='id')

In [9]:
y2 = df2['NObeyesdad']
X2 = df2.drop(columns='NObeyesdad')

X2 = OrdinalEncoder().fit_transform(X2, y2)

In [10]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=1/3, random_state=42)

In [11]:
model2 = DecisionTreeClassifier()
train_results2, test_results2 = train_and_evaluate_classifier(f"Decision Tree", model2, X2_train, y2_train, X2_test, y2_test, confusion=False, supress_outputs=False)

Decision Tree
training time 0.134 s
predict time 0.005 s
Confusion matrix: 
TRAIN - Accuracy: 1.0, AUC_ROC_OVO: 1.0, AUC_ROC_OVR: 1.0
TEST - Accuracy: 0.846242774566474, AUC_ROC_OVO: 0.900865959362747, AUC_ROC_OVR: 0.9023843321248569


In [12]:
df_final = pd.concat([df1, df2], ignore_index=True)
df_final

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.000000,3.000000,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.000000,3.000000,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.000000,3.000000,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.000000,3.000000,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.000000,1.000000,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22864,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Public_Transportation,Obesity_Type_II
22865,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Insufficient_Weight
22866,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Public_Transportation,Obesity_Type_II
22867,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Automobile,Overweight_Level_II


In [13]:
df_final.iloc[20758:, :]

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
20758,Male,31.540751,1.750000,118.805937,yes,yes,2.145114,3.000000,Sometimes,no,2.000000,no,0.885633,1.668318,Sometimes,Automobile,Obesity_Type_II
20759,Male,19.000000,1.770000,79.000000,yes,yes,3.000000,3.000000,Sometimes,no,2.000000,no,1.000000,0.000000,Frequently,Public_Transportation,Overweight_Level_I
20760,Female,18.940930,1.746529,133.472641,yes,yes,3.000000,3.000000,Sometimes,no,2.868132,no,1.501754,0.825609,Sometimes,Public_Transportation,Obesity_Type_III
20761,Female,26.000000,1.600000,71.000000,yes,yes,3.000000,3.000000,Sometimes,no,3.000000,no,0.000000,0.000000,Sometimes,Automobile,Overweight_Level_I
20762,Female,17.504873,1.710948,50.000000,yes,yes,3.000000,3.671076,Sometimes,no,2.000000,no,1.304291,1.000000,Sometimes,Automobile,Insufficient_Weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22864,Male,25.137087,1.766626,114.187096,yes,yes,2.919584,3.000000,Sometimes,no,2.151809,no,1.330519,0.196680,Sometimes,Public_Transportation,Obesity_Type_II
22865,Male,18.000000,1.710000,50.000000,no,yes,3.000000,4.000000,Frequently,no,1.000000,no,2.000000,1.000000,Sometimes,Public_Transportation,Insufficient_Weight
22866,Male,20.101026,1.819557,105.580491,yes,yes,2.407817,3.000000,Sometimes,no,2.000000,no,1.158040,1.198439,no,Public_Transportation,Obesity_Type_II
22867,Male,33.852953,1.700000,83.520113,yes,yes,2.671238,1.971472,Sometimes,no,2.144838,no,0.000000,0.973834,no,Automobile,Overweight_Level_II


In [14]:
a = pd.DataFrame({'A':[0]}, index=[0])
b = pd.DataFrame({'A':[10]}, index=[2])

c = pd.concat([a, b], ignore_index=True)
c

Unnamed: 0,A
0,0
1,10


In [15]:
y3 = df_final['NObeyesdad']
X3 = df_final.drop(columns='NObeyesdad')

X3 = OrdinalEncoder().fit_transform(X3, y3)

In [16]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=1/4, random_state=42)

In [17]:
model3 = DecisionTreeClassifier()
train_results3, test_results3, summary = train_and_evaluate_classifierCV(f"Decision Tree", model3, X3_train, y3_train, X3_test, y3_test, confusion=False, supress_outputs=False)

Decision Tree
CV time 1.522 s


Unnamed: 0,mean,std
fit_time,0.132411,0.001050476
score_time,0.042523,0.001553175
test_accuracy,0.853653,0.00260676
train_accuracy,0.999898,5.830426e-05
test_roc_auc_ovo_weighted,0.910647,0.00143806
train_roc_auc_ovo_weighted,1.0,9.257741e-09
test_roc_auc_ovr_weighted,0.915637,0.001436604
train_roc_auc_ovr_weighted,1.0,7.639538e-09


predict time 0.006 s
Confusion matrix: 
TRAIN - Accuracy: 0.9998833887236895, AUC_ROC_OVO: 0.999999983416084, AUC_ROC_OVR: 0.9999999862646322
TEST - Accuracy: 0.8462749213011542, AUC_ROC_OVO: 0.9024580200686374, AUC_ROC_OVR: 0.9037519617059884


In [19]:
summary['mean']['test_accuracy']

0.8536528632354804

In [23]:
BaggedTrees = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10)
train_results4, test_results4 = train_and_evaluate_classifierCV(f"Bagged Decision Tree", BaggedTrees, X3_train, y3_train, X3_test, y3_test, confusion=False, supress_outputs=False)

Bagged Decision Tree
CV time 7.206 s


Unnamed: 0,mean,std
fit_time,0.970201,0.005011
score_time,0.062215,0.00106
test_accuracy,0.887995,0.003675
train_accuracy,0.992027,0.000833
test_roc_auc_ovo_weighted,0.975598,0.001321
train_roc_auc_ovo_weighted,0.999932,1.2e-05
test_roc_auc_ovr_weighted,0.977285,0.001219
train_roc_auc_ovr_weighted,0.99994,1e-05


predict time 0.063 s
Confusion matrix: 
TRAIN - Accuracy: 0.9910209317240978, AUC_ROC_OVO: 0.9999044757905234, AUC_ROC_OVR: 0.9999138512672869
TEST - Accuracy: 0.8894718433018538, AUC_ROC_OVO: 0.9741382898468122, AUC_ROC_OVR: 0.9752684318788672


In [26]:
RF = RandomForestClassifier()
train_resultsRF, test_resultsRF = train_and_evaluate_classifierCV(f"Random Forests", RF, X3_train, y3_train, X3_test, y3_test, confusion=False, supress_outputs=False)

Random Forests
CV time 23.398 s


Unnamed: 0,mean,std
fit_time,3.153356,0.02075104
score_time,0.184377,0.003051711
test_accuracy,0.904495,0.003689506
train_accuracy,0.999898,5.830426e-05
test_roc_auc_ovo_weighted,0.988201,0.0004203241
train_roc_auc_ovo_weighted,1.0,3.23234e-08
test_roc_auc_ovr_weighted,0.989358,0.0003747191
train_roc_auc_ovr_weighted,1.0,2.704175e-08


predict time 0.413 s
Confusion matrix: 
TRAIN - Accuracy: 0.9998833887236895, AUC_ROC_OVO: 0.9999999669113274, AUC_ROC_OVR: 0.9999999726588815
TEST - Accuracy: 0.9038125218607905, AUC_ROC_OVO: 0.9871561693830696, AUC_ROC_OVR: 0.9883031335521568


In [19]:
LR = LogisticRegression(max_iter = 5000)
train_resultsLR, test_resultsLR = train_and_evaluate_classifierCV(f"Logistic Regression", LR, X3_train, y3_train, X3_test, y3_test, confusion=False, supress_outputs=False)

Logistic Regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the doc

CV time 80.978 s


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,mean,std
fit_time,12.849807,0.112912
score_time,0.047837,0.000941
test_accuracy,0.841816,0.01043
train_accuracy,0.844003,0.003713
test_roc_auc_ovo_weighted,0.974543,0.002083
train_roc_auc_ovo_weighted,0.975134,0.000515
test_roc_auc_ovr_weighted,0.976977,0.001915
train_roc_auc_ovr_weighted,0.97751,0.000482


predict time 0.003 s
Confusion matrix: 
TRAIN - Accuracy: 0.8452568363360737, AUC_ROC_OVO: 0.9730940506050932, AUC_ROC_OVR: 0.9755941933612438
TEST - Accuracy: 0.8537950332284016, AUC_ROC_OVO: 0.9742166552589269, AUC_ROC_OVR: 0.9765679762483475


In [21]:
LDA = LinearDiscriminantAnalysis()
train_resultsLDA, test_resultsLDA = train_and_evaluate_classifierCV(f"LDA", LDA, X3_train, y3_train, X3_test, y3_test, confusion=False, supress_outputs=False)

LDA
CV time 1.134 s


Unnamed: 0,mean,std
fit_time,0.049439,0.000974
score_time,0.049934,0.000761
test_accuracy,0.78322,0.007476
train_accuracy,0.78529,0.001497
test_roc_auc_ovo_weighted,0.960863,0.001653
train_roc_auc_ovo_weighted,0.96137,0.000403
test_roc_auc_ovr_weighted,0.96455,0.00145
train_roc_auc_ovr_weighted,0.965007,0.00036


predict time 0.003 s
Confusion matrix: 
TRAIN - Accuracy: 0.7857850854177599, AUC_ROC_OVO: 0.957932212718097, AUC_ROC_OVR: 0.9619672014901136
TEST - Accuracy: 0.7885624344176285, AUC_ROC_OVO: 0.9578879345332195, AUC_ROC_OVR: 0.9618476011311505
