In [19]:
import warnings
warnings.filterwarnings("ignore")

import streamlit as st

import joblib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import set_config

from imblearn.pipeline import make_pipeline
import imblearn


import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import Booster

st.title('Obesity_classifier')

# Set global config for scikit-learn
set_config(display='diagram')

X_train = pd.read_csv("data/cleaned_train_obesity")
X_train = X_train.drop(columns="Unnamed: 0")

y_train = pd.read_csv("data/y_train_obesity")
y_train = y_train.drop(columns="Unnamed: 0")

X_test = pd.read_csv("data/cleaned_test_obesity")
X_test = X_test.drop(columns="Unnamed: 0")

y_test = pd.read_csv("data/y_test_obesity")
y_test = y_test.drop(columns="Unnamed: 0")


pipeline = make_pipeline(
    StandardScaler(),
    xgb.XGBClassifier())

# Train the pipeline (tranformations & predictor)
pipeline.fit(X_train, y_train)

pipe_grid = {
    'xgbclassifier__learning_rate': [0.1],
    'xgbclassifier__n_estimators': [200],
    'xgbclassifier__max_depth': [7],
    'xgbclassifier__min_child_weight': [1],
    'xgbclassifier__gamma': [0],
    'xgbclassifier__subsample': [0.8],
    'xgbclassifier__colsample_bytree': [0.6]
}

second_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=pipe_grid,
    verbose=2,
    n_jobs=-1,
    cv=5
)

second_pipe.fit(X_train, y_train)
display(second_pipe.best_params_)
display(second_pipe.best_score_)
display(second_pipe.best_estimator_)
best_model = second_pipe.best_estimator_
best_model.fit(X_train,y_train)
y_pred = best_model.predict(X_test)

feat_imp = best_model["xgbclassifier"].feature_importances_
feat_imp_series = pd.Series(feat_imp, 
    index = X_train.columns).sort_values(
    ascending = False)
display(feat_imp)
display(X_train.columns)

print("accuracy on test", best_model.score(X_test, y_test))
print("recall = ", recall_score(y_test, y_pred, average="macro"))
print("f1_score = ", f1_score(y_test, y_pred, average="macro"))

dtrain = xgb.DMatrix(X_train, label=y_train)
bst = xgb.train(pipe_grid, dtrain)
bst.save_model('data/model_obesity.model')

display(confusion_matrix(y_test,y_pred));

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__gamma=0, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1, xgbclassifier__n_estimators=200, xgbclassifier__subsample=0.8; total time=   3.2s
[CV] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__gamma=0, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1, xgbclassifier__n_estimators=200, xgbclassifier__subsample=0.8; total time=   3.2s
[CV] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__gamma=0, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1, xgbclassifier__n_estimators=200, xgbclassifier__subsample=0.8; total time=   3.2s
[CV] END xgbclassifier__colsample_bytree=0.6, xgbclassifier__gamma=0, xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=7, xgbclassifier__min_child_weight=1, xgbclassifier__n_estimators

{'xgbclassifier__colsample_bytree': 0.6,
 'xgbclassifier__gamma': 0,
 'xgbclassifier__learning_rate': 0.1,
 'xgbclassifier__max_depth': 7,
 'xgbclassifier__min_child_weight': 1,
 'xgbclassifier__n_estimators': 200,
 'xgbclassifier__subsample': 0.8}

0.7950169437957614

array([0.11432027, 0.06272369, 0.12640324, 0.07093421, 0.07636472,
       0.06553411, 0.11182186, 0.04112321, 0.03709768, 0.0663784 ,
       0.04186971, 0.04007898, 0.07396463, 0.07138526], dtype=float32)

Index(['Gender', 'Age', 'family_history_with_overweight', 'FAVC', 'FCVC',
       'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS'],
      dtype='object')

accuracy on test 0.8037825059101655
recall =  0.8027733168127257
f1_score =  0.7997581009802529
Parameters: { "xgbclassifier__colsample_bytree", "xgbclassifier__gamma", "xgbclassifier__learning_rate", "xgbclassifier__max_depth", "xgbclassifier__min_child_weight", "xgbclassifier__n_estimators", "xgbclassifier__subsample" } are not used.



array([[49,  2,  1,  0,  2,  0,  0],
       [ 7, 34,  7,  5,  5,  0,  0],
       [ 1,  8, 40,  5,  3,  1,  0],
       [ 1,  3,  3, 46,  1,  4,  0],
       [ 0,  1,  7,  4, 53,  4,  1],
       [ 0,  2,  1,  1,  1, 55,  0],
       [ 0,  1,  0,  1,  0,  0, 63]])

In [41]:
import warnings
warnings.filterwarnings("ignore")

import streamlit as st

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from xgboost import XGBClassifier

X_train = pd.read_csv("data/cleaned_train_obesity")
X_train = X_train.drop(columns="Unnamed: 0")

y_train = pd.read_csv("data/y_train_obesity")
y_train = y_train.drop(columns="Unnamed: 0")

X_test = pd.read_csv("data/cleaned_test_obesity")
X_test = X_test.drop(columns="Unnamed: 0")

y_test = pd.read_csv("data/y_test_obesity")
y_test = y_test.drop(columns="Unnamed: 0")


pipeline = Pipeline([("scaler", StandardScaler()),
    ("xgb", XGBClassifier(learning_rate=.1, n_estimators=200, max_depth=7, min_chil_weight=1, gamma=0, subsample=.8, colsample_bytree=.6))])

# Train the pipeline (tranformations & predictor)


best_model = pipeline.fit(X_train, y_train)
best_model['xgb'].save_model('data/model_obesity_alex.model')
display(best_model)
y_pred = best_model.predict(X_test)

print("accuracy on test", best_model.score(X_test, y_test))
print("recall = ", recall_score(y_test, y_pred, average="macro"))
print("f1_score = ", f1_score(y_test, y_pred, average="macro"))

# dtrain = xgb.DMatrix(X_train, label=y_train)
# bst = xgb.train(pipe_grid, dtrain)
# bst.save_model('data/model_obesity.model')

# display(confusion_matrix(y_test,y_pred));

Parameters: { "min_chil_weight" } are not used.



accuracy on test 0.8181818181818182
recall =  0.8214285714285714
f1_score =  0.8091836734693878


In [42]:
X_train = pd.read_csv('data/cleaned_train_obesity', index_col=0)
X_train

Unnamed: 0,Gender,Age,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
151,0,3.401197,1,1,2.0,3.0,1.0,0,2.0,0,2.0,0.0,1.0,0
715,1,2.833213,1,1,1.0,3.0,1.0,0,2.0,0,3.0,1.0,0.0,0
184,1,3.044522,0,1,1.0,1.0,0.0,0,3.0,0,1.0,0.0,1.0,3
691,1,2.890372,0,1,2.0,3.0,1.0,0,2.0,0,1.0,1.0,1.0,3
2056,0,3.258097,1,1,2.0,3.0,1.0,0,3.0,0,0.0,0.0,1.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,1,2.944439,1,1,1.0,4.0,1.0,0,3.0,0,2.0,2.0,0.0,0
1057,1,3.178054,1,1,1.0,3.0,1.0,0,3.0,0,0.0,1.0,1.0,3
992,0,3.044522,1,0,1.0,3.0,1.0,0,2.0,0,1.0,1.0,0.0,3
1465,1,3.091042,1,1,1.0,3.0,1.0,0,3.0,0,0.0,1.0,1.0,3
