# Imports

In [129]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

from typing import Dict
import pickle
from IPython.display import display

In [2]:
# Show every column and the full content of each cell whenever a DataFrame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
# Optional tweaks, left disabled:
# pd.set_option("display.max_rows", None)
# pd.set_option("display.float_format", '{:.2f}'.format)

# Pre-process Data

In [12]:
# Load the raw export and replace its headers with descriptive snake_case names
# (the assignment is positional, so the file must hold exactly these 17 columns in this order).
df_raw = pd.read_csv("../data/ObesityDataSet_raw_and_data_sinthetic.csv")
df_raw.columns = [
    'gender',
    'age',
    'height',
    'weight',
    'family_history_with_overweight',
    'frequent_consumption_of_high_caloric_food',
    'frequency_of_consumption_of_vegetables',
    'number_of_main_meals',
    'consumption_of_food_between_meals',
    'smoke',
    'consumption_of_water_daily',
    'calories_consumption_monitoring',
    'physical_activity_frequency',
    'time_using_technology_devices',
    'consumption_of_alcohol',
    'transportation_used',
    'obesity',
]

# Columns that are rounded to the nearest whole number and stored as integers.
int_like_cols = [
    "age",
    "frequency_of_consumption_of_vegetables",
    "number_of_main_meals",
    "consumption_of_water_daily",
    "physical_activity_frequency",
    "time_using_technology_devices",
]

# df_raw stays untouched; all rounding happens on the df_proc copy.
df_proc = df_raw.copy()
for int_col in int_like_cols:
    df_proc[int_col] = df_proc[int_col].round(0).astype(int)
df_proc["height"] = df_proc["height"].round(2)
df_proc["weight"] = df_proc["weight"].round(1)

print(df_raw.shape, df_proc.shape)
# Alternative views, left disabled:
# display(df_proc.describe())
# display(df_proc.head(20))
# display(df_proc.tail(20))
display(df_proc)
# display(df_raw)

# Object-dtype columns are the categorical ones (string labels).
is_object_col = df_proc.dtypes == object
col_categorical = is_object_col[is_object_col].index
print(list(col_categorical))

(2111, 17) (2111, 17)


Unnamed: 0,gender,age,height,weight,family_history_with_overweight,frequent_consumption_of_high_caloric_food,frequency_of_consumption_of_vegetables,number_of_main_meals,consumption_of_food_between_meals,smoke,consumption_of_water_daily,calories_consumption_monitoring,physical_activity_frequency,time_using_technology_devices,consumption_of_alcohol,transportation_used,obesity
0,Female,21,1.62,64.0,yes,no,2,3,Sometimes,no,2,no,0,1,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3,3,Sometimes,yes,3,yes,3,0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.80,77.0,yes,no,2,3,Sometimes,no,2,no,2,1,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.80,87.0,no,no,3,3,Sometimes,no,2,no,2,0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2,1,Sometimes,no,2,no,0,0,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,21,1.71,131.4,yes,yes,3,3,Sometimes,no,2,no,2,1,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,22,1.75,133.7,yes,yes,3,3,Sometimes,no,2,no,1,1,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,23,1.75,133.7,yes,yes,3,3,Sometimes,no,2,no,1,1,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24,1.74,133.3,yes,yes,3,3,Sometimes,no,3,no,1,1,Sometimes,Public_Transportation,Obesity_Type_III


['gender', 'family_history_with_overweight', 'frequent_consumption_of_high_caloric_food', 'consumption_of_food_between_meals', 'smoke', 'calories_consumption_monitoring', 'consumption_of_alcohol', 'transportation_used', 'obesity']


In [13]:
# ordinal: consumption_of_food_between_meals, consumption_of_alcohol (obesity is ordinal too, but it is the target and is left as string labels)
# one hot: gender, family_history_with_overweight, frequent_consumption_of_high_caloric_food, smoke, calories_consumption_monitoring, transportation_used

def encode_input(df):
    """Encode the categorical survey columns of ``df`` as numeric features.

    The two frequency columns are mapped onto an ordinal 0-3 scale and the
    remaining categorical columns are one-hot encoded. The caller's frame is
    never modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``ordinal_cols`` and
        ``one_hot_cols`` below. Any other columns are passed through as-is.

    Returns
    -------
    pandas.DataFrame
        The pass-through and ordinal columns in their original order, followed
        by the one-hot columns named ``<column>_<category>``. The original
        one-hot source columns are removed.

    Notes
    -----
    * Ordinal values that are not keys of ``enc_freq`` become NaN
      (``Series.map`` behaviour).
    * One-hot columns are created only for the categories present in ``df``,
      so a single-row frame yields one dummy column per source column.
    """
    ordinal_cols = ['consumption_of_food_between_meals', 'consumption_of_alcohol']
    one_hot_cols = ['gender', 'family_history_with_overweight', 'frequent_consumption_of_high_caloric_food', 'smoke', 'calories_consumption_monitoring', 'transportation_used']

    # Work on a copy so the caller's frame keeps its original string labels.
    encoded = df.copy()

    # ordinal
    enc_freq = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
    for ordinal_col in ordinal_cols:
        encoded[ordinal_col] = encoded[ordinal_col].map(enc_freq)

    # one-hot: `columns=` encodes exactly the listed columns, drops the
    # originals and appends the dummies, without an index-on-index merge that
    # would multiply rows when the index contains duplicates.
    return pd.get_dummies(encoded, columns=one_hot_cols)

# Encode a copy so df_proc is left with its original string labels for later inspection.
df = encode_input(df_proc.copy())

In [51]:
# Before/after view of the encoding, followed by get_dummies applied to a single
# row: only the categories present in that row get a column.
display(
    df_proc.head(10),
    df.head(10),
    pd.get_dummies(df_proc.head(1)),
)

Unnamed: 0,gender,age,height,weight,family_history_with_overweight,frequent_consumption_of_high_caloric_food,frequency_of_consumption_of_vegetables,number_of_main_meals,consumption_of_food_between_meals,smoke,consumption_of_water_daily,calories_consumption_monitoring,physical_activity_frequency,time_using_technology_devices,consumption_of_alcohol,transportation_used,obesity
0,Female,21,1.62,64.0,yes,no,2,3,Sometimes,no,2,no,0,1,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3,3,Sometimes,yes,3,yes,3,0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2,3,Sometimes,no,2,no,2,1,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3,3,Sometimes,no,2,no,2,0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2,1,Sometimes,no,2,no,0,0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29,1.62,53.0,no,yes,2,3,Sometimes,no,2,no,0,0,Sometimes,Automobile,Normal_Weight
6,Female,23,1.5,55.0,yes,yes,3,3,Sometimes,no,2,no,1,0,Sometimes,Motorbike,Normal_Weight
7,Male,22,1.64,53.0,no,no,2,3,Sometimes,no,2,no,3,0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24,1.78,64.0,yes,yes,3,3,Sometimes,no,2,no,1,1,Frequently,Public_Transportation,Normal_Weight
9,Male,22,1.72,68.0,yes,yes,2,3,Sometimes,no,2,no,1,1,no,Public_Transportation,Normal_Weight


Unnamed: 0,age,height,weight,frequency_of_consumption_of_vegetables,number_of_main_meals,consumption_of_food_between_meals,consumption_of_water_daily,physical_activity_frequency,time_using_technology_devices,consumption_of_alcohol,obesity,gender_Female,gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,frequent_consumption_of_high_caloric_food_no,frequent_consumption_of_high_caloric_food_yes,smoke_no,smoke_yes,calories_consumption_monitoring_no,calories_consumption_monitoring_yes,transportation_used_Automobile,transportation_used_Bike,transportation_used_Motorbike,transportation_used_Public_Transportation,transportation_used_Walking
0,21,1.62,64.0,2,3,1,2,0,1,0,Normal_Weight,1,0,0,1,1,0,1,0,1,0,0,0,0,1,0
1,21,1.52,56.0,3,3,1,3,3,0,1,Normal_Weight,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0
2,23,1.8,77.0,2,3,1,2,2,1,2,Normal_Weight,0,1,0,1,1,0,1,0,1,0,0,0,0,1,0
3,27,1.8,87.0,3,3,1,2,2,0,2,Overweight_Level_I,0,1,1,0,1,0,1,0,1,0,0,0,0,0,1
4,22,1.78,89.8,2,1,1,2,0,0,1,Overweight_Level_II,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0
5,29,1.62,53.0,2,3,1,2,0,0,1,Normal_Weight,0,1,1,0,0,1,1,0,1,0,1,0,0,0,0
6,23,1.5,55.0,3,3,1,2,1,0,1,Normal_Weight,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0
7,22,1.64,53.0,2,3,1,2,3,0,1,Normal_Weight,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0
8,24,1.78,64.0,3,3,1,2,1,1,2,Normal_Weight,0,1,0,1,0,1,1,0,1,0,0,0,0,1,0
9,22,1.72,68.0,2,3,1,2,1,1,0,Normal_Weight,0,1,0,1,0,1,1,0,1,0,0,0,0,1,0


Unnamed: 0,age,height,weight,frequency_of_consumption_of_vegetables,number_of_main_meals,consumption_of_water_daily,physical_activity_frequency,time_using_technology_devices,gender_Female,family_history_with_overweight_yes,frequent_consumption_of_high_caloric_food_no,consumption_of_food_between_meals_Sometimes,smoke_no,calories_consumption_monitoring_no,consumption_of_alcohol_no,transportation_used_Public_Transportation,obesity_Normal_Weight
0,21,1.62,64.0,2,3,2,0,1,1,1,1,1,1,1,1,1,1


In [54]:
# Separate the target label from the encoded feature matrix.
y = df['obesity']
x = df.drop(columns=['obesity'])
display(x.head(), y.head())

# Column order of the feature matrix the classifier is trained on.
feature_names = list(x.columns)
print(feature_names)

Unnamed: 0,age,height,weight,frequency_of_consumption_of_vegetables,number_of_main_meals,consumption_of_food_between_meals,consumption_of_water_daily,physical_activity_frequency,time_using_technology_devices,consumption_of_alcohol,gender_Female,gender_Male,family_history_with_overweight_no,family_history_with_overweight_yes,frequent_consumption_of_high_caloric_food_no,frequent_consumption_of_high_caloric_food_yes,smoke_no,smoke_yes,calories_consumption_monitoring_no,calories_consumption_monitoring_yes,transportation_used_Automobile,transportation_used_Bike,transportation_used_Motorbike,transportation_used_Public_Transportation,transportation_used_Walking
0,21,1.62,64.0,2,3,1,2,0,1,0,1,0,0,1,1,0,1,0,1,0,0,0,0,1,0
1,21,1.52,56.0,3,3,1,3,3,0,1,1,0,0,1,1,0,0,1,0,1,0,0,0,1,0
2,23,1.8,77.0,2,3,1,2,2,1,2,0,1,0,1,1,0,1,0,1,0,0,0,0,1,0
3,27,1.8,87.0,3,3,1,2,2,0,2,0,1,1,0,1,0,1,0,1,0,0,0,0,0,1
4,22,1.78,89.8,2,1,1,2,0,0,1,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0


0          Normal_Weight
1          Normal_Weight
2          Normal_Weight
3     Overweight_Level_I
4    Overweight_Level_II
Name: obesity, dtype: object

['age', 'height', 'weight', 'frequency_of_consumption_of_vegetables', 'number_of_main_meals', 'consumption_of_food_between_meals', 'consumption_of_water_daily', 'physical_activity_frequency', 'time_using_technology_devices', 'consumption_of_alcohol', 'gender_Female', 'gender_Male', 'family_history_with_overweight_no', 'family_history_with_overweight_yes', 'frequent_consumption_of_high_caloric_food_no', 'frequent_consumption_of_high_caloric_food_yes', 'smoke_no', 'smoke_yes', 'calories_consumption_monitoring_no', 'calories_consumption_monitoring_yes', 'transportation_used_Automobile', 'transportation_used_Bike', 'transportation_used_Motorbike', 'transportation_used_Public_Transportation', 'transportation_used_Walking']


In [45]:
# 80/20 split, stratified on the label so both parts keep the same class mix.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [49]:
# Shapes of the four split parts, then the class counts of each label vector.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))
display(y_train.value_counts())
display(y_test.value_counts())

(1688, 25) (423, 25) (1688,) (423,)


Obesity_Type_I         281
Obesity_Type_III       259
Obesity_Type_II        237
Overweight_Level_I     232
Overweight_Level_II    232
Normal_Weight          229
Insufficient_Weight    218
Name: obesity, dtype: int64

Obesity_Type_I         70
Obesity_Type_III       65
Obesity_Type_II        60
Normal_Weight          58
Overweight_Level_II    58
Overweight_Level_I     58
Insufficient_Weight    54
Name: obesity, dtype: int64

# Train Model

In [56]:
# Fit the gradient-boosting classifier on the training split.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
clf.fit(x_train, y_train)

# Mean accuracy on the held-out split (shown as the cell output).
clf.score(x_test, y_test)

0.9621749408983451

# Export

In [None]:
# Switch the working directory to ../app so the relative '.model/...' paths in the following cells resolve there.
%cd ../app

/home/alvinchiew/git_repos/obesity-ai-service/app


In [143]:
# Serialise the fitted classifier. The path is relative to the current working directory.
pathname = '.model/obesity_predictor_gboost.pkl'
# The context manager flushes and closes the file even if pickling fails.
with open(pathname, 'wb') as model_file:
    pickle.dump(clf, model_file)

# Test

In [None]:
# Reload the exported classifier. pickle.load executes code embedded in the file,
# so only ever load a pickle from a trusted source.
with open('.model/obesity_predictor_gboost.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)

# Feature columns, in order, that every model input frame must provide.
FEATURE_NAMES = [
    'age',
    'height',
    'weight',
    'frequency_of_consumption_of_vegetables',
    'number_of_main_meals',
    'consumption_of_food_between_meals',
    'consumption_of_water_daily',
    'physical_activity_frequency',
    'time_using_technology_devices',
    'consumption_of_alcohol',
    'gender_Female',
    'gender_Male',
    'family_history_with_overweight_no',
    'family_history_with_overweight_yes',
    'frequent_consumption_of_high_caloric_food_no',
    'frequent_consumption_of_high_caloric_food_yes',
    'smoke_no',
    'smoke_yes',
    'calories_consumption_monitoring_no',
    'calories_consumption_monitoring_yes',
    'transportation_used_Automobile',
    'transportation_used_Bike',
    'transportation_used_Motorbike',
    'transportation_used_Public_Transportation',
    'transportation_used_Walking',
]

def data_to_input(data: Dict):
    """Turn one raw record into a single-row model input frame.

    Parameters
    ----------
    data : Dict
        Raw field name -> value for one record. It must include the
        categorical columns that ``encode_input`` expects.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) whose columns are exactly ``FEATURE_NAMES``, in that
        order. Features that the encoded record does not produce (one-hot
        columns of categories it does not use) are set to 0. Encoded fields
        that are not in ``FEATURE_NAMES`` are dropped.
    """
    record_frame = pd.DataFrame(data=[data.values()], index=[0], columns=data.keys())
    encoded_record = encode_input(record_frame).squeeze().to_dict()

    # A single record only yields the dummy columns of its own categories,
    # so every other expected feature falls back to 0.
    aligned = {name: encoded_record.get(name, 0) for name in FEATURE_NAMES}

    return pd.DataFrame(data=[aligned.values()], index=[0], columns=aligned.keys())


In [181]:
# First occurrence of each distinct label, then five random test rows for two of the classes.
display(df['obesity'].drop_duplicates())
for label in ('Normal_Weight', 'Obesity_Type_I'):
    display(y_test[y_test == label].sample(5))

0            Normal_Weight
3       Overweight_Level_I
4      Overweight_Level_II
10          Obesity_Type_I
59     Insufficient_Weight
68         Obesity_Type_II
202       Obesity_Type_III
Name: obesity, dtype: object

208    Normal_Weight
386    Normal_Weight
399    Normal_Weight
44     Normal_Weight
343    Normal_Weight
Name: obesity, dtype: object

1414    Obesity_Type_I
1378    Obesity_Type_I
1287    Obesity_Type_I
186     Obesity_Type_I
1498    Obesity_Type_I
Name: obesity, dtype: object

In [185]:
idx = 53  # Normal_Weight

# The record comes from df_raw, i.e. before the rounding applied to df_proc.
data = df_raw.loc[idx].drop(labels='obesity').to_dict()
df_input = data_to_input(data)

# Raw record, its encoded feature vector, and the predicted class.
display(data, df_input.squeeze())
print(clf.predict(df_input))

{'gender': 'Female',
 'age': 23.0,
 'height': 1.63,
 'weight': 55.0,
 'family_history_with_overweight': 'yes',
 'frequent_consumption_of_high_caloric_food': 'no',
 'frequency_of_consumption_of_vegetables': 3.0,
 'number_of_main_meals': 3.0,
 'consumption_of_food_between_meals': 'no',
 'smoke': 'no',
 'consumption_of_water_daily': 2.0,
 'calories_consumption_monitoring': 'yes',
 'physical_activity_frequency': 2.0,
 'time_using_technology_devices': 1.0,
 'consumption_of_alcohol': 'no',
 'transportation_used': 'Public_Transportation'}

age                                              23.00
height                                            1.63
weight                                           55.00
frequency_of_consumption_of_vegetables            3.00
number_of_main_meals                              3.00
consumption_of_food_between_meals                 0.00
consumption_of_water_daily                        2.00
physical_activity_frequency                       2.00
time_using_technology_devices                     1.00
consumption_of_alcohol                            0.00
gender_Female                                     1.00
gender_Male                                       0.00
family_history_with_overweight_no                 0.00
family_history_with_overweight_yes                1.00
frequent_consumption_of_high_caloric_food_no      1.00
frequent_consumption_of_high_caloric_food_yes     0.00
smoke_no                                          1.00
smoke_yes                                         0.00
calories_c

['Normal_Weight']


In [186]:
idx = 186  # Obesity_Type_I

# The record comes from df_raw, i.e. before the rounding applied to df_proc.
data = df_raw.loc[idx].drop(labels='obesity').to_dict()
df_input = data_to_input(data)

# Raw record, its encoded feature vector, and the predicted class.
display(data, df_input.squeeze())
print(clf.predict(df_input))

{'gender': 'Male',
 'age': 39.0,
 'height': 1.78,
 'weight': 96.0,
 'family_history_with_overweight': 'yes',
 'frequent_consumption_of_high_caloric_food': 'no',
 'frequency_of_consumption_of_vegetables': 2.0,
 'number_of_main_meals': 3.0,
 'consumption_of_food_between_meals': 'Sometimes',
 'smoke': 'no',
 'consumption_of_water_daily': 3.0,
 'calories_consumption_monitoring': 'no',
 'physical_activity_frequency': 1.0,
 'time_using_technology_devices': 0.0,
 'consumption_of_alcohol': 'Frequently',
 'transportation_used': 'Automobile'}

age                                              39.00
height                                            1.78
weight                                           96.00
frequency_of_consumption_of_vegetables            2.00
number_of_main_meals                              3.00
consumption_of_food_between_meals                 1.00
consumption_of_water_daily                        3.00
physical_activity_frequency                       1.00
time_using_technology_devices                     0.00
consumption_of_alcohol                            2.00
gender_Female                                     0.00
gender_Male                                       1.00
family_history_with_overweight_no                 0.00
family_history_with_overweight_yes                1.00
frequent_consumption_of_high_caloric_food_no      1.00
frequent_consumption_of_high_caloric_food_yes     0.00
smoke_no                                          1.00
smoke_yes                                         0.00
calories_c

['Obesity_Type_I']


In [188]:
# Raw field names held by the sample record (every renamed column except the dropped 'obesity' target).
data.keys()

dict_keys(['gender', 'age', 'height', 'weight', 'family_history_with_overweight', 'frequent_consumption_of_high_caloric_food', 'frequency_of_consumption_of_vegetables', 'number_of_main_meals', 'consumption_of_food_between_meals', 'smoke', 'consumption_of_water_daily', 'calories_consumption_monitoring', 'physical_activity_frequency', 'time_using_technology_devices', 'consumption_of_alcohol', 'transportation_used'])

# Future Work

- Perform a feature-importance analysis to eliminate low-impact features

In [191]:
# Names of the object-dtype (string-valued) columns of the raw frame.
col_categorical = df_raw.dtypes.loc[lambda dtypes: dtypes == object].index
print(list(col_categorical))

['gender', 'family_history_with_overweight', 'frequent_consumption_of_high_caloric_food', 'consumption_of_food_between_meals', 'smoke', 'calories_consumption_monitoring', 'consumption_of_alcohol', 'transportation_used', 'obesity']


In [200]:
# Distinct values, in order of first appearance, of each categorical input column.
# The 'obesity' target is skipped.
valid_options = {
    col: list(df_raw[col].drop_duplicates().values)
    for col in col_categorical
    if col != "obesity"
}

print(valid_options)

{'gender': ['Female', 'Male'], 'family_history_with_overweight': ['yes', 'no'], 'frequent_consumption_of_high_caloric_food': ['no', 'yes'], 'consumption_of_food_between_meals': ['Sometimes', 'Frequently', 'Always', 'no'], 'smoke': ['no', 'yes'], 'calories_consumption_monitoring': ['no', 'yes'], 'consumption_of_alcohol': ['no', 'Sometimes', 'Frequently', 'Always'], 'transportation_used': ['Public_Transportation', 'Walking', 'Automobile', 'Motorbike', 'Bike']}
