In [94]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer

In [107]:
# Load the data
obesity_train_raw = pd.read_csv('data/obesity_train.csv')
obesity_test_raw = pd.read_csv('data/obesity_test.csv')

In [108]:
obesity_train_raw

Unnamed: 0,id,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,marrital_status,meals_perday,...,parent_overweight,physical_activity_perweek,region,siblings,smoke,transportation,veggies_freq,water_daily,weight,obese_level
0,1,21.0,Never,no,up to 5,Sometimes,Female,1.62,,3.0,...,yes,,LatAm,3.0,no,Public,Sometimes,1 to 2,64.0,Normal_Weight
1,2,23.0,Frequently,no,up to 5,Sometimes,Male,1.80,,3.0,...,yes,3 to 4,LatAm,0.0,no,Public,Sometimes,1 to 2,77.0,Normal_Weight
2,3,,Frequently,no,up to 2,Sometimes,Male,1.80,,3.0,...,no,3 to 4,LatAm,2.0,no,Walk,Always,1 to 2,87.0,Overweight_Level_I
3,4,22.0,Sometimes,no,up to 2,Sometimes,Male,1.78,,1.0,...,no,,LatAm,3.0,no,Public,Sometimes,1 to 2,90.0,Overweight_Level_II
4,5,22.0,Sometimes,no,up to 2,Sometimes,Male,1.64,,3.0,...,no,5 or more,LatAm,3.0,no,Public,Sometimes,1 to 2,53.0,Normal_Weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,1607,21.0,Sometimes,,up to 5,Sometimes,Female,1.73,,3.0,...,yes,3 to 4,LatAm,1.0,no,Public,Always,1 to 2,131.0,Obesity_Type_III
1607,1608,22.0,Sometimes,yes,up to 5,Sometimes,Female,1.75,,3.0,...,yes,1 to 2,LatAm,0.0,no,,Always,1 to 2,134.0,Obesity_Type_III
1608,1609,23.0,Sometimes,yes,up to 5,Sometimes,Female,1.75,,3.0,...,yes,1 to 2,LatAm,0.0,no,Public,Always,1 to 2,134.0,Obesity_Type_III
1609,1610,24.0,Sometimes,yes,up to 5,Sometimes,Female,1.74,,3.0,...,yes,1 to 2,LatAm,0.0,no,Public,Always,more than 2,133.0,Obesity_Type_III


In [119]:
# Drop the 'marrital_status' and 'region' columns from the dataset
obesity_train = obesity_train_raw.drop(columns=['marrital_status', 'region'])
obesity_test = obesity_test_raw.drop(columns=['marrital_status', 'region'])

In [121]:
obesity_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1611 non-null   int64  
 1   age                        1545 non-null   float64
 2   alcohol_freq               1575 non-null   object 
 3   caloric_freq               1591 non-null   object 
 4   devices_perday             1589 non-null   object 
 5   eat_between_meals          1552 non-null   object 
 6   gender                     1591 non-null   object 
 7   height                     1597 non-null   float64
 8   meals_perday               1602 non-null   float64
 9   monitor_calories           1572 non-null   object 
 10  parent_overweight          1591 non-null   object 
 11  physical_activity_perweek  1046 non-null   object 
 12  siblings                   1599 non-null   float64
 13  smoke                      1599 non-null   objec

In [111]:
dummy_columns = ['alcohol_freq','caloric_freq','devices_perday','eat_between_meals','gender',
                 'monitor_calories','parent_overweight','physical_activity_perweek','smoke','transportation',
                 'veggies_freq','water_daily']
numeric_columns = ['age','height','meals_perday','weight','siblings']

In [122]:
obesity_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1611 non-null   int64  
 1   age                        1545 non-null   float64
 2   alcohol_freq               1575 non-null   object 
 3   caloric_freq               1591 non-null   object 
 4   devices_perday             1589 non-null   object 
 5   eat_between_meals          1552 non-null   object 
 6   gender                     1591 non-null   object 
 7   height                     1597 non-null   float64
 8   meals_perday               1602 non-null   float64
 9   monitor_calories           1572 non-null   object 
 10  parent_overweight          1591 non-null   object 
 11  physical_activity_perweek  1046 non-null   object 
 12  siblings                   1599 non-null   float64
 13  smoke                      1599 non-null   objec

In [113]:
def imputer(data_train, data_val, columns, strategy):
    """
    """
    # Impute 
    imputer = SimpleImputer(strategy=strategy)
    for column in columns:
        imputer = imputer.fit(data_train[[column]])
        data_train[[column]] = imputer.transform(data_train[[column]])
        data_val[[column]] = imputer.transform(data_val[[column]])
    
    return data_train, data_val


In [114]:
obesity_train_imp, obesity_test_imp = imputer(obesity_train, obesity_test, dummy_columns, strategy='most_frequent')
obesity_train_imp, obesity_test_imp = imputer(obesity_train, obesity_test, numeric_columns, strategy='median')

In [115]:
obesity_train_imp.info()
obesity_train_imp

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1611 non-null   int64  
 1   age                        1611 non-null   float64
 2   alcohol_freq               1611 non-null   object 
 3   caloric_freq               1611 non-null   object 
 4   devices_perday             1611 non-null   object 
 5   eat_between_meals          1611 non-null   object 
 6   gender                     1611 non-null   object 
 7   height                     1611 non-null   float64
 8   meals_perday               1611 non-null   float64
 9   monitor_calories           1611 non-null   object 
 10  parent_overweight          1611 non-null   object 
 11  physical_activity_perweek  1611 non-null   object 
 12  siblings                   1611 non-null   float64
 13  smoke                      1611 non-null   objec

Unnamed: 0,id,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight,obese_level
0,1,21.0,Never,no,up to 5,Sometimes,Female,1.62,3.0,no,yes,1 to 2,3.0,no,Public,Sometimes,1 to 2,64.0,Normal_Weight
1,2,23.0,Frequently,no,up to 5,Sometimes,Male,1.80,3.0,no,yes,3 to 4,0.0,no,Public,Sometimes,1 to 2,77.0,Normal_Weight
2,3,23.0,Frequently,no,up to 2,Sometimes,Male,1.80,3.0,no,no,3 to 4,2.0,no,Walk,Always,1 to 2,87.0,Overweight_Level_I
3,4,22.0,Sometimes,no,up to 2,Sometimes,Male,1.78,1.0,no,no,1 to 2,3.0,no,Public,Sometimes,1 to 2,90.0,Overweight_Level_II
4,5,22.0,Sometimes,no,up to 2,Sometimes,Male,1.64,3.0,no,no,5 or more,3.0,no,Public,Sometimes,1 to 2,53.0,Normal_Weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,1607,21.0,Sometimes,yes,up to 5,Sometimes,Female,1.73,3.0,no,yes,3 to 4,1.0,no,Public,Always,1 to 2,131.0,Obesity_Type_III
1607,1608,22.0,Sometimes,yes,up to 5,Sometimes,Female,1.75,3.0,no,yes,1 to 2,0.0,no,Public,Always,1 to 2,134.0,Obesity_Type_III
1608,1609,23.0,Sometimes,yes,up to 5,Sometimes,Female,1.75,3.0,no,yes,1 to 2,0.0,no,Public,Always,1 to 2,134.0,Obesity_Type_III
1609,1610,24.0,Sometimes,yes,up to 5,Sometimes,Female,1.74,3.0,no,yes,1 to 2,0.0,no,Public,Always,more than 2,133.0,Obesity_Type_III


In [123]:
obesity_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1611 non-null   int64  
 1   age                        1545 non-null   float64
 2   alcohol_freq               1575 non-null   object 
 3   caloric_freq               1591 non-null   object 
 4   devices_perday             1589 non-null   object 
 5   eat_between_meals          1552 non-null   object 
 6   gender                     1591 non-null   object 
 7   height                     1597 non-null   float64
 8   meals_perday               1602 non-null   float64
 9   monitor_calories           1572 non-null   object 
 10  parent_overweight          1591 non-null   object 
 11  physical_activity_perweek  1046 non-null   object 
 12  siblings                   1599 non-null   float64
 13  smoke                      1599 non-null   objec

In [135]:
obesity_train.drop(columns=['obese_level'], inplace=True)

In [136]:
obesity_train_label_enc = obesity_train.copy()

for column in dummy_columns:
    le = OrdinalEncoder(encoded_missing_value=np.nan)
    obesity_train_label_enc[[column]] = le.fit_transform(obesity_train_label_enc[[column]])

obesity_train_label_enc

Unnamed: 0,id,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight
0,1,21.0,2.0,0.0,2.0,3.0,0.0,1.62,3.0,0.0,1.0,,3.0,0.0,3.0,2.0,0.0,64.0
1,2,23.0,1.0,0.0,2.0,3.0,1.0,1.80,3.0,0.0,1.0,1.0,0.0,0.0,3.0,2.0,0.0,77.0
2,3,,1.0,0.0,1.0,3.0,1.0,1.80,3.0,0.0,0.0,1.0,2.0,0.0,4.0,0.0,0.0,87.0
3,4,22.0,3.0,0.0,1.0,3.0,1.0,1.78,1.0,0.0,0.0,,3.0,0.0,3.0,2.0,0.0,90.0
4,5,22.0,3.0,0.0,1.0,3.0,1.0,1.64,3.0,0.0,0.0,2.0,3.0,0.0,3.0,2.0,0.0,53.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,1607,21.0,3.0,,2.0,3.0,0.0,1.73,3.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,131.0
1607,1608,22.0,3.0,1.0,2.0,3.0,0.0,1.75,3.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,134.0
1608,1609,23.0,3.0,1.0,2.0,3.0,0.0,1.75,3.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,134.0
1609,1610,24.0,3.0,1.0,2.0,3.0,0.0,1.74,3.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,2.0,133.0


In [137]:
# Apply KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)
obesity_train_label_enc_imputed = pd.DataFrame(knn_imputer.fit_transform(obesity_train_label_enc), columns=obesity_train_label_enc.columns)

obesity_train_label_enc_imputed

Unnamed: 0,id,age,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,height,meals_perday,monitor_calories,parent_overweight,physical_activity_perweek,siblings,smoke,transportation,veggies_freq,water_daily,weight
0,1.0,21.0,2.0,0.0,2.0,3.0,0.0,1.62,3.0,0.0,1.0,0.8,3.0,0.0,3.0,2.0,0.0,64.0
1,2.0,23.0,1.0,0.0,2.0,3.0,1.0,1.80,3.0,0.0,1.0,1.0,0.0,0.0,3.0,2.0,0.0,77.0
2,3.0,28.0,1.0,0.0,1.0,3.0,1.0,1.80,3.0,0.0,0.0,1.0,2.0,0.0,4.0,0.0,0.0,87.0
3,4.0,22.0,3.0,0.0,1.0,3.0,1.0,1.78,1.0,0.0,0.0,1.0,3.0,0.0,3.0,2.0,0.0,90.0
4,5.0,22.0,3.0,0.0,1.0,3.0,1.0,1.64,3.0,0.0,0.0,2.0,3.0,0.0,3.0,2.0,0.0,53.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,1607.0,21.0,3.0,1.0,2.0,3.0,0.0,1.73,3.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,131.0
1607,1608.0,22.0,3.0,1.0,2.0,3.0,0.0,1.75,3.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,134.0
1608,1609.0,23.0,3.0,1.0,2.0,3.0,0.0,1.75,3.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,134.0
1609,1610.0,24.0,3.0,1.0,2.0,3.0,0.0,1.74,3.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,2.0,133.0


In [131]:
# Inverse transform to original labels
for column in dummy_columns:
    le = OrdinalEncoder(encoded_missing_value=np.nan)
    obesity_train_label_enc_imputed[[column]] = le.fit(obesity_train[[column]]).inverse_transform(obesity_train_label_enc_imputed[[column]])

In [132]:
obesity_train_label_enc_imputed

Unnamed: 0,alcohol_freq,caloric_freq,devices_perday,eat_between_meals,gender,monitor_calories,parent_overweight,physical_activity_perweek,smoke,transportation,veggies_freq,water_daily
0,Never,no,up to 5,Sometimes,Female,no,yes,1 to 2,no,Public,Sometimes,1 to 2
1,Frequently,no,up to 5,Sometimes,Male,no,yes,3 to 4,no,Public,Sometimes,1 to 2
2,Frequently,no,up to 2,Sometimes,Male,no,no,3 to 4,no,Walk,Always,1 to 2
3,Sometimes,no,up to 2,Sometimes,Male,no,no,1 to 2,no,Public,Sometimes,1 to 2
4,Sometimes,no,up to 2,Sometimes,Male,no,no,5 or more,no,Public,Sometimes,1 to 2
...,...,...,...,...,...,...,...,...,...,...,...,...
1606,Sometimes,yes,up to 5,Sometimes,Female,no,yes,3 to 4,no,Public,Always,1 to 2
1607,Sometimes,yes,up to 5,Sometimes,Female,no,yes,1 to 2,no,Public,Always,1 to 2
1608,Sometimes,yes,up to 5,Sometimes,Female,no,yes,1 to 2,no,Public,Always,1 to 2
1609,Sometimes,yes,up to 5,Sometimes,Female,no,yes,1 to 2,no,Public,Always,more than 2


In [14]:
x=pd.get_dummies(obesity_train[dummy_columns],dummy_na=True, drop_first=True)
print(x.info())
x

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype
---  ------                               --------------  -----
 0   alcohol_freq_Frequently              1611 non-null   bool 
 1   alcohol_freq_Never                   1611 non-null   bool 
 2   alcohol_freq_Sometimes               1611 non-null   bool 
 3   alcohol_freq_nan                     1611 non-null   bool 
 4   caloric_freq_yes                     1611 non-null   bool 
 5   caloric_freq_nan                     1611 non-null   bool 
 6   devices_perday_up to 2               1611 non-null   bool 
 7   devices_perday_up to 5               1611 non-null   bool 
 8   devices_perday_nan                   1611 non-null   bool 
 9   eat_between_meals_Frequently         1611 non-null   bool 
 10  eat_between_meals_Never              1611 non-null   bool 
 11  eat_between_meals_Sometimes          1611 non-null   boo

Unnamed: 0,alcohol_freq_Frequently,alcohol_freq_Never,alcohol_freq_Sometimes,alcohol_freq_nan,caloric_freq_yes,caloric_freq_nan,devices_perday_up to 2,devices_perday_up to 5,devices_perday_nan,eat_between_meals_Frequently,...,transportation_Motorbike,transportation_Public,transportation_Walk,transportation_nan,veggies_freq_Never,veggies_freq_Sometimes,veggies_freq_nan,water_daily_less than 1,water_daily_more than 2,water_daily_nan
0,False,True,False,False,False,False,False,True,False,False,...,False,True,False,False,False,True,False,False,False,False
1,True,False,False,False,False,False,False,True,False,False,...,False,True,False,False,False,True,False,False,False,False
2,True,False,False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,False,False,True,False,False,False,True,False,False,False,...,False,True,False,False,False,True,False,False,False,False
4,False,False,True,False,False,False,True,False,False,False,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606,False,False,True,False,False,True,False,True,False,False,...,False,True,False,False,False,False,False,False,False,False
1607,False,False,True,False,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
1608,False,False,True,False,True,False,False,True,False,False,...,False,True,False,False,False,False,False,False,False,False
1609,False,False,True,False,True,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,False


In [None]:
knn_imputer = KNNImputer(n_neighbors=5)
columns_to_impute = ['caloric_freq_nan']  # Specify the columns you want to impute
x[columns_to_impute] = knn_imputer.fit_transform(x[columns_to_impute])
x