In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder

In [2]:
train_data = pd.read_csv("./train.csv")
train_data.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [76]:
X_test = pd.read_csv('./test.csv')
X_test.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS'],
      dtype='object')

Frequent consumption of high caloric food (FAVC)<br>
Frequency of consumption of vegetables(FCVC)<br>
Number of main meals (NCP)<br>
Consumption of food between meals (CAEC)<br>
Consumption of water daily (CH2O)<br>
Calories consumption monitoring (SCC)<br>
Physical activity frequency (FAF)<br>
Time using technology devices (TUE)<br>
Consumption of alcohol (CALC)<br>
Transportation used (MTRANS)<br>

In [7]:
X = train_data.iloc[:,0:-1]
X.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation


In [12]:
y = train_data.iloc[:,-1]
y.unique()


array(['Overweight_Level_II', 'Normal_Weight', 'Insufficient_Weight',
       'Obesity_Type_III', 'Obesity_Type_II', 'Overweight_Level_I',
       'Obesity_Type_I'], dtype=object)

In [14]:
#Adding a BMI feature 
X['BMI'] = X.Weight/(X.Height*X.Height)
X.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,28.259565
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,23.422091
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,17.126706
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,44.855798
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,25.599151


In [78]:
X_test['BMI'] = X_test.Weight/(X_test.Height*X_test.Height)
X_test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI
0,20758,Male,26.899886,1.848294,120.644178,yes,yes,2.938616,3.0,Sometimes,no,2.825629,no,0.8554,0.0,Sometimes,Public_Transportation,35.315411
1,20759,Female,21.0,1.6,66.0,yes,yes,2.0,1.0,Sometimes,no,3.0,no,1.0,0.0,Sometimes,Public_Transportation,25.78125
2,20760,Female,26.0,1.643355,111.600553,yes,yes,3.0,3.0,Sometimes,no,2.621877,no,0.0,0.250502,Sometimes,Public_Transportation,41.324115
3,20761,Male,20.979254,1.553127,103.669116,yes,yes,2.0,2.977909,Sometimes,no,2.786417,no,0.094851,0.0,Sometimes,Public_Transportation,42.976937
4,20762,Female,26.0,1.627396,104.835346,yes,yes,3.0,3.0,Sometimes,no,2.653531,no,0.0,0.741069,Sometimes,Public_Transportation,39.584143


In [61]:
order = ['Insufficient_Weight', 'Normal_Weight',  'Overweight_Level_I',
        'Overweight_Level_II','Obesity_Type_I',
        'Obesity_Type_II', 'Obesity_Type_III']
y_ord = OrdinalEncoder(categories=[order])
y_encoded = y_ord.fit_transform(train_data[['NObeyesdad']])
y_encoded = pd.Series(y_encoded.reshape(-1),name = 'NObeyesdad')
y_encoded

0        3.0
1        1.0
2        0.0
3        6.0
4        3.0
        ... 
20753    5.0
20754    0.0
20755    5.0
20756    3.0
20757    5.0
Name: NObeyesdad, Length: 20758, dtype: float64

In [59]:
y_ord.categories_

[array(['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
        'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
        'Overweight_Level_II'], dtype=object)]

In [82]:
#Ordinal Encoding 
cols = ['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS']
ord = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value= -1)
X_encoded = X.copy()
X_encoded[cols] = ord.fit_transform(X_encoded[cols])
X_encoded.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI
0,0,1.0,24.443011,1.699998,81.66995,1.0,1.0,2.0,2.983297,2.0,0.0,2.763573,0.0,0.0,0.976473,1.0,3.0,28.259565
1,1,0.0,18.0,1.56,57.0,1.0,1.0,2.0,3.0,1.0,0.0,2.0,0.0,1.0,1.0,2.0,0.0,23.422091
2,2,0.0,18.0,1.71146,50.165754,1.0,1.0,1.880534,1.411685,2.0,0.0,1.910378,0.0,0.866045,1.673584,2.0,3.0,17.126706
3,3,0.0,20.952737,1.71073,131.274851,1.0,1.0,3.0,3.0,2.0,0.0,1.674061,0.0,1.467863,0.780199,1.0,3.0,44.855798
4,4,1.0,31.641081,1.914186,93.798055,1.0,1.0,2.679664,1.971472,2.0,0.0,1.979848,0.0,1.967973,0.931721,1.0,3.0,25.599151


In [83]:
X_test[cols] = ord.transform(X_test[cols])
X_test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI
0,20758,1.0,26.899886,1.848294,120.644178,1.0,1.0,2.938616,3.0,2.0,0.0,2.825629,0.0,0.8554,0.0,1.0,3.0,35.315411
1,20759,0.0,21.0,1.6,66.0,1.0,1.0,2.0,1.0,2.0,0.0,3.0,0.0,1.0,0.0,1.0,3.0,25.78125
2,20760,0.0,26.0,1.643355,111.600553,1.0,1.0,3.0,3.0,2.0,0.0,2.621877,0.0,0.0,0.250502,1.0,3.0,41.324115
3,20761,1.0,20.979254,1.553127,103.669116,1.0,1.0,2.0,2.977909,2.0,0.0,2.786417,0.0,0.094851,0.0,1.0,3.0,42.976937
4,20762,0.0,26.0,1.627396,104.835346,1.0,1.0,3.0,3.0,2.0,0.0,2.653531,0.0,0.0,0.741069,1.0,3.0,39.584143


In [41]:
#One Hot Encoding 
X.CAEC.unique()

array(['Sometimes', 'Frequently', 'no', 'Always'], dtype=object)

In [84]:
X_encoded['Negatives'] = X_encoded.TUE+X_encoded.CALC+X_encoded.SCC
X_encoded.Negatives.corr(y_encoded)

-0.24033386400473164

In [85]:
X_test['Negatives'] = X_test.TUE+X_test.CALC+X_test.SCC

In [69]:
X_encoded.SCC.corr(y_encoded)

-0.18495933690707334

In [72]:
X_encoded.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI,Negatives
0,0,1.0,24.443011,1.699998,81.66995,1.0,1.0,2.0,2.983297,2.0,0.0,2.763573,0.0,0.0,0.976473,1.0,3.0,28.259565,1.976473
1,1,0.0,18.0,1.56,57.0,1.0,1.0,2.0,3.0,1.0,0.0,2.0,0.0,1.0,1.0,2.0,0.0,23.422091,3.0
2,2,0.0,18.0,1.71146,50.165754,1.0,1.0,1.880534,1.411685,2.0,0.0,1.910378,0.0,0.866045,1.673584,2.0,3.0,17.126706,3.673584
3,3,0.0,20.952737,1.71073,131.274851,1.0,1.0,3.0,3.0,2.0,0.0,1.674061,0.0,1.467863,0.780199,1.0,3.0,44.855798,1.780199
4,4,1.0,31.641081,1.914186,93.798055,1.0,1.0,2.679664,1.971472,2.0,0.0,1.979848,0.0,1.967973,0.931721,1.0,3.0,25.599151,1.931721


In [74]:
X_encoded.FAF.max(),X_encoded.FAF.min()

(3.0, 0.0)

In [86]:
X_test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI,Negatives
0,20758,1.0,26.899886,1.848294,120.644178,1.0,1.0,2.938616,3.0,2.0,0.0,2.825629,0.0,0.8554,0.0,1.0,3.0,35.315411,1.0
1,20759,0.0,21.0,1.6,66.0,1.0,1.0,2.0,1.0,2.0,0.0,3.0,0.0,1.0,0.0,1.0,3.0,25.78125,1.0
2,20760,0.0,26.0,1.643355,111.600553,1.0,1.0,3.0,3.0,2.0,0.0,2.621877,0.0,0.0,0.250502,1.0,3.0,41.324115,1.250502
3,20761,1.0,20.979254,1.553127,103.669116,1.0,1.0,2.0,2.977909,2.0,0.0,2.786417,0.0,0.094851,0.0,1.0,3.0,42.976937,1.0
4,20762,0.0,26.0,1.627396,104.835346,1.0,1.0,3.0,3.0,2.0,0.0,2.653531,0.0,0.0,0.741069,1.0,3.0,39.584143,1.741069


In [87]:
X_encoded.to_csv('X_train.csv',index= False)
X_test.to_csv('X_test.csv',index = False)