# Multi-Class Prediction of Obesity Risk

# `01` Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# `02` Load Data

Note: Make sure `train.csv` and `test.csv` from the dataset linked below are in the same directory as this notebook for this cell to work properly.

[Dataset Link](https://www.kaggle.com/competitions/playground-series-s4e2/data)

In [4]:
# Load the training and test splits from the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

----------------

# `03` Exploratory Data Analysis (EDA)

In [5]:
# Explore a deep copy so that changes made during EDA do not touch the raw `train` frame.
df_train = train.copy(deep=True)

In [6]:
# Preview the first five rows.
df_train.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [11]:
# Dimensions of the training frame as (rows, columns).
df_train.shape

(20758, 18)

In [12]:
# Column labels of the training frame.
df_train.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [7]:
# Per-column overview: verbose=True lists every column, show_counts=True adds the non-null counts.
df_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [13]:
# Dtype and non-null overview restricted to the numeric columns.
df_train.select_dtypes(include='number').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      20758 non-null  int64  
 1   Age     20758 non-null  float64
 2   Height  20758 non-null  float64
 3   Weight  20758 non-null  float64
 4   FCVC    20758 non-null  float64
 5   NCP     20758 non-null  float64
 6   CH2O    20758 non-null  float64
 7   FAF     20758 non-null  float64
 8   TUE     20758 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 1.4 MB


In [8]:
# Dtype and non-null overview restricted to the object-dtype columns.
df_train.select_dtypes(include='O').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Gender                          20758 non-null  object
 1   family_history_with_overweight  20758 non-null  object
 2   FAVC                            20758 non-null  object
 3   CAEC                            20758 non-null  object
 4   SMOKE                           20758 non-null  object
 5   SCC                             20758 non-null  object
 6   CALC                            20758 non-null  object
 7   MTRANS                          20758 non-null  object
 8   NObeyesdad                      20758 non-null  object
dtypes: object(9)
memory usage: 1.4+ MB


In [9]:
# Descriptive statistics of the numeric columns, transposed to one row per column.
df_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20758.0,10378.5,5992.46278,0.0,5189.25,10378.5,15567.75,20757.0
Age,20758.0,23.841804,5.688072,14.0,20.0,22.815416,26.0,61.0
Height,20758.0,1.700245,0.087312,1.45,1.631856,1.7,1.762887,1.975663
Weight,20758.0,87.887768,26.379443,39.0,66.0,84.064875,111.600553,165.057269
FCVC,20758.0,2.445908,0.533218,1.0,2.0,2.393837,3.0,3.0
NCP,20758.0,2.761332,0.705375,1.0,3.0,3.0,3.0,4.0
CH2O,20758.0,2.029418,0.608467,1.0,1.792022,2.0,2.549617,3.0
FAF,20758.0,0.981747,0.838302,0.0,0.008013,1.0,1.587406,3.0
TUE,20758.0,0.616756,0.602113,0.0,0.0,0.573887,1.0,2.0


In [10]:
# Missing values per column, listed in ascending order of the count.
df_train.isna().sum().sort_values(ascending=True)

id                                0
CALC                              0
TUE                               0
FAF                               0
SCC                               0
CH2O                              0
SMOKE                             0
CAEC                              0
NCP                               0
FCVC                              0
FAVC                              0
family_history_with_overweight    0
Weight                            0
Height                            0
Age                               0
Gender                            0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [22]:
# Count fully duplicated records. The `id` column is left out of the comparison:
# it is a unique row identifier (as many distinct ids as rows), so including it
# makes every row look distinct and the check would always return 0.
df_train.drop(columns='id').duplicated().sum()

0

In [23]:
# Number of distinct values in each column.
df_train.nunique()

id                                20758
Gender                                2
Age                                1703
Height                             1833
Weight                             1979
family_history_with_overweight        2
FAVC                                  2
FCVC                                934
NCP                                 689
CAEC                                  4
SMOKE                                 2
CH2O                               1506
SCC                                   2
FAF                                1360
TUE                                1297
CALC                                  3
MTRANS                                5
NObeyesdad                            7
dtype: int64

In [14]:
# Labels of the numeric columns. `select_dtypes` picks them directly by dtype, so
# no summary statistics have to be computed just to read the column labels.
num_col = df_train.select_dtypes(include='number').columns
num_col

Index(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object')

In [None]:
# Raw value counts (NaN included) and the value range of each numeric column.
for col in num_col:
    col_values = df_train[col]
    print(f'Column Name: {col}', '**'*18, sep='\n')
    print("Max: ", col_values.max())
    print("Min: ", col_values.min())
    # dropna=False keeps missing values as their own entry in the counts.
    print(col_values.value_counts(dropna=False))
    print('END','--'*16, '\n')

In [17]:
# Value range of each numeric column, followed by the relative frequency of
# every distinct value, sorted from rarest to most common.
for col in num_col:
    numeric_series = df_train[col]
    shares = numeric_series.value_counts(normalize=True, ascending=True)
    print(f'Column Name: {col}')
    print('**'*18)
    print("Max: ", numeric_series.max())
    print("Min: ", numeric_series.min())
    print(shares)
    print('END', '--'*18, '\n')

Column Name: id
************************************
Max:  20757
Min:  0
0        0.000048
13843    0.000048
13842    0.000048
13841    0.000048
13840    0.000048
           ...   
6915     0.000048
6914     0.000048
6913     0.000048
6931     0.000048
20757    0.000048
Name: id, Length: 20758, dtype: float64
END ------------------------------------ 

Column Name: Age
************************************
Max:  61.0
Min:  14.0
24.754302    0.000048
18.137495    0.000048
29.725222    0.000048
23.165574    0.000048
18.137820    0.000048
               ...   
19.000000    0.042682
23.000000    0.057905
21.000000    0.079150
26.000000    0.087003
18.000000    0.092302
Name: Age, Length: 1703, dtype: float64
END ------------------------------------ 

Column Name: Height
************************************
Max:  1.975663
Min:  1.45
1.536819    0.000048
1.725625    0.000048
1.817271    0.000048
1.695469    0.000048
1.675733    0.000048
              ...   
1.800000    0.024906
1.750000    0.0

In [18]:
# Labels of the object-dtype (categorical) columns. `select_dtypes` picks them
# directly by dtype, so no summary statistics have to be computed just to read
# the column labels.
object_col = df_train.select_dtypes(include='O').columns
object_col

Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [19]:
# Summary of the object-dtype columns (count, distinct values, most frequent
# value and its frequency), transposed to one row per column.
df_train.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
Gender,20758,2,Female,10422
family_history_with_overweight,20758,2,yes,17014
FAVC,20758,2,yes,18982
CAEC,20758,4,Sometimes,17529
SMOKE,20758,2,no,20513
SCC,20758,2,no,20071
CALC,20758,3,Sometimes,15066
MTRANS,20758,5,Public_Transportation,16687
NObeyesdad,20758,7,Obesity_Type_III,4046


In [21]:
# Share of rows per category for every object-dtype column, rarest category first.
for col in object_col:
    category_shares = df_train[col].value_counts(normalize=True, ascending=True)
    print(f'Column Name: {col}', '**'*18, sep='\n')
    print(category_shares)
    print('END','--'*18, '\n')

Column Name: Gender
************************************
Male      0.497929
Female    0.502071
Name: Gender, dtype: float64
END ------------------------------------ 

Column Name: family_history_with_overweight
************************************
no     0.180364
yes    0.819636
Name: family_history_with_overweight, dtype: float64
END ------------------------------------ 

Column Name: FAVC
************************************
no     0.085557
yes    0.914443
Name: FAVC, dtype: float64
END ------------------------------------ 

Column Name: CAEC
************************************
no            0.013441
Always        0.023027
Frequently    0.119087
Sometimes     0.844446
Name: CAEC, dtype: float64
END ------------------------------------ 

Column Name: SMOKE
************************************
yes    0.011803
no     0.988197
Name: SMOKE, dtype: float64
END ------------------------------------ 

Column Name: SCC
************************************
yes    0.033096
no     0.966904
Name: