In [9]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

# Location of the obesity survey CSV, relative to the notebook's working directory
obesity_data_path = "data/ObesityDataSet.csv"

# Load the CSV into a DataFrame and preview its first five rows
obesity_df = pd.read_csv(filepath_or_buffer=obesity_data_path)
obesity_df.head(n=5)

Unnamed: 0,Gender,Age,Height(m),Weight(kg),family_history_with_overweight,FAVC(Frequent Consumption of High Caloric Food),FCVC(Frequency of consumption of Vegetables),NCP( Number of Main Meals),CAEC( Consumption of Food Between Meals),SMOKE,CH2O(Consumption of Water),SCC(Caloric Consumption Monitoring),FAF( Physical Activity Frequency),TUE(Time Using Technology Devices),CALC( Consumption of Alcohol),MTRANS(Transportation Use),Nobeyesdad(Classification of Weight)
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [10]:
# Display basic information about the obesity dataset: column names, dtypes and
# non-null counts. The visible output reports 2111 non-null entries out of 2111
# rows for each listed column, i.e. no missing values in those columns.
# For an explicit per-column missing-value count, use: obesity_df.isnull().sum()
obesity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Gender                                            2111 non-null   object 
 1   Age                                               2111 non-null   float64
 2   Height(m)                                         2111 non-null   float64
 3   Weight(kg)                                        2111 non-null   float64
 4   family_history_with_overweight                    2111 non-null   object 
 5   FAVC(Frequent Consumption of  High Caloric Food)  2111 non-null   object 
 6   FCVC(Frequency of consumption of Vegetables)      2111 non-null   float64
 7   NCP( Number of Main Meals)                        2111 non-null   float64
 8   CAEC( Consumption of Food Between Meals)          2111 non-null   object 
 9   SMOKE              

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, rounded to two decimal places for readability
(
    obesity_df
    .describe()
    .round(decimals=2)
)

Unnamed: 0,Age,Height(m),Weight(kg),FCVC(Frequency of consumption of Vegetables),NCP( Number of Main Meals),CH2O(Consumption of Water),FAF( Physical Activity Frequency),TUE(Time Using Technology Devices)
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.31,1.7,86.59,2.42,2.69,2.01,1.01,0.66
std,6.35,0.09,26.19,0.53,0.78,0.61,0.85,0.61
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.95,1.63,65.47,2.0,2.66,1.58,0.12,0.0
50%,22.78,1.7,83.0,2.39,3.0,2.0,1.0,0.63
75%,26.0,1.77,107.43,3.0,3.0,2.48,1.67,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [25]:
# Categorical variable summary

# Pick out the labels of every column stored with the "object" dtype
column_dtypes = obesity_df.dtypes
is_object_column = column_dtypes == "object"
categorical = column_dtypes[is_object_column].index
print(categorical)

# count / unique / top / freq for each of those columns
obesity_df[categorical].describe()

Index(['Gender', 'family_history_with_overweight',
       'FAVC(Frequent Consumption of  High Caloric Food)',
       'CAEC( Consumption of Food Between Meals)', 'SMOKE',
       'SCC(Caloric  Consumption Monitoring)', 'CALC( Consumption of Alcohol)',
       'MTRANS(Transportation Use)', 'Nobeyesdad(Classification of Weight)'],
      dtype='object')


Unnamed: 0,Gender,family_history_with_overweight,FAVC(Frequent Consumption of High Caloric Food),CAEC( Consumption of Food Between Meals),SMOKE,SCC(Caloric Consumption Monitoring),CALC( Consumption of Alcohol),MTRANS(Transportation Use),Nobeyesdad(Classification of Weight)
count,2111,2111,2111,2111,2111,2111,2111,2111,2111
unique,2,2,2,4,2,2,4,5,7
top,Male,yes,yes,Sometimes,no,no,Sometimes,Public_Transportation,Obesity_Type_I
freq,1068,1726,1866,1765,2067,2015,1401,1580,351


In [144]:
#1. Relationship between dietary habits and obesity rates
# For each (FAVC, FCVC) combination, compute the share of rows falling in each
# weight classification, then pivot the classifications into columns. A cell is
# NaN when a classification never occurs for that combination.
dietary_habits_obesity = (
    obesity_df
    .groupby([
        'FAVC(Frequent Consumption of  High Caloric Food)',
        'FCVC(Frequency of consumption of Vegetables)',
    ])['Nobeyesdad(Classification of Weight)']
    .value_counts(normalize=True)
    .unstack()
)

# Count:      number of habit combinations in which the classification appears
#             (non-NaN cells), not the number of respondents.
# Percentage: mean within-combination share of the classification (NaN skipped).
# Sort on the numeric percentage and only then format it as text: sorting the
# formatted strings would compare them character by character, so e.g. '9.5%'
# would rank above '47.5%'.
summary_stats = (
    pd.DataFrame({
        'Count': dietary_habits_obesity.count(),
        'Percentage': dietary_habits_obesity.mean() * 100,
    })
    .sort_values(by='Percentage', ascending=False)
    .assign(Percentage=lambda stats: stats['Percentage'].map('{:.1f}%'.format))
)

# Apply table formatting and display the styled summary statistics.
# The border needs an explicit style ('solid'): CSS border-style defaults to
# 'none', so '1px black' on its own draws nothing.
summary_stats.style.set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'center'), ('border', '1px solid black')]}
])


Unnamed: 0_level_0,Count,Percentage
Nobeyesdad(Classification of Weight),Unnamed: 1_level_1,Unnamed: 2_level_1
Overweight_Level_II,97,47.5%
Obesity_Type_II,130,46.6%
Insufficient_Weight,111,43.1%
Normal_Weight,6,40.5%
Obesity_Type_I,97,35.7%
Overweight_Level_I,95,33.4%
Obesity_Type_III,2,29.2%


In [125]:
#2. Lifestyle, particularly leisure time physical activity, and the risk of obesity
# For each distinct physical-activity frequency (FAF) value, compute the share of
# rows falling in each weight classification, then pivot the classifications into
# columns. A cell is NaN when a classification never occurs for that FAF value.
physical_activity_obesity = (
    obesity_df
    .groupby(['FAF( Physical Activity Frequency)'])['Nobeyesdad(Classification of Weight)']
    .value_counts(normalize=True)
    .unstack()
)

# Count:      number of FAF values for which the classification appears
#             (non-NaN cells), not the number of respondents.
# Percentage: mean within-group share of the classification (NaN skipped).
# Sort on the numeric percentage and only then format it as text: sorting the
# formatted strings would compare them character by character, so e.g. '9.5%'
# would rank above '36.7%'.
# The border needs an explicit style ('solid'): CSS border-style defaults to
# 'none', so '1px black' on its own draws nothing.
(
    pd.DataFrame({
        'Count': physical_activity_obesity.count(),
        'Percentage': physical_activity_obesity.mean() * 100,
    })
    .sort_values(by='Percentage', ascending=False)
    .assign(Percentage=lambda stats: stats['Percentage'].map('{:.1f}%'.format))
    .style
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center'), ('border', '1px solid black')]}
    ])
)


Unnamed: 0_level_0,Count,Percentage
Nobeyesdad(Classification of Weight),Unnamed: 1_level_1,Unnamed: 2_level_1
Normal_Weight,4,36.7%
Obesity_Type_II,136,36.2%
Overweight_Level_I,123,35.3%
Obesity_Type_I,140,33.7%
Obesity_Type_III,114,33.5%
Insufficient_Weight,117,33.0%
Overweight_Level_II,124,31.4%


In [146]:
#3. Association between various dietary patterns and levels of obesity
# For each (FAVC, FCVC) combination, compute the share of rows falling in each
# weight classification, then pivot the classifications into columns. A cell is
# NaN when a classification never occurs for that combination.
# NOTE(review): this is the same grouping as section 1 (dietary_habits_obesity),
# so the resulting table is identical -- confirm whether a different set of
# dietary columns was intended here.
diet_patterns_obesity = (
    obesity_df
    .groupby([
        'FAVC(Frequent Consumption of  High Caloric Food)',
        'FCVC(Frequency of consumption of Vegetables)',
    ])['Nobeyesdad(Classification of Weight)']
    .value_counts(normalize=True)
    .unstack()
)

# Count:      number of habit combinations in which the classification appears
#             (non-NaN cells), not the number of respondents.
# Percentage: mean within-combination share of the classification (NaN skipped).
# Sort on the numeric percentage and only then format it as text: sorting the
# formatted strings would compare them character by character, so e.g. '9.5%'
# would rank above '47.5%'.
# The border needs an explicit style ('solid'): CSS border-style defaults to
# 'none', so '1px black' on its own draws nothing.
(
    pd.DataFrame({
        'Count': diet_patterns_obesity.count(),
        'Percentage': diet_patterns_obesity.mean() * 100,
    })
    .sort_values(by='Percentage', ascending=False)
    .assign(Percentage=lambda stats: stats['Percentage'].map('{:.1f}%'.format))
    .style
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center'), ('border', '1px solid black')]}
    ])
)

Unnamed: 0_level_0,Count,Percentage
Nobeyesdad(Classification of Weight),Unnamed: 1_level_1,Unnamed: 2_level_1
Overweight_Level_II,97,47.5%
Obesity_Type_II,130,46.6%
Insufficient_Weight,111,43.1%
Normal_Weight,6,40.5%
Obesity_Type_I,97,35.7%
Overweight_Level_I,95,33.4%
Obesity_Type_III,2,29.2%


In [83]:
# List the exact column labels. Several contain irregular spacing (e.g. the
# double space in 'FAVC(Frequent Consumption of  High Caloric Food)'), so copy
# them verbatim when selecting columns.
print(obesity_df.columns)

Index(['Gender', 'Age', 'Height(m)', 'Weight(kg)',
       'family_history_with_overweight',
       'FAVC(Frequent Consumption of  High Caloric Food)',
       'FCVC(Frequency of consumption of Vegetables)',
       'NCP( Number of Main Meals)',
       'CAEC( Consumption of Food Between Meals)', 'SMOKE',
       'CH2O(Consumption of Water)', 'SCC(Caloric  Consumption Monitoring)',
       'FAF( Physical Activity Frequency)',
       'TUE(Time Using Technology Devices)', 'CALC( Consumption of Alcohol)',
       'MTRANS(Transportation Use)', 'Nobeyesdad(Classification of Weight)'],
      dtype='object')


In [108]:
# Set style options.
# Style only a 10-row preview: a Styler renders every row it is given, so styling
# the full 2111-row frame would embed the whole table in the notebook output.
# The border needs an explicit style ('solid'): CSS border-style defaults to
# 'none', so '1px black' on its own draws nothing.
styler = (
    obesity_df.head(10).style
    .set_table_styles([{
        'selector': 'th,td',
        'props': [('text-align', 'center'), ('border', '1px solid black')],
    }])
    .set_properties(**{'white-space': 'nowrap', 'text-align': 'center'})
)

# Display styled DataFrame
styler

Unnamed: 0,Gender,Age,Height(m),Weight(kg),family_history_with_overweight,FAVC(Frequent Consumption of High Caloric Food),FCVC(Frequency of consumption of Vegetables),NCP( Number of Main Meals),CAEC( Consumption of Food Between Meals),SMOKE,CH2O(Consumption of Water),SCC(Caloric Consumption Monitoring),FAF( Physical Activity Frequency),TUE(Time Using Technology Devices),CALC( Consumption of Alcohol),MTRANS(Transportation Use),Nobeyesdad(Classification of Weight)
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
6,Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
7,Male,22.0,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24.0,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
9,Male,22.0,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight
