In [1]:
# Step 1: Import necessary libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [2]:
# Load the dataset
# The data directory is read from the OBESITY_DATA_DIR environment variable
# when it is set, so the notebook can run on other machines. The original
# location is the fallback, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get("OBESITY_DATA_DIR", r'/Users/hibaali/Desktop/Coding.html/python'))
df = pd.read_csv(DATA_DIR / 'ObesityDataSet_raw_and_data_sinthetic.csv')

In [3]:
# Step 2: Inspect the dataset
# Show a heading followed by the first rows of the frame.
print("First few rows of the dataset:", df.head(), sep="\n")

# Report the (rows, columns) shape before any outlier handling
print(f"\nNumber of rows and columns before outlier detection: {df.shape}")

First few rows of the dataset:
   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2.0  0.0  Frequently   
4  1.0  Sometimes    no   2.0   no  0.0  0.0   Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportation        Normal_Weight  
2

In [4]:
# Step 3: Check for missing values
# Tally the null entries of every column, then print the tally under a heading.
missing_per_column = df.isna().sum()
print("\nMissing values in each column:")
print(missing_per_column)


Missing values in each column:
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64


In [5]:
# Step 4: Identify and print Outliers using IQR method
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Per-column quartiles and interquartile range
Q1 = df[numeric_columns].quantile(0.25)
Q3 = df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

# Fences: values more than 1.5 * IQR below Q1 or above Q3 count as outliers
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Maps each numeric column name to the rows flagged as outliers for that column
outliers_dict = {}

for col in numeric_columns:
    column_values = df[col]
    is_outlier = column_values.lt(lower_fence[col]) | column_values.gt(upper_fence[col])
    outliers_col = df[is_outlier]
    outliers_dict[col] = outliers_col
    print(f"\nOutliers in column '{col}':")
    print(outliers_col)


Outliers in column 'Age':
      Gender        Age    Height      Weight family_history_with_overweight  \
13      Male  41.000000  1.800000   99.000000                             no   
21    Female  52.000000  1.690000   87.000000                            yes   
33      Male  39.000000  1.790000   90.000000                             no   
92      Male  55.000000  1.780000   84.000000                            yes   
104   Female  38.000000  1.560000   80.000000                            yes   
...      ...        ...       ...         ...                            ...   
1740    Male  37.765356  1.763582  117.861590                            yes   
1775    Male  37.207082  1.762921  118.401740                            yes   
1776    Male  38.108940  1.752863  119.201465                            yes   
1777    Male  38.644441  1.768235  117.792268                            yes   
1778    Male  38.112989  1.766888  118.134898                            yes   

     FAVC   

In [6]:
# Step 5: Handle Outliers by removing them
# A row is dropped when any numeric column falls outside that column's
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] range. The bounds are computed from the full
# dataset, so one combined mask keeps exactly the rows that filtering column by
# column would keep.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
numeric_values = df[numeric_columns]
outlier_rows = (
    numeric_values.lt(lower_bound, axis='columns')
    | numeric_values.gt(upper_bound, axis='columns')
).any(axis=1)

# .copy() makes the result an independent frame, so assigning columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
df_no_outliers = df.loc[~outlier_rows].copy()

# Print the number of rows and columns after outlier detection
print(f"\nNumber of rows and columns after outlier detection: {df_no_outliers.shape}")


Number of rows and columns after outlier detection: (1409, 17)


In [7]:
# Step 6: Data Normalization/Standardization
# Fit the scaler on the numeric columns, then overwrite those columns with
# their standardized values.
scaler = StandardScaler()
numeric_features = df_no_outliers[numeric_columns]
scaler.fit(numeric_features)
df_no_outliers[numeric_columns] = scaler.transform(numeric_features)

In [8]:
# Step 7: Handle Categorical Features
categorical_columns = df_no_outliers.select_dtypes(include=['object']).columns

# Keep every fitted encoder, keyed by column name, so the integer codes can be
# mapped back to the original labels later (via label_encoders[col].classes_
# or label_encoders[col].inverse_transform). Without this the mapping is lost
# as soon as the loop moves on to the next column.
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df_no_outliers[col] = le.fit_transform(df_no_outliers[col])
    label_encoders[col] = le

# NOTE(review): the integer codes are arbitrary identifiers, not a meaningful
# ordering of the categories. scikit-learn documents LabelEncoder as a
# target encoder; confirm this is acceptable for the feature columns too.

# Verify column names before splitting
print("\nColumn names after preprocessing:")
print(df_no_outliers.columns)



Column names after preprocessing:
Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')


In [9]:
# Save the preprocessed data without the index column.
# The output directory is read from the OBESITY_DATA_DIR environment variable
# when it is set; the original location is the fallback.
processed_dir = Path(os.environ.get("OBESITY_DATA_DIR", r'/Users/hibaali/Desktop/Coding.html/python'))
df_no_outliers.to_csv(processed_dir / 'obesity_pre_processed_data.csv', index=False)

In [10]:
# Reload the preprocessed CSV.
# The directory is read from the OBESITY_DATA_DIR environment variable when it
# is set; the original location is the fallback.
processed_dir = Path(os.environ.get("OBESITY_DATA_DIR", r'/Users/hibaali/Desktop/Coding.html/python'))
read_preprocssed_data = pd.read_csv(processed_dir / 'obesity_pre_processed_data.csv')

In [11]:
# Display the first five rows of the reloaded data
read_preprocssed_data.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,-0.517665,-1.088317,-1.016339,1,0,-0.823906,0.211595,2,0,-0.074968,0,-1.217677,0.508987,2,3,1
1,0,-0.517665,-2.235215,-1.308625,1,0,0.997767,0.211595,2,1,1.574274,1,2.331641,-1.199517,1,3,1
2,1,-0.03857,0.9761,-0.541374,1,0,-0.823906,0.211595,2,0,-0.074968,0,1.148535,0.508987,0,3,1
3,1,0.91962,0.9761,-0.176016,0,0,0.997767,0.211595,2,0,-0.074968,0,1.148535,-1.199517,0,4,5
4,1,1.398715,-1.088317,-1.418232,0,1,-0.823906,0.211595,2,0,-0.074968,0,-1.217677,-1.199517,1,0,1
