# Obesity Dataset Preprocessing Notebook
This notebook handles:
- Splitting Train, Validation, and Test Data
- Scaling Numerical Features
- Encoding Categorical Features

In [1]:
import pandas as pd

# Load the engineered obesity dataset and preview the first rows.
DATA_PATH = "data/Obesity Dataset - Engineered.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Gender,Age,Height,Weight,Overweight_Family_History,High_Calorie_Consumption,Vegetable_Consumption,Main_Meals,Snack_Consumption,Smoker,Water_Intake,Calorie_Monitoring,Physical_Activity,Tech_Time,Alcohol_Consumption,Transportation_Mean,Obesity_Level,BMI,BMR
0,female,21,1.62,64.0,yes,no,2.0,3.0,sometimes,no,2.0,no,0.0,1.0,no,public_transportation,normal_weight,24.386526,1386.5
1,female,21,1.52,56.0,yes,no,3.0,3.0,sometimes,yes,3.0,yes,3.0,0.0,sometimes,public_transportation,normal_weight,24.238227,1244.0
2,male,23,1.8,77.0,yes,no,2.0,3.0,sometimes,no,2.0,no,2.0,1.0,frequently,public_transportation,normal_weight,23.765432,1785.0
3,male,27,1.8,87.0,no,no,3.0,3.0,sometimes,no,2.0,no,2.0,0.0,frequently,walking,overweight_level_i,26.851852,1865.0
4,male,22,1.78,89.8,no,no,2.0,1.0,sometimes,no,2.0,no,0.0,0.0,sometimes,public_transportation,overweight_level_ii,28.342381,1905.5


## Train / Validation / Test Splitting

In [2]:
from sklearn.model_selection import train_test_split

target = "Obesity_Level"
y = df[target]
X = df.drop(columns=target)

# Step 1: hold out 20% of all rows as the test set (stratified on the label).
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: take 25% of the remaining 80% as validation, giving a 60/20/20 split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42,
)

# Show the shape of every split as a sanity check.
tuple(part.shape for part in (X_train, X_val, X_test, y_train, y_val, y_test))

((1251, 18), (418, 18), (418, 18), (1251,), (418,), (418,))

## X Preprocessor Scaling and Encoding

In [3]:
# Partition the columns by dtype: object columns are treated as categorical,
# every other column as numerical.
categorical_columns = list(df.select_dtypes(include=["object"]).columns)
# The label column is object-typed as well, but it is not an input feature.
categorical_columns.remove(target)
numerical_columns = list(df.select_dtypes(exclude=["object"]).columns)
numerical_columns, categorical_columns

(['Age',
  'Height',
  'Weight',
  'Vegetable_Consumption',
  'Main_Meals',
  'Water_Intake',
  'Physical_Activity',
  'Tech_Time',
  'BMI',
  'BMR'],
 ['Gender',
  'Overweight_Family_History',
  'High_Calorie_Consumption',
  'Snack_Consumption',
  'Smoker',
  'Calorie_Monitoring',
  'Alcohol_Consumption',
  'Transportation_Mean'])

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Standardise the numerical features and one-hot encode the categorical ones.
#
# handle_unknown="ignore": a category that never appears in the training split
#   is encoded as an all-zero row instead of raising at transform time. This
#   matters because the fitted preprocessor is pickled and reused on other data.
# sparse_threshold=0: always return a dense array, independent of how sparse
#   the one-hot block turns out to be; the result is later wrapped in a
#   pandas DataFrame, which expects a dense 2-D array.
preprocessor = ColumnTransformer(
    transformers=[
        ("Scale", StandardScaler(), numerical_columns),
        ("Encode", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
    sparse_threshold=0,
)

# Fit on the training split only; validation and test are transformed with the
# statistics and categories learned from training data.
# NOTE: this rebinds X_train / X_val / X_test from DataFrames to arrays, so
# re-run the split cell before re-running this one.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)
X_train.shape, X_test.shape

((1251, 33), (418, 33))

In [5]:
import os

import joblib

# joblib.dump does not create missing parent folders, so make sure the output
# directory exists; otherwise this cell fails on a fresh checkout.
os.makedirs("preprocessors", exist_ok=True)

# Persist the fitted ColumnTransformer so the same scaling and encoding can be
# re-applied outside this notebook.
joblib.dump(preprocessor, "preprocessors/X_preprocessor.pkl")

['preprocessors/X_preprocessor.pkl']

## y Encoding

In [6]:
# List the distinct class labels (in order of first appearance) before encoding them.
pd.unique(df[target])

array(['normal_weight', 'overweight_level_i', 'overweight_level_ii',
       'obesity_type_i', 'insufficient_weight', 'obesity_type_ii',
       'obesity_type_iii'], dtype=object)

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit class order: the encoder maps each label to its position in this
# list (first entry -> 0, last entry -> 6).
target_order = [
    "insufficient_weight",
    "normal_weight",
    "overweight_level_i",
    "overweight_level_ii",
    "obesity_type_i",
    "obesity_type_ii",
    "obesity_type_iii",
]

y_encoder = OrdinalEncoder(categories=[target_order])

# OrdinalEncoder works on 2-D input, so each label vector is reshaped to a
# single column and flattened back to 1-D afterwards.
# NOTE: this rebinds y_train / y_val / y_test from Series to arrays, so re-run
# the split cell before re-running this one.
y_train = y_encoder.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_val = y_encoder.transform(y_val.to_numpy().reshape(-1, 1)).ravel()
y_test = y_encoder.transform(y_test.to_numpy().reshape(-1, 1)).ravel()
y_train.shape, y_test.shape

((1251,), (418,))

In [8]:
# Persist the fitted label encoder next to the feature preprocessor so encoded
# predictions can be mapped back to class names later.
joblib.dump(value=y_encoder, filename="preprocessors/y_encoder.pkl")

['preprocessors/y_encoder.pkl']

## Train, Validation, and Test Data Saving

In [9]:
# Rebuild the output column names in the order the ColumnTransformer emits
# them: scaled numerical columns first, then the one-hot columns.
one_hot_encoder = preprocessor.named_transformers_["Encode"]
encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_columns)
column_names = [*numerical_columns, *encoded_feature_names]
column_names

['Age',
 'Height',
 'Weight',
 'Vegetable_Consumption',
 'Main_Meals',
 'Water_Intake',
 'Physical_Activity',
 'Tech_Time',
 'BMI',
 'BMR',
 'Gender_female',
 'Gender_male',
 'Overweight_Family_History_no',
 'Overweight_Family_History_yes',
 'High_Calorie_Consumption_no',
 'High_Calorie_Consumption_yes',
 'Snack_Consumption_always',
 'Snack_Consumption_frequently',
 'Snack_Consumption_no',
 'Snack_Consumption_sometimes',
 'Smoker_no',
 'Smoker_yes',
 'Calorie_Monitoring_no',
 'Calorie_Monitoring_yes',
 'Alcohol_Consumption_always',
 'Alcohol_Consumption_frequently',
 'Alcohol_Consumption_no',
 'Alcohol_Consumption_sometimes',
 'Transportation_Mean_automobile',
 'Transportation_Mean_bike',
 'Transportation_Mean_motorbike',
 'Transportation_Mean_public_transportation',
 'Transportation_Mean_walking']

In [10]:
# Wrap the transformed feature arrays in DataFrames labelled with the
# reconstructed column names.
X_train_df, X_val_df, X_test_df = (
    pd.DataFrame(features, columns=column_names)
    for features in (X_train, X_val, X_test)
)

# The encoded label vectors become single-column DataFrames.
y_train_df, y_val_df, y_test_df = (
    pd.DataFrame(labels) for labels in (y_train, y_val, y_test)
)

In [11]:
import os

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists; otherwise this cell fails on a fresh checkout.
output_dir = "data/preprocessed data"
os.makedirs(output_dir, exist_ok=True)

# Feature matrices keep their header row (column names).
X_train_df.to_csv(f"{output_dir}/X_train.csv", index=False)
X_val_df.to_csv(f"{output_dir}/X_val.csv", index=False)
X_test_df.to_csv(f"{output_dir}/X_test.csv", index=False)

# Label files are written without a header: one encoded class value per line.
y_train_df.to_csv(f"{output_dir}/y_train.csv", header=False, index=False)
y_val_df.to_csv(f"{output_dir}/y_val.csv", header=False, index=False)
y_test_df.to_csv(f"{output_dir}/y_test.csv", header=False, index=False)