In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import accuracy_score # for classification of correctly labelled classes over all classes

In [3]:
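# continuous variables (float64, standardized by the pipeline): Age, Height, Weight, FCVC, NCP, CH2O, FAF, TUE
# categorical variables (object, one-hot encoded by the pipeline): Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CALC, MTRANS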

In [4]:
# Load the raw obesity-level dataset; the path is relative, so it resolves against the current working directory.
data = pd.read_csv("data/Obesity_level_prediction_dataset.csv")

In [5]:
# Preview five randomly drawn rows; no random_state is passed, so the rows shown differ between runs.
data.sample(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
985,Male,22.591439,1.65,80.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,0.451078,2.0,no,Public_Transportation,Overweight_Level_II
987,Male,38.825189,1.780846,85.687751,yes,yes,2.901924,1.124977,Sometimes,no,2.76382,no,0.8554,0.999183,Sometimes,Automobile,Overweight_Level_II
675,Female,18.988581,1.544263,41.535047,no,yes,2.68601,1.0,Sometimes,no,1.310074,no,0.0,1.0647,Sometimes,Public_Transportation,Insufficient_Weight
1396,Female,21.001458,1.65,88.026943,yes,yes,2.482575,1.001542,Sometimes,no,3.0,no,1.751656,1.0,no,Public_Transportation,Obesity_Type_I
1650,Male,31.36347,1.869323,127.507411,yes,yes,2.939727,3.0,Sometimes,yes,1.344122,no,0.923428,1.432336,Sometimes,Public_Transportation,Obesity_Type_II


In [None]:
# Data preprocessing pipeline

In [7]:
def obesity_risk_pipeline(data):
    """Standardize numeric features, one-hot encode categoricals, and label-encode the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table that includes the target column ``'NObeyesdad'`` with
        ``object`` dtype. Columns of dtype ``float64`` are treated as
        continuous; every other ``object`` column is treated as categorical.
        Columns of any other dtype are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as ``data``. Column order is:
        pass-through columns (including the target), the standardized
        continuous columns, then the one-hot indicator columns (the first
        level of each categorical is dropped). The target is replaced by its
        integer category codes. The input frame is not modified.

    Raises
    ------
    ValueError
        If ``'NObeyesdad'`` is not among the ``object``-dtype columns
        (raised by ``list.remove``).

    Notes
    -----
    The scaler and encoder are fitted on every row passed in.
    NOTE(review): when this is called on the full dataset before
    ``train_test_split``, test rows contribute to the scaling statistics;
    confirm that this is acceptable for the experiment.
    """
    target_column = 'NObeyesdad'

    # Continuous features: every float64 column is standardized.
    continuous_columns = data.select_dtypes(include=['float64']).columns.tolist()

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(data[continuous_columns])

    # Reuse the caller's index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex here would misalign rows (and introduce NaNs) whenever
    # `data` does not carry the default 0..n-1 index.
    scaled_df = pd.DataFrame(
        scaled_features,
        columns=scaler.get_feature_names_out(continuous_columns),
        index=data.index,
    )

    # Swap the raw continuous columns for their standardized versions.
    scaled_data = pd.concat([data.drop(columns=continuous_columns), scaled_df], axis=1)

    # Categorical features: every object column except the target.
    categorical_columns = scaled_data.select_dtypes(include=['object']).columns.tolist()
    categorical_columns.remove(target_column)  # Exclude target column

    # One-hot encode, dropping the first level of each feature.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_features = encoder.fit_transform(scaled_data[categorical_columns])

    # Same index-preservation reasoning as for the scaled frame above.
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=scaled_data.index,
    )

    # Swap the raw categorical columns for their indicator columns.
    prepped_data = pd.concat([scaled_data.drop(columns=categorical_columns), encoded_df], axis=1)

    # Encode the target labels as integer category codes.
    prepped_data[target_column] = prepped_data[target_column].astype('category').cat.codes

    return prepped_data


In [8]:
# Run the preprocessing pipeline on the full loaded dataset (no train/test split has happened yet at this point).
df = obesity_risk_pipeline(data)

In [9]:
# Separate the label-encoded target from the feature matrix
y = df['NObeyesdad']
X = df.drop(columns=['NObeyesdad'])

In [10]:
# Spot-check five random feature rows (unseeded, so the rows vary between runs).
X.sample(5)

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,family_history_with_overweight_yes,...,CAEC_no,SMOKE_yes,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
1793,1.009157,-0.55196,0.555862,0.926206,-0.534585,-1.644905,1.034394,-0.45837,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
107,-0.364507,1.911636,0.130378,-0.785019,0.404153,-1.644905,-1.188039,0.561997,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1890,0.205632,-0.789625,0.827951,1.088342,0.404153,-0.631048,-1.081127,-0.360802,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1823,0.264685,-1.294016,0.588785,1.088342,0.404153,-1.639091,-1.178482,0.561997,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1669,0.129801,0.669606,1.012171,-2.342051,0.404153,0.008475,0.277432,-1.078013,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Spot-check five random encoded target values (unseeded, so the rows vary between runs).
y.sample(5)

826     5
490     6
1699    3
449     1
1415    2
Name: NObeyesdad, dtype: int8

In [12]:
# Experiment with different test sizes: change test_size in the train_test_split call below, then re-run the training and evaluation cells that follow.

In [45]:
# Stratified hold-out split: 30% of the rows go to the test set, with class
# proportions of y preserved in both parts; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [32]:
# Train a One-vs-Rest (One-vs-All) model: the wrapper fits one binary logistic regression per class.
# NOTE(review): execution counts show this cell (32) ran before the split cell above (45),
# so the recorded accuracy below may come from a different test_size than the 0.3 shown.
model_ova = OneVsRestClassifier(
    estimator=LogisticRegression(max_iter=1000),
)
model_ova.fit(X_train, y_train)

In [33]:
# Predict test-set labels with the fitted One-vs-Rest model
y_pred_ova = model_ova.predict(X_test)

# Report the fraction of correctly labelled test rows as a percentage, rounded to 2 decimals
print("One-vs-All (OvA) Strategy")
print(f"Accuracy: {np.round(accuracy_score(y_true=y_test, y_pred=y_pred_ova) * 100, decimals=2)}%")

One-vs-All (OvA) Strategy
Accuracy: 76.12%


In [34]:
# Train a One-vs-One model: the wrapper fits one binary logistic regression per pair of classes.
model_ovo = OneVsOneClassifier(
    estimator=LogisticRegression(max_iter=1000),
)
model_ovo.fit(X_train, y_train)

In [35]:
# Predict test-set labels with the fitted One-vs-One model
y_pred_ovo = model_ovo.predict(X_test)

# Report the fraction of correctly labelled test rows as a percentage, rounded to 2 decimals
print("One-vs-One (OvO) Strategy")
print(f"Accuracy: {np.round(accuracy_score(y_true=y_test, y_pred=y_pred_ovo) * 100, decimals=2)}%")

One-vs-One (OvO) Strategy
Accuracy: 92.2%


In [None]:
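# Repeat the experiment with test_size=0.1 in train_test_split (re-run the split cell with that value before running the cells below).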

In [37]:
# Re-train the One-vs-Rest (One-vs-All) model: one binary logistic regression per class.
# NOTE(review): no train_test_split cell appears between the previous experiment and this one;
# the fit uses whatever X_train / y_train currently exist in the kernel.
model_ova = OneVsRestClassifier(
    estimator=LogisticRegression(max_iter=1000),
)
model_ova.fit(X_train, y_train)

In [38]:
# Predict test-set labels with the re-fitted One-vs-Rest model
y_pred_ova = model_ova.predict(X_test)

# Report the fraction of correctly labelled test rows as a percentage, rounded to 2 decimals
print("One-vs-All (OvA) Strategy")
print(f"Accuracy: {np.round(accuracy_score(y_true=y_test, y_pred=y_pred_ova) * 100, decimals=2)}%")

One-vs-All (OvA) Strategy
Accuracy: 75.94%


In [39]:
# Re-train the One-vs-One model: one binary logistic regression per pair of classes.
model_ovo = OneVsOneClassifier(
    estimator=LogisticRegression(max_iter=1000),
)
model_ovo.fit(X_train, y_train)

In [40]:
# Predict test-set labels with the re-fitted One-vs-One model
y_pred_ovo = model_ovo.predict(X_test)

# Report the fraction of correctly labelled test rows as a percentage, rounded to 2 decimals
print("One-vs-One (OvO) Strategy")
print(f"Accuracy: {np.round(accuracy_score(y_true=y_test, y_pred=y_pred_ovo) * 100, decimals=2)}%")

One-vs-One (OvO) Strategy
Accuracy: 90.57%


In [46]:
# Train the One-vs-Rest (One-vs-All) model: one binary logistic regression per class.
# Per the execution counts, this run (46) directly follows the split cell executed as 45 (test_size=0.3).
model_ova = OneVsRestClassifier(
    estimator=LogisticRegression(max_iter=1000),
)
model_ova.fit(X_train, y_train)

In [47]:
# Predict test-set labels with the fitted One-vs-Rest model
y_pred_ova = model_ova.predict(X_test)

# Report the fraction of correctly labelled test rows as a percentage, rounded to 2 decimals
print("One-vs-All (OvA) Strategy")
print(f"Accuracy: {np.round(accuracy_score(y_true=y_test, y_pred=y_pred_ova) * 100, decimals=2)}%")

One-vs-All (OvA) Strategy
Accuracy: 74.92%


In [48]:
# Train the One-vs-One model: one binary logistic regression per pair of classes.
model_ovo = OneVsOneClassifier(
    estimator=LogisticRegression(max_iter=1000),
)
model_ovo.fit(X_train, y_train)

In [49]:
# Predict test-set labels with the fitted One-vs-One model
y_pred_ovo = model_ovo.predict(X_test)

# Report the fraction of correctly labelled test rows as a percentage, rounded to 2 decimals
print("One-vs-One (OvO) Strategy")
print(f"Accuracy: {np.round(accuracy_score(y_true=y_test, y_pred=y_pred_ovo) * 100, decimals=2)}%")

One-vs-One (OvO) Strategy
Accuracy: 90.85%
