# Obesity Levels Dataset insights and obesity prediction

In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils.multiclass import unique_labels
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, confusion_matrix

In [None]:
# Seed passed as `random_state` to the train/test split and to the estimators that accept one
SEED = 42
# Width (`ncols`) of the tqdm progress bar shown while fitting model candidates
PROGRESS_BAR_LENGTH = 100

DATASET_FILENAME = Path("data") / "ObesityDataSet_raw_and_data_sinthetic.csv"
# The preprocessed splits are written with DataFrame.to_feather and read back with
# pd.read_feather, so the files carry a matching ".feather" extension rather than a
# misleading ".csv" one.
TRAIN_DATASET_FILENAME = Path("data") / "preprocessed" / "obesity_train_dataset.feather"
TEST_DATASET_FILENAME = Path("data") / "preprocessed" / "obesity_test_dataset.feather"

# Columns that get one-hot encoded before training
# ("Age_Category" is derived from "Age" in the feature engineering step)
CATEGORICAL_COLUMNS = [
    "Gender",
    "family_history_with_overweight",
    "FAVC",
    "CAEC",
    "SMOKE",
    "SCC",
    "CALC",
    "MTRANS",
    "Age_Category",
]

# Data analysis and preprocessing

## Load dataset

In [None]:
# Read CSV file; a missing file is reported with instructions instead of a bare traceback
try:
    full_dataset = pd.read_csv(DATASET_FILENAME)
except FileNotFoundError as err:
    # TODO: add the direct download link to this message
    raise FileNotFoundError(
        f"Dataset file '{DATASET_FILENAME}' was not found. Download the "
        "'Estimation of Obesity Levels Based On Eating Habits and Physical Condition' "
        "dataset from the UCI Machine Learning Repository and save the CSV at that path."
    ) from err
print(f"Dataset shape: {full_dataset.shape}")
# Last expression of the cell: the notebook renders the first rows
full_dataset.head()

In [None]:
# Summary statistics of the dataset (pandas `describe` with its default arguments)
full_dataset.describe()

## Preprocess data

In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence of each
full_dataset.drop_duplicates(subset=None, keep="first", inplace=True)

## Feature engineering

In [None]:
# BMI: weight divided by squared height
full_dataset["BMI"] = full_dataset["Weight"].div(full_dataset["Height"].pow(2))
# Age buckets with right-inclusive edges: (0, 18] -> Young, (18, 60] -> Adult, (60, inf) -> Elderly
full_dataset["Age_Category"] = pd.cut(
    full_dataset["Age"],
    bins=[0, 18, 60, float("inf")],
    labels=["Young", "Adult", "Elderly"],
)
# Water consumption (CH2O) relative to body weight
full_dataset["Water_Intake_Per_Kg"] = full_dataset["CH2O"].div(full_dataset["Weight"])

## Data insights

### Full data distributions

In [None]:
# One histogram (with KDE overlay) per column, arranged on a square grid that is
# just large enough to hold every column. `plotsize` is reused by the per-gender plot.
plotsize = math.ceil(math.sqrt(len(full_dataset.columns)))
plt.figure(layout="compressed", figsize=(12, 12))

for position, column_name in enumerate(full_dataset.columns, start=1):
    ax = plt.subplot(plotsize, plotsize, position)
    sns.histplot(
        full_dataset[column_name],
        kde=True,
        color="skyblue",
        ls="-",
        lw=1,
        edgecolor="gray",
        ax=ax,
    )
    # Rotate tick labels so long category names stay readable
    plt.xticks(rotation=90)
plt.show()

### Data distributions separated by gender

In [None]:
# Same grid of histograms as above, but with the "Male" and "Female" rows overlaid
male_dataset = full_dataset.loc[full_dataset["Gender"] == "Male"]
female_dataset = full_dataset.loc[full_dataset["Gender"] == "Female"]

plt.figure(layout="compressed", figsize=(12, 12))

for position, column_name in enumerate(full_dataset.columns, start=1):
    ax = plt.subplot(plotsize, plotsize, position)
    # Male is drawn first (yellow), female on top of it (skyblue); both semi-transparent
    for subset, colour in ((male_dataset, "yellow"), (female_dataset, "skyblue")):
        sns.histplot(
            subset[column_name],
            kde=True,
            color=colour,
            alpha=0.6,
            ls="-",
            lw=1,
            edgecolor="gray",
            ax=ax,
        )
    plt.xticks(rotation=90)
plt.show()

### Data distributions of categorical features

In [None]:
def display_categorical_distribution(df, column):
    """Show the value counts of ``df[column]`` as a donut chart next to a bar chart.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column whose value counts are plotted; also used as
            the title of the bar chart.
    """
    counts = df[column].value_counts()
    two_tone = ("skyblue", "yellow")

    plt.figure(layout="compressed", figsize=(6, 6))

    # Left panel: pie chart with a white disc drawn over its centre (donut look)
    donut_ax = plt.subplot(1, 2, 1)
    donut_ax.pie(counts, autopct='%0.001f%%', pctdistance=0.85, colors=list(two_tone))
    donut_ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    # Right panel: one bar per distinct value
    bar_ax = plt.subplot(1, 2, 2)
    sns.barplot(
        x=counts.index,
        y=counts.values,
        hue=counts.index,
        palette=list(two_tone),
        ax=bar_ax,
    )
    plt.xticks(rotation=90)

    plt.title(column)
    plt.show()

In [None]:
# One figure per categorical feature, followed by one for the "NObeyesdad" label column
for col in (*CATEGORICAL_COLUMNS, "NObeyesdad"):
    display_categorical_distribution(full_dataset, col)

### Correlation analysis

In [None]:
# Pairwise correlation restricted to the numeric columns
numeric_features = full_dataset.select_dtypes(include='number')
correlation_matrix = numeric_features.corr()

plt.figure(layout="compressed", figsize=(12, 12))
# Annotate every cell with its coefficient rounded to two decimals
sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f")
plt.title('Correlation matrix')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

## Split and save train and test datasets

In [None]:
# Split dataset: 10% of the rows are held out as the test set, shuffled with a fixed seed
train_dataset, test_dataset = train_test_split(full_dataset, test_size=0.1, random_state=SEED)

# Dropping duplicates and shuffling leave both splits with a non-default index.
# DataFrame.to_feather documents a default-index requirement (older pandas versions
# raise ValueError otherwise), so the index is reset before saving. The old row
# labels are not used later and are discarded.
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

# Make sure the output directories exist
TRAIN_DATASET_FILENAME.parent.mkdir(parents=True, exist_ok=True)
TEST_DATASET_FILENAME.parent.mkdir(parents=True, exist_ok=True)

train_dataset.to_feather(TRAIN_DATASET_FILENAME)
test_dataset.to_feather(TEST_DATASET_FILENAME)

# Model training and validation

## Load train and test datasets

In [None]:
# Reload both splits from the Feather files written by the previous section
train_dataset, test_dataset = (
    pd.read_feather(split_path)
    for split_path in (TRAIN_DATASET_FILENAME, TEST_DATASET_FILENAME)
)

## Data normalization and encoding

In [None]:
def scale_dataset(df):
    """Return a copy of ``df`` whose float columns have been standardised.

    Each column with dtype ``'float'`` is passed on its own through
    ``StandardScaler.fit_transform``, so the scaler is refitted per column and
    per call. Columns of any other dtype are copied unchanged, and the input
    frame itself is not modified.

    Args:
        df: DataFrame to scale.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    standardised = df.copy()
    standardiser = StandardScaler()

    for name in df.columns:
        if df[name].dtype != 'float':
            continue
        # Double brackets keep the 2-D shape the scaler expects
        standardised[name] = standardiser.fit_transform(standardised[[name]])

    return standardised

In [None]:
def encode_dataset(df, categorical_columns):
    """Return a copy of ``df`` with ``categorical_columns`` one-hot encoded.

    The listed columns are replaced by indicator columns named
    ``<column>_<value>``, appended after the remaining columns. The input
    frame is left untouched.

    Args:
        df: DataFrame to encode.
        categorical_columns: Names of the columns to turn into indicator columns.

    Returns:
        A new DataFrame without the original categorical columns.
    """
    source = df.copy()
    indicators = pd.get_dummies(source[categorical_columns], prefix=categorical_columns)
    combined = pd.concat([source, indicators], axis=1)
    return combined.drop(categorical_columns, axis=1)

In [None]:
# Standardise the float columns with statistics learned from the TRAINING split only and
# apply that same transformation to the test split. scale_dataset() refits a scaler on
# whatever frame it receives, which would scale the test split with its own mean/std and
# leave the two splits on inconsistent scales, so the scaler is handled explicitly here.
float_columns = [col for col in train_dataset.columns if train_dataset[col].dtype == 'float']

train_dataset = train_dataset.copy()
test_dataset = test_dataset.copy()
if float_columns:  # StandardScaler cannot be fitted on zero features
    scaler = StandardScaler().fit(train_dataset[float_columns])
    train_dataset[float_columns] = scaler.transform(train_dataset[float_columns])
    test_dataset[float_columns] = scaler.transform(test_dataset[float_columns])

train_dataset = encode_dataset(train_dataset, CATEGORICAL_COLUMNS)
test_dataset = encode_dataset(test_dataset, CATEGORICAL_COLUMNS)

# Map the class names to integer labels; the mapping is learned on the training labels
# and reused for the test labels
encoder = LabelEncoder()
train_dataset["NObeyesdad"] = encoder.fit_transform(train_dataset["NObeyesdad"])
test_dataset["NObeyesdad"] = encoder.transform(test_dataset["NObeyesdad"])

## Create common columns after encoding

In [None]:
# One-hot encoding each split separately can yield different indicator columns when a
# category value occurs in only one split. Missing indicators are added as all-False.
# Ordered lists are used instead of sets so that the resulting column (feature) order is
# identical on every run; iterating a set of strings depends on hash randomisation.

# Find columns present in test_dataset but not in train_dataset
missing_in_train = [col for col in test_dataset.columns if col not in train_dataset.columns]
# Add missing columns to train_dataset
for col in missing_in_train:
    train_dataset[col] = False

# Find columns present in train_dataset but not in test_dataset
missing_in_test = [col for col in train_dataset.columns if col not in test_dataset.columns]
# Add missing columns to test_dataset
for col in missing_in_test:
    test_dataset[col] = False

# Ensure the columns are in the same order
train_dataset = train_dataset[test_dataset.columns]

## Split features and labels

In [None]:
# "NObeyesdad" is the label column; every other column is a feature
X_train, y_train = train_dataset.drop(columns="NObeyesdad"), train_dataset["NObeyesdad"]
X_test, y_test = test_dataset.drop(columns="NObeyesdad"), test_dataset["NObeyesdad"]

## Train models

In [None]:
def fit_and_find_best_model(list_of_models, X_train, y_train, X_test, y_test):
    """Fit every candidate model and return the accuracy each one reaches.

    Each model is fitted in place on ``X_train``/``y_train`` and then scored with
    ``accuracy_score`` on ``X_test``/``y_test``. A tqdm progress bar tracks the loop.

    Args:
        list_of_models: Estimators exposing ``fit(X=..., y=...)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features used to score each fitted model.
        y_test: Labels used to score each fitted model.

    Returns:
        A list of accuracies, one per model, in the same order as ``list_of_models``
        (so ``np.argmax`` of the result indexes the best candidate).

    NOTE(review): the callers in this notebook pass the test split here and then
    report the winner's score on that same split, which makes the reported accuracy
    optimistic; a separate validation split would avoid that.
    """
    accuracies = []

    for model in tqdm(list_of_models, ncols=PROGRESS_BAR_LENGTH):
        model.fit(X=X_train, y=y_train)
        accuracies.append(accuracy_score(y_test, model.predict(X_test)))

    # The scores must be returned: callers pick the best model with np.argmax(accuracies)
    return accuracies

### XGBoost

In [None]:
# Hyper-parameter values explored for XGBoost; one classifier is built per combination
param_grid = {
    "n_estimators": [*range(10, 101, 5)],
    "max_depth": [*range(2, 16, 2)],
    "min_child_weight": [1, 3, 5, 7],
    "booster": ["gbtree", "gblinear", "dart"],
    "learning_rate": [0.05, 0.10, 0.15],
}

# Every grid key is a constructor keyword, so each combination is expanded directly
xgb_models = [
    XGBClassifier(random_state=SEED, **params)
    for params in ParameterGrid(param_grid)
]

In [None]:
# Fit every XGBoost candidate; the next cell indexes `xgb_models` with np.argmax(accuracies)
accuracies = fit_and_find_best_model(xgb_models, X_train, y_train, X_test, y_test)

In [None]:
# np.argmax returns the first index of the maximum, i.e. the earliest best candidate
best_index = np.argmax(accuracies)
best_model = xgb_models[best_index]

# Validate on test data
y_pred = best_model.predict(X_test)
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
print("Classification Report: ")
print(classification_report(y_test, y_pred, target_names=list(encoder.classes_)))
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Divide each row by its total so a row shows how one true class was distributed
cmn = cm / cm.sum(axis=1, keepdims=True)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=encoder.classes_)
cm_display.plot()
plt.xticks(rotation=90)
plt.grid(False)
plt.show()

### LightGBM

In [None]:
# Hyper-parameter values explored for LightGBM; one classifier is built per combination
param_grid = {
    "n_estimators": [*range(10, 101, 5)],
    "max_depth": [*range(2, 16, 2)],
    "min_child_weight": [1, 3, 5, 7],
    "boosting_type": ["gbdt", "dart"],
    "learning_rate": [0.05, 0.10, 0.15],
}

# Every grid key is a constructor keyword, so each combination is expanded directly
lgbm_models = [
    LGBMClassifier(random_state=SEED, **params)
    for params in ParameterGrid(param_grid)
]

In [None]:
# Fit every LightGBM candidate; the next cell indexes `lgbm_models` with np.argmax(accuracies)
accuracies = fit_and_find_best_model(lgbm_models, X_train, y_train, X_test, y_test)

In [None]:
# np.argmax returns the first index of the maximum, i.e. the earliest best candidate
best_index = np.argmax(accuracies)
best_model = lgbm_models[best_index]

# Validate on test data
y_pred = best_model.predict(X_test)
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
print("Classification Report: ")
print(classification_report(y_test, y_pred, target_names=list(encoder.classes_)))
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Divide each row by its total so a row shows how one true class was distributed
cmn = cm / cm.sum(axis=1, keepdims=True)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=encoder.classes_)
cm_display.plot()
plt.xticks(rotation=90)
plt.grid(False)
plt.show()

### CatBoost

In [None]:
# Hyper-parameter values explored for CatBoost; one classifier is built per combination
param_grid = {
    "iterations": [100, 250, 500, 1000],
    "depth": [*range(1, 11)],
    "l2_leaf_reg": [1, 3, 5, 10, 100],
    "loss_function": ["MultiClass", "MultiClassOneVsAll"],
    "border_count": [5, 10, 32, 50, 100],
    "learning_rate": [0.001, 0.01, 0.1, 0.3],
}

# Grid keys are expanded as constructor keywords; the number of classes comes from
# the fitted label encoder
cb_models = [
    CatBoostClassifier(
        classes_count=len(encoder.classes_),
        random_state=SEED,
        **params,
    )
    for params in ParameterGrid(param_grid)
]

In [None]:
# Fit every CatBoost candidate; the next cell indexes `cb_models` with np.argmax(accuracies)
accuracies = fit_and_find_best_model(cb_models, X_train, y_train, X_test, y_test)

In [None]:
# np.argmax returns the first index of the maximum, i.e. the earliest best candidate
best_index = np.argmax(accuracies)
best_model = cb_models[best_index]

# Validate on test data
y_pred = best_model.predict(X_test)
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
print("Classification Report: ")
print(classification_report(y_test, y_pred, target_names=list(encoder.classes_)))
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Divide each row by its total so a row shows how one true class was distributed
cmn = cm / cm.sum(axis=1, keepdims=True)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=encoder.classes_)
cm_display.plot()
plt.xticks(rotation=90)
plt.grid(False)
plt.show()

### RandomForest

In [None]:
# Hyper-parameter values explored for the random forest; one classifier per combination
param_grid = {
    "n_estimators": [*range(10, 101, 5)],
    "criterion": ["entropy", "gini", "log_loss"],
    "max_depth": [*range(2, 16, 2)],
}

# Every grid key is a constructor keyword, so each combination is expanded directly
rf_models = [
    RandomForestClassifier(random_state=SEED, **params)
    for params in ParameterGrid(param_grid)
]

In [None]:
# Fit every random forest candidate; the next cell indexes `rf_models` with np.argmax(accuracies)
accuracies = fit_and_find_best_model(rf_models, X_train, y_train, X_test, y_test)

In [None]:
# np.argmax returns the first index of the maximum, i.e. the earliest best candidate
best_index = np.argmax(accuracies)
best_model = rf_models[best_index]

# Validate on test data
y_pred = best_model.predict(X_test)
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
print("Classification Report: ")
print(classification_report(y_test, y_pred, target_names=list(encoder.classes_)))
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Divide each row by its total so a row shows how one true class was distributed
cmn = cm / cm.sum(axis=1, keepdims=True)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=encoder.classes_)
cm_display.plot()
plt.xticks(rotation=90)
plt.grid(False)
plt.show()

### Support Vector Machine

In [None]:
# Hyper-parameter values explored for the SVM: ten evenly spaced C values in [0.1, 1]
# crossed with four kernels
param_grid = {
    "C": [*np.linspace(0.1, 1, 10)],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
}

# Every grid key is a constructor keyword, so each combination is expanded directly
svc_models = [
    SVC(random_state=SEED, **params)
    for params in ParameterGrid(param_grid)
]

In [None]:
# Fit every SVM candidate; the next cell indexes `svc_models` with np.argmax(accuracies)
accuracies = fit_and_find_best_model(svc_models, X_train, y_train, X_test, y_test)

In [None]:
# np.argmax returns the first index of the maximum, i.e. the earliest best candidate
best_index = np.argmax(accuracies)
best_model = svc_models[best_index]

# Validate on test data
y_pred = best_model.predict(X_test)
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
print("Classification Report: ")
print(classification_report(y_test, y_pred, target_names=list(encoder.classes_)))
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Divide each row by its total so a row shows how one true class was distributed
cmn = cm / cm.sum(axis=1, keepdims=True)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=encoder.classes_)
cm_display.plot()
plt.xticks(rotation=90)
plt.grid(False)
plt.show()

### k-NN

In [None]:
# Hyper-parameter values explored for k-NN; one classifier is built per combination
param_grid = {
    "n_neighbors": [*range(1, 31, 3)],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "metric": ["cityblock", "euclidean", "l1", "l2", "manhattan", "minkowski"],
}

# Every grid key is a constructor keyword, so each combination is expanded directly
knn_models = [
    KNeighborsClassifier(**params)
    for params in ParameterGrid(param_grid)
]

In [None]:
# Fit every k-NN candidate; the next cell indexes `knn_models` with np.argmax(accuracies)
accuracies = fit_and_find_best_model(knn_models, X_train, y_train, X_test, y_test)

In [None]:
# np.argmax returns the first index of the maximum, i.e. the earliest best candidate
best_index = np.argmax(accuracies)
best_model = knn_models[best_index]

# Validate on test data
y_pred = best_model.predict(X_test)
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
print("Classification Report: ")
print(classification_report(y_test, y_pred, target_names=list(encoder.classes_)))
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Divide each row by its total so a row shows how one true class was distributed
cmn = cm / cm.sum(axis=1, keepdims=True)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=encoder.classes_)
cm_display.plot()
plt.xticks(rotation=90)
plt.grid(False)
plt.show()

### Logistic regression

In [None]:
# Hyper-parameter values explored for logistic regression: ten evenly spaced C values
# in [0.1, 1] crossed with six solvers
param_grid = {
    "C": [*np.linspace(0.1, 1, 10)],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
}

# Every grid key is a constructor keyword, so each combination is expanded directly
lr_models = [
    LogisticRegression(random_state=SEED, **params)
    for params in ParameterGrid(param_grid)
]

In [None]:
# Fit every logistic regression candidate; the next cell indexes `lr_models` with np.argmax(accuracies)
accuracies = fit_and_find_best_model(lr_models, X_train, y_train, X_test, y_test)

In [None]:
# np.argmax returns the first index of the maximum, i.e. the earliest best candidate
best_index = np.argmax(accuracies)
best_model = lr_models[best_index]

# Validate on test data
y_pred = best_model.predict(X_test)
print(f"Accuracy = {accuracy_score(y_test, y_pred)}")
print("Classification Report: ")
print(classification_report(y_test, y_pred, target_names=list(encoder.classes_)))
print("Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Divide each row by its total so a row shows how one true class was distributed
cmn = cm / cm.sum(axis=1, keepdims=True)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=encoder.classes_)
cm_display.plot()
plt.xticks(rotation=90)
plt.grid(False)
plt.show()