# Data Training

## Library Imports

In [None]:
import os

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

## Importing the Dataset

In [None]:
# Locate the CSV under ./datasets relative to the current working directory.
working_dir = os.getcwd()
csv_path = f"{working_dir}/datasets/obesityData.csv"

# ob_df --> obesity dataframe
ob_df = pd.read_csv(csv_path)

## Preprocessing

### Splitting the Dataset

In [None]:
# Seed passed to train_test_split so the shuffle is repeatable.
RAND_STATE = 0

# "NObeyesdad" is the target; every other column is a predictor.
y = ob_df["NObeyesdad"]
X = ob_df.drop("NObeyesdad", axis=1)

# 70/30 split, stratified on the target so both parts keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=RAND_STATE,
    stratify=y,
)

In [None]:
# Preview the training features produced by the split.
X_train

In [None]:
# Preview the held-out test features produced by the split.
X_test

In [None]:
# Preview the training targets (still the raw "NObeyesdad" values at this point).
y_train

In [None]:
# Preview the test targets (still the raw "NObeyesdad" values at this point).
y_test

### Feature Scaling

In [None]:
# Columns that get standardized; every other column is left for the encoders.
numerical_features = [
    "Age",
    "Height",
    "Weight",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE",
]

# The scaler's statistics come from the training rows only, so nothing about
# the test rows "leaks" into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

# Apply the same train-derived scaling to both splits, in place.
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
X_train[numerical_features] = scaler.transform(X_train[numerical_features])

In [None]:
# Preview the training features after the numerical columns were scaled.
X_train

In [None]:
# Preview the test features after the numerical columns were scaled.
X_test

### Categorical Feature Encoding

#### Target Feature

In [None]:
# Learn the label -> integer mapping from the full target column so that the
# train and test targets share one encoding.
le = LabelEncoder()
le.fit(ob_df["NObeyesdad"])

# Build new Series from the encoded arrays instead of assigning into
# `y_train[:]` / `y_test[:]`. Slice assignment writes the integers into the
# existing Series and keeps its original dtype; if the raw labels are strings
# (object dtype -- TODO confirm against the CSV) the result is an object-dtype
# column of ints, which downstream estimators may reject as an unknown label
# type, and which a strict string dtype may refuse to store at all.
# Index and name are carried over so rows still align with X_train / X_test.
y_train = pd.Series(le.transform(y_train), index=y_train.index, name=y_train.name)
y_test = pd.Series(le.transform(y_test), index=y_test.index, name=y_test.name)

In [None]:
# Preview the training targets after label encoding.
y_train

In [None]:
# Preview the test targets after label encoding.
y_test

#### Non-Target Features

In [None]:
# Categorical predictors that are one-hot encoded.
nominal_features = [
    "Gender", "family_history_with_overweight", "FAVC", "CAEC",
    "SMOKE", "SCC", "CALC", "MTRANS",
]

# drop="first" omits one indicator column per feature; sparse_output=False
# makes transform() return a dense array. The encoder is fitted on the full
# dataframe, so its category lists cover every value present in either split.
# (The result is assigned, so the cell displays nothing.)
ohe = OneHotEncoder(sparse_output=False, drop="first").fit(ob_df[nominal_features])

In [None]:
# Encode the nominal columns of the training split.
ohe_train_transformed = ohe.transform(X_train[nominal_features])

# Wrap the array in a frame that reuses X_train's index so the concat below
# lines the rows up correctly.
pd_train_transformed = pd.DataFrame(
    data=ohe_train_transformed,
    index=X_train.index,
    columns=ohe.get_feature_names_out(),
)

# Replace the raw nominal columns with their one-hot counterparts.
X_train = pd.concat(
    [X_train.drop(columns=nominal_features), pd_train_transformed],
    axis="columns",
)

In [None]:
# Encode the nominal columns of the test split with the already-fitted encoder.
# The intermediates are named *_test_* (the original reused the *_train_* names,
# which mislabeled test data and overwrote the training intermediates).
ohe_test_transformed = ohe.transform(X_test[nominal_features])

# Reuse X_test's index so the concat below aligns rows correctly.
pd_test_transformed = pd.DataFrame(
    ohe_test_transformed,
    columns=ohe.get_feature_names_out(),
    index=X_test.index,
)

# Replace the raw nominal columns with their one-hot counterparts.
X_test = pd.concat([X_test.drop(nominal_features, axis=1), pd_test_transformed], axis=1)

In [None]:
# Preview the fully preprocessed training features (scaled and one-hot encoded).
X_train

In [None]:
# Preview the fully preprocessed test features (scaled and one-hot encoded).
X_test