# Preprocessing

## Library Imports

In [None]:
import os
import pickle

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

## Importing the Dataset

In [None]:
# The dataset lives in the "datasets" folder next to the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
csv_path = f"{parent_dir}/datasets/obesityData.csv"

# ob_df --> obesity dataframe
ob_df = pd.read_csv(csv_path)

## Splitting the Dataset
The dataset is split into a training set (70% of the rows) and a test set (30% of the rows). The split is stratified by obesity level, so both sets keep approximately the same class proportions as the full dataset.

In [None]:
RAND_STATE = 0

# Separate the target column ("NObeyesdad") from the predictor columns.
y = ob_df["NObeyesdad"]
X = ob_df.drop(columns=["NObeyesdad"])

# 70/30 split, stratified on the target; the fixed seed makes it repeatable.
(
    X_train_unencoded,
    X_test_unencoded,
    y_train_unencoded,
    y_test_unencoded,
) = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    stratify=y,
    random_state=RAND_STATE,
)

In [None]:
# Preview the training features before any scaling or encoding.
X_train_unencoded

In [None]:
# Preview the test features before any scaling or encoding.
X_test_unencoded

In [None]:
# Preview the training labels before label encoding.
y_train_unencoded

In [None]:
# Preview the test labels before label encoding.
y_test_unencoded

## Feature Scaling
Standardisation is used to scale the numerical features. Note that the scaler is fitted on the training set only, not the test set, so that no information from the test set "leaks" into the preprocessing.

In [None]:
numerical_features = [
    "Age",
    "Height",
    "Weight",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE",
]

# Fit the scaler on the training rows only.
scaler = StandardScaler()
scaler.fit(X_train_unencoded[numerical_features])

# Scale a copy, so the unscaled training frame stays available for export.
scaled_train_values = scaler.transform(X_train_unencoded[numerical_features])
X_train = X_train_unencoded.copy()
X_train[numerical_features] = scaled_train_values
X_train

In [None]:
# Scale the test rows with the scaler that was fitted on the training set.
scaled_test_values = scaler.transform(X_test_unencoded[numerical_features])
X_test = X_test_unencoded.copy()
X_test[numerical_features] = scaled_test_values
X_test

## Categorical Feature Encoding
The categorical features need to be encoded. We will use label encoding and one-hot encoding.

### Target Feature
The target feature (obesity level) is encoded using label encoding.

In [None]:
# Fit the label encoder on every obesity-level label in the dataset.
# fit() returns the encoder itself, so it can be assigned directly; as an
# assignment, the cell produces no output.
target_le = LabelEncoder().fit(ob_df["NObeyesdad"])

In [None]:
def le_encode(y_unencoded):
    """Encode obesity-level labels as integers using the fitted ``target_le``.

    Parameters
    ----------
    y_unencoded : pandas.Series
        Labels to encode, in the form accepted by ``target_le.transform``.

    Returns
    -------
    pandas.Series
        Integer codes carrying the same index and name as ``y_unencoded``.
    """
    codes = target_le.transform(y_unencoded)
    # Build a fresh Series instead of writing the codes into a copy of the
    # label Series: that in-place write only works while the labels are stored
    # as ``object`` and is rejected by pandas' dedicated string dtypes.
    encoded = pd.Series(codes, index=y_unencoded.index, name=y_unencoded.name)
    return encoded.astype("int")

In [None]:
y_train = le_encode(y_train_unencoded)
y_train

In [None]:
y_test = le_encode(y_test_unencoded)
y_test

### Non-Target Features
We now encode the other categorical features. In this case, they all happen to be nominal, hence one-hot encoding is used.
The first encoded column of each feature is dropped because it is redundant: it is implied when all of that feature's remaining columns are 0.
First, we create the one-hot encoder:

In [None]:
# Categorical (non-target) columns that are one-hot encoded.
nominal_features = [
    "Gender", "family_history_with_overweight", "FAVC", "CAEC",
    "SMOKE", "SCC", "CALC", "MTRANS",
]

# drop="first" omits the first category of each feature, and
# sparse_output=False makes transform() return a dense array.
# The encoder is fitted on the full dataframe, so its categories are taken
# from every row of the dataset rather than from the training split alone.
nominal_ohe = OneHotEncoder(drop="first", sparse_output=False).fit(ob_df[nominal_features])

Next, we transform the categorical features' values in `X_train` and `X_test` using the encoder.
The original `X_train` and `X_test` dataframes are then modified to use the columns from the one-hot encoding. The old non-encoded features are dropped.

In [None]:
def ohe_transform(X):
    """Return a new dataframe in which the nominal columns of ``X`` are one-hot encoded.

    The columns listed in ``nominal_features`` are removed and the columns
    produced by ``nominal_ohe`` are appended on the right, sharing the index
    of ``X``. ``X`` itself is not modified.
    """
    # Wrap the encoder's array output in a dataframe aligned with X.
    encoded = pd.DataFrame(
        nominal_ohe.transform(X[nominal_features]),
        index=X.index,
        columns=nominal_ohe.get_feature_names_out(),
    )
    remaining = X.drop(columns=nominal_features)
    return pd.concat([remaining, encoded], axis=1)

In [None]:
# Swap the nominal columns of X_train for their one-hot encoded columns.
# NOTE: running this cell a second time fails, because the first run has
# already removed the nominal columns from X_train.
X_train = ohe_transform(X_train)
X_train

In [None]:
# Swap the nominal columns of X_test for their one-hot encoded columns.
# NOTE: running this cell a second time fails, because the first run has
# already removed the nominal columns from X_test.
X_test = ohe_transform(X_test)
X_test

## Export
Now, we export the preprocessed data to CSV files.

In [None]:
# Folder that receives every exported CSV: "<parent of cwd>/datasets".
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
datasets_folder = f"{_parent_dir}/datasets"


def save_df(df, filename):
    """Write ``df`` as CSV to ``filename`` inside ``datasets_folder``.

    ``df`` may be any object with a ``to_csv`` method (this notebook passes
    both dataframes and series). The destination path is printed afterwards.
    """
    destination = os.path.join(datasets_folder, filename)
    df.to_csv(destination)
    print(f"Dataframe saved to \"{destination}\"")

In [None]:
# Export the preprocessed (scaled and encoded) splits.
for frame, csv_name in (
    (X_train, "obesity_X_train.csv"),
    (X_test, "obesity_X_test.csv"),
    (y_train, "obesity_y_train.csv"),
    (y_test, "obesity_y_test.csv"),
):
    save_df(frame, csv_name)

The non-preprocessed data after splitting are also saved:

In [None]:
# Export the splits as they were before scaling and encoding.
for frame, csv_name in (
    (X_train_unencoded, "obesity_X_train_unencoded.csv"),
    (X_test_unencoded, "obesity_X_test_unencoded.csv"),
    (y_train_unencoded, "obesity_y_train_unencoded.csv"),
    (y_test_unencoded, "obesity_y_test_unencoded.csv"),
):
    save_df(frame, csv_name)

We also need to export the encoders so that they can be used in other notebooks. This is done through pickling:

In [None]:
def save_encoder(encoder, filename):
    """Pickle ``encoder`` to ``<parent of cwd>/encoders/<filename>``.

    Parameters
    ----------
    encoder : object
        Any picklable object; this notebook passes fitted scikit-learn
        transformers.
    filename : str
        Name of the pickle file to create inside the ``encoders`` folder.

    The ``encoders`` folder is created if it does not exist yet, and the
    destination path is printed once the file has been written.
    """
    file_path = f"{os.path.abspath(os.path.join(os.getcwd(), os.pardir))}/encoders/{filename}"
    # Make sure the target folder exists, so open() does not raise
    # FileNotFoundError when the folder is missing.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # The with-statement closes the file on exit; no explicit close() is needed.
    with open(file_path, "wb") as file:
        pickle.dump(encoder, file)
    print(f"Encoder saved to {file_path}")

In [None]:
# Persist the fitted scaler for use in other notebooks.
save_encoder(scaler, "scaler.pkl")

In [None]:
# Persist the fitted target label encoder for use in other notebooks.
save_encoder(target_le, "target_le.pkl")

In [None]:
# Persist the fitted one-hot encoder for use in other notebooks.
save_encoder(nominal_ohe, "nominal_ohe.pkl")