<a href="https://colab.research.google.com/github/AshAninze/Uni-Projects/blob/main/Estimation_of_Obesity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

This code implements a machine learning pipeline for predicting obesity levels. It includes data preprocessing, feature engineering, model training with hyperparameter tuning, and evaluation using cross-validation. The pipeline explores multiple models, including SVC, Random Forest, Logistic Regression, and Decision Tree, to identify the best-performing model for this classification task.

In [1]:
# importing all needed packages for ml pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, KFold, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from timeit import default_timer as timer
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")  # ignoring any warnings
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import os
current_directory = os.getcwd()  # gets the current directory to be able to find the data file


class DataSciencePipeline:
    """Build, tune and evaluate several classifiers on a tabular dataset.

    The stages are exposed as separate methods and are expected to be called
    in this order:

    1. ``feature_engineering`` - add the BMI column.
    2. ``preprocess_data`` - label-encode categorical columns, build X / y.
    3. ``split_data`` - train/test split.
    4. ``scale_data`` - standardise the features (fitted on the training split).
    5. ``train`` / ``hyperparameter_tuning`` / ``feature_importance``.
    """

    # Dataset location used when the caller does not supply ``data_path``.
    DEFAULT_DATA_PATH = ('/content/drive/MyDrive/Colab Notebooks/Postgrad/Comp4AI/'
                         'ObesityDataSet_raw_and_data_sinthetic.csv')

    def __init__(self, data_path=None):
        """Load the dataset and initialise every pipeline attribute.

        Args:
            data_path: Path of the CSV file to load. Defaults to
                ``DEFAULT_DATA_PATH`` so existing ``DataSciencePipeline()``
                calls keep working.

        Attributes:
            data: The pandas DataFrame holding the dataset.
            X: Features (independent variables) after preprocessing.
            y: Target variable (dependent variable).
            no_X: Separate copy of the features, never scaled.
            no_y: Target variable paired with ``no_X``.
            X_train: Training features (scaled once ``scale_data`` has run).
            X_test: Testing features (scaled once ``scale_data`` has run).
            y_train: Training target variable.
            y_test: Testing target variable.
            no_X_train: Unscaled training features, set by ``scale_data``.
            no_X_test: Unscaled testing features, set by ``scale_data``.
            label_encoders: Fitted LabelEncoder per encoded column.
            scaler: The fitted StandardScaler, set by ``scale_data``.
        """
        if data_path is None:
            data_path = self.DEFAULT_DATA_PATH
        self.data = pd.read_csv(data_path)
        self.X = None
        self.y = None
        self.no_X = None
        self.no_y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Initialised here so that calling train() too early fails with a
        # clear message instead of an AttributeError.
        self.no_X_train = None
        self.no_X_test = None
        self.label_encoders = {}
        self.scaler = None

    def preprocess_data(self, categorical_columns, numerical_columns, target_column='NObeyesdad'):
        """Label-encode the categorical columns and build the X / y attributes.

        Label encoding is used rather than one-hot encoding to avoid high
        dimensionality. No rounding or other transformation is applied to the
        numerical columns.

        Args:
            categorical_columns: Column names to label-encode. Include the
                target column here if it holds string labels.
            numerical_columns: Column names of the numerical features.
                Accepted for interface compatibility; currently unused.
            target_column: Name of the column to predict.

        Returns:
            None. Updates ``data``, ``X``, ``y``, ``no_X``, ``no_y`` and
            ``label_encoders`` in place and prints the first encoded rows.
        """
        self.label_encoders = {}
        for column in categorical_columns:
            encoder = LabelEncoder()
            self.data[column] = encoder.fit_transform(self.data[column].astype(str))
            # Keep the fitted encoder so encoded values can be mapped back to
            # the original labels later (encoder.inverse_transform).
            self.label_encoders[column] = encoder

        # update feature and target variables, create separate copies for unscaled data.
        self.X = self.data.drop(target_column, axis=1)
        self.y = self.data[target_column]
        self.no_X = self.data.drop(target_column, axis=1)
        self.no_y = self.data[target_column]

        print(self.data.head())  # print the encoded dataset

    def feature_engineering(self):
        """Add a ``BMI`` column: weight divided by height squared.

        Reads ``data['Weight']`` and ``data['Height']`` and writes
        ``data['BMI']``. Must run before ``preprocess_data`` for the BMI
        column to be part of the feature matrix.
        """
        self.data['BMI'] = self.data['Weight'] / (self.data['Height'] ** 2)

    def split_data(self, test_size=0.2, random_state=42):
        """Split ``X`` / ``y`` into training and testing sets.

        Args:
            test_size: The proportion of the dataset to include in the test split.
            random_state: Controls the shuffling applied to the data before applying the split.

        Raises:
            RuntimeError: If ``preprocess_data`` has not been called yet.
        """
        if self.X is None or self.y is None:
            raise RuntimeError("preprocess_data() must be called before split_data()")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )

    def scale_data(self):
        """Standardise the train/test features with a StandardScaler.

        The scaler is fitted on the training split only and then applied to
        the test split, so no test-set statistics leak into training. The
        unscaled splits are kept in ``no_X_train`` / ``no_X_test`` so models
        can be compared with and without scaling. Call once per split.

        Raises:
            RuntimeError: If ``split_data`` has not been called yet.
        """
        if self.X_train is None or self.X_test is None:
            raise RuntimeError("split_data() must be called before scale_data()")
        self.no_X_train = self.X_train
        self.no_X_test = self.X_test
        sc = StandardScaler()
        self.X_train = sc.fit_transform(self.X_train)
        self.X_test = sc.transform(self.X_test)
        self.scaler = sc

    def hyperparameter_tuning(self, models):
        """Tune each model with RandomizedSearchCV and report test metrics.

        For every model the search runs 50 sampled configurations with 5-fold
        cross-validation on the scaled training split. The best estimator is
        then scored on the test split and, to help spot overfitting,
        cross-validated (10 folds) on the training split.

        Args:
            models: Iterable of estimator instances. Supported types are SVC,
                RandomForestClassifier, LogisticRegression and
                DecisionTreeClassifier.

        Returns:
            A dictionary keyed by model class name. Each value is a dictionary
            with the keys ``best_estimator``, ``best_params``, ``cv_score``,
            ``accuracy``, ``precision``, ``recall``, ``f1`` and ``time``.

        Raises:
            ValueError: If a model has no search space defined here.
        """
        # scipy's uniform(loc, scale) samples from [loc, loc + scale].
        param_grids = {  # parameter ranges for each model
            SVC: {'C': uniform(0.1, 10), 'kernel': ['linear', 'rbf'], 'gamma': uniform(0.0001, 10)},
            RandomForestClassifier: {'n_estimators': randint(50, 200), 'max_depth': [None, 5, 10],
                                     'min_samples_split': randint(2, 10)},
            LogisticRegression: {'C': uniform(0.1, 10), 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga']},
            DecisionTreeClassifier: {'max_depth': [None, 5, 10], 'min_samples_split': randint(2, 10)}
        }

        models = list(models)
        # Validate up front so an unsupported model does not surface only
        # after the earlier (slow) searches have already run.
        for model in models:
            if type(model) not in param_grids:
                raise ValueError(
                    f"No hyperparameter search space defined for {model.__class__.__name__}")

        results = {}
        for model in models:
            param_distributions = param_grids[type(model)]
            random_search = RandomizedSearchCV(
                model, param_distributions, n_iter=50, cv=5, scoring='accuracy', random_state=42, n_jobs=-1
            )
            start_time = timer()
            random_search.fit(self.X_train, self.y_train)
            model_time = timer() - start_time
            best_model = random_search.best_estimator_

            # Cross-validate the *tuned* estimator on the scaled training split
            # only: scoring the untuned prototype on the full unscaled dataset
            # would include the test rows and say nothing about the tuned model.
            scores = cross_val_score(best_model, self.X_train, self.y_train, cv=10)
            average_score = scores.mean()

            y_pred = best_model.predict(self.X_test)
            acc = accuracy_score(self.y_test, y_pred)
            precision = precision_score(self.y_test, y_pred, average='weighted')
            recall = recall_score(self.y_test, y_pred, average='weighted')
            f1 = f1_score(self.y_test, y_pred, average='weighted')

            # printing model details
            print(f"Model: {model.__class__.__name__}")  # gets the name of the object(model)
            print("Best hyperparameters:", random_search.best_params_)
            print(f"Accuracy: {acc * 100:.2f}%\n")
            print(f"Precision: {precision * 100:.2f}%\n")
            print(f"Recall: {recall * 100:.2f}%\n")
            print(f"F1-score: {f1 * 100:.2f}%\n")
            print(f"Average CV accuracy: {average_score * 100:.2f}%\n")
            print(f"Timing: {model_time:.2f}s\n")

            results[model.__class__.__name__] = {
                'best_estimator': best_model,
                'best_params': random_search.best_params_,
                'cv_score': average_score,
                'accuracy': acc,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'time': model_time,
            }
        return results

    def _fit_and_score(self, model, X_train, X_test):
        """Fit ``model``, predict on ``X_test`` and return (accuracy, fit time in s).

        The predictions are also stored in ``self.y_pred``.
        """
        start_time = timer()
        model.fit(X_train, self.y_train)
        model_time = timer() - start_time
        self.y_pred = model.predict(X_test)
        return accuracy_score(self.y_test, self.y_pred), model_time

    def train(self, models):
        """Train every model on the unscaled and then on the scaled data.

        Args:
            models: The list of machine learning models to train. Each model
                is refitted in place, so after this call it holds the fit on
                the scaled data.

        Returns:
            A dictionary keyed by model class name. Each value maps
            ``'unscaled'`` and ``'scaled'`` to an ``(accuracy, seconds)`` tuple.

        Raises:
            RuntimeError: If ``scale_data`` has not been called yet.
        """
        if self.no_X_train is None or self.no_X_test is None:
            raise RuntimeError("scale_data() must be called before train()")

        results = {}
        for model in models:  # trains models on unscaled data to be able to compare the differences
            acc, model_time = self._fit_and_score(model, self.no_X_train, self.no_X_test)
            results.setdefault(model.__class__.__name__, {})['unscaled'] = (acc, model_time)
            print(
                f" Model: {model.__class__.__name__} Accuracy with no scaling: {acc * 100:.2f}% Timing: {model_time:.2f}s\n")

        for model in models:  # same models again, this time on the scaled data
            acc, model_time = self._fit_and_score(model, self.X_train, self.X_test)
            results.setdefault(model.__class__.__name__, {})['scaled'] = (acc, model_time)
            print(
                f" Model: {model.__class__.__name__} Accuracy with scaling: {acc * 100:.2f}% Timing: {model_time:.2f}s\n ")
        return results

    def feature_importance(self, models):
        """Plot a horizontal bar chart of feature importances for each model.

        Each model is fitted on the scaled training split. Models that do not
        expose ``feature_importances_`` after fitting are skipped with a
        printed notice.

        Args:
            models: Iterable of estimator instances.
        """
        for model in models:
            fitted = model.fit(self.X_train, self.y_train)
            importances = getattr(fitted, 'feature_importances_', None)
            if importances is None:
                print(f"Skipping {model.__class__.__name__}: no feature_importances_ attribute")
                continue
            featureIm = pd.DataFrame({'Feature': self.X.columns, 'Importance': importances})
            plt.figure(figsize=(10, 6))
            plt.barh(featureIm['Feature'], featureIm['Importance'])
            plt.xlabel('Importance')
            plt.ylabel('Feature')
            plt.title('Feature Importances')
            plt.show()


def main():
    """
    Main function to run the data science pipeline.

    Runs the stages in order: BMI feature engineering, label encoding,
    train/test split, scaling, baseline training (unscaled vs. scaled) and
    finally hyperparameter tuning of every model.
    """
    models = [SVC(), RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier()]
    categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS',
                           'NObeyesdad']
    numerical_columns = ['Age', 'NCP', 'CH2O', 'FAF', 'TUE', 'BMI']
    pipeline = DataSciencePipeline()
    # BMI is added before preprocessing so that it is part of the features.
    pipeline.feature_engineering()
    pipeline.preprocess_data(categorical_columns, numerical_columns)
    pipeline.split_data()
    pipeline.scale_data()
    pipeline.train(models)
    pipeline.hyperparameter_tuning(models)


# Guarded so that importing this module does not run the whole pipeline;
# in a notebook or when run as a script __name__ is "__main__" and it still runs.
if __name__ == "__main__":
    main()


   Gender   Age  Height  Weight  family_history_with_overweight  FAVC  FCVC  \
0       0  21.0    1.62    64.0                               1     0   2.0   
1       0  21.0    1.52    56.0                               1     0   3.0   
2       1  23.0    1.80    77.0                               1     0   2.0   
3       1  27.0    1.80    87.0                               0     0   3.0   
4       1  22.0    1.78    89.8                               0     0   2.0   

   NCP  CAEC  SMOKE  CH2O  SCC  FAF  TUE  CALC  MTRANS  NObeyesdad        BMI  
0  3.0     2      0   2.0    0  0.0  1.0     3       3           1  24.386526  
1  3.0     2      1   3.0    1  3.0  0.0     2       3           1  24.238227  
2  3.0     2      0   2.0    0  2.0  1.0     1       3           1  23.765432  
3  3.0     2      0   2.0    0  2.0  0.0     1       4           5  26.851852  
4  1.0     2      0   2.0    0  0.0  0.0     2       3           6  28.342381  
 Model: SVC Accuracy with no scaling: 74.47% 