In [1]:
import os

In [2]:
%pwd

'/Users/anishmitagar/Documents/repos/summer_2023_projects/end-to-end-diabetes-project/research'

In [3]:
# This notebook lives in <project>/research, while the relative paths used by the
# later cells (config/config.yaml, params.yaml, artifacts/...) resolve from the
# project root. Step up one level only while we are still inside research/, so the
# cell is portable across machines and harmless when re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [4]:
%pwd

'/Users/anishmitagar/Documents/repos/summer_2023_projects/end-to-end-diabetes-project'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from box.config_box import ConfigBox


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``,
    which copies the path entries from the ``model_trainer`` section of the
    config file and the ``*_params`` entries from the params file.

    NOTE(review): the path fields are annotated ``Path`` but are filled with
    whatever the loaded config holds (presumably plain strings) -- confirm
    before relying on ``Path`` methods.
    """
    # Working directory of the stage; created when the config is built.
    root_dir: Path
    # CSV file read by ModelTrainer.train (must contain a 'diabetes' column).
    data_path: Path
    # Directory for the model with the best accuracy; created when the config is built.
    save_path_best_acc: Path
    # Directory for the model with the best F1 score; created when the config is built.
    save_path_best_f1: Path
    # Neural-network grid: read as .epochs, .lrs and .batch_sizes.
    neural_network_params: ConfigBox
    # Random-forest grid: read as .n_estimators, .criterion and .max_depth.
    random_forest_params: ConfigBox
    # XGBoost grid: read as .n_estimators, .learning_rate and .max_depth.
    xg_boost_params: ConfigBox
    # CatBoost grid: read as .iterations, .learning_rate and .depth.
    cat_boost_params: ConfigBox
    # AdaBoost grid: read as .n_estimators and .learning_rate.
    ada_boost_params: ConfigBox

In [6]:
from mlClassifier.constants import *
from mlClassifier.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project config and params files and hands out stage configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the config file (paths per stage).
            params_filepath: Location of the params file (hyper-parameter grids).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the model-trainer directories and return the stage settings.

        Returns:
            A ``ModelTrainerConfig`` whose paths come from the ``model_trainer``
            section of the config file and whose grids come from the params file.
        """
        trainer_section = self.config.model_trainer
        grids = self.params

        # One call per directory, in the order: stage root, best-accuracy, best-F1.
        for directory in (
            trainer_section.root_dir,
            trainer_section.save_path_best_acc,
            trainer_section.save_path_best_f1,
        ):
            create_directories([directory])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            save_path_best_acc=trainer_section.save_path_best_acc,
            save_path_best_f1=trainer_section.save_path_best_f1,
            neural_network_params=grids.neural_network_params,
            random_forest_params=grids.random_forest_params,
            xg_boost_params=grids.xg_boost_params,
            cat_boost_params=grids.cat_boost_params,
            ada_boost_params=grids.ada_boost_params,
        )


In [8]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold
import torch.optim as optim
import tqdm
import copy
from sklearn import metrics 

In [9]:
class Deep(nn.Module):
    """Fully connected binary classifier: three hidden ReLU layers, sigmoid output.

    Args:
        in_features: Number of input features per sample. Defaults to 15, the
            width that used to be hard-coded.
        hidden_features: Width of each of the three hidden layers. Defaults to 15.

    The forward pass maps a ``(batch, in_features)`` float tensor to a
    ``(batch, 1)`` tensor of values in ``[0, 1]``.
    """

    def __init__(self, in_features=15, hidden_features=15):
        super().__init__()
        # Attribute names and creation order are part of the contract: they fix
        # the state_dict keys of saved checkpoints and the order in which the
        # RNG is consumed when the layers are initialised under a fixed seed.
        self.layer1 = nn.Linear(in_features, hidden_features)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_features, hidden_features)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(hidden_features, hidden_features)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(hidden_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

In [10]:
def weights_init(m):
    """Fill the weight matrix of an ``nn.Linear`` module with Xavier-uniform values.

    Written to be passed to ``Module.apply``: modules of any other type are
    skipped, and biases are never modified.
    """
    if not isinstance(m, nn.Linear):
        return
    linear_weights = m.weight.data
    nn.init.xavier_uniform_(linear_weights)

In [11]:
def model_train(device, model, X_train, y_train, X_val, y_val, epochs, lr, batch_size):
    """Train ``model`` with Adam + binary cross-entropy and track the best epochs.

    After every epoch the model is scored on the validation split (accuracy,
    F1, precision, recall of the rounded predictions). For each of the four
    metrics the epoch with the highest value is remembered together with the
    other three metrics at that epoch and a copy of the weights.

    Args:
        device: torch device the model and the batches are moved to.
        model: Network whose output is a probability (it is fed to ``BCELoss``).
        X_train, y_train: Training tensors, sliced along dim 0 into mini-batches.
            # assumes y has the same shape as the model output -- TODO confirm
        X_val, y_val: Validation tensors, evaluated in one pass.
        epochs: Number of passes over the training data.
        lr: Adam learning rate.
        batch_size: Mini-batch size.

    Returns:
        A 5-tuple ``(table, w_acc, w_f1, w_precision, w_recall)`` where
        ``table`` is a list of four rows::

            [best_acc,       f1,  precision, recall]
            [best_f1,        acc, precision, recall]
            [best_precision, acc, f1,        recall]
            [best_recall,    acc, f1,        precision]

        and each ``w_*`` is the state dict captured at the corresponding best
        epoch. With ``epochs == 0`` every row is ``[-inf, None, None, None]``
        and every ``w_*`` is ``None``. State dicts captured at the same epoch
        may be the same object; treat them as read-only.
    """
    model.to(device)
    X_val = X_val.to(device)
    y_val = y_val.to(device)

    loss_fn = nn.BCELoss()  # binary cross entropy
    optimizer = optim.Adam(model.parameters(), lr=lr)
    batch_start = torch.arange(0, len(X_train), batch_size)

    # Canonical metric order. Each row of the result is the tracked metric
    # followed by the remaining metrics in this same order.
    metric_names = ("acc", "f1", "precision", "recall")
    best_rows = {name: [-np.inf, None, None, None] for name in metric_names}
    best_weights = {name: None for name in metric_names}

    for epoch in range(epochs):
        model.train()
        with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = X_train[start:start + batch_size].to(device)
                y_batch = y_train[start:start + batch_size].to(device)
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass and weight update
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # progress info (the bar itself is disabled)
                acc = (y_pred.round() == y_batch).float().mean()
                bar.set_postfix(
                    loss=float(loss),
                    acc=float(acc)
                )

        # Evaluate at the end of each epoch. No gradients are needed here, so
        # avoid building an autograd graph for the whole validation set.
        model.eval()
        with torch.no_grad():
            y_pred = model(X_val)
        y_real = y_val.cpu().numpy()
        y_try = y_pred.round().cpu().numpy()

        scores = {
            "acc": metrics.accuracy_score(y_real, y_try),
            "f1": metrics.f1_score(y_real, y_try),
            "precision": metrics.precision_score(y_real, y_try),
            "recall": metrics.recall_score(y_real, y_try),
        }

        snapshot = None  # weights of this epoch, copied at most once
        for name in metric_names:
            if scores[name] > best_rows[name][0]:
                if snapshot is None:
                    snapshot = copy.deepcopy(model.state_dict())
                best_rows[name] = [scores[name]] + [
                    scores[other] for other in metric_names if other != name
                ]
                best_weights[name] = snapshot

    return (
        [best_rows[name] for name in metric_names],
        best_weights["acc"],
        best_weights["f1"],
        best_weights["precision"],
        best_weights["recall"],
    )

In [12]:
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier)
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


In [13]:
def train_Grid_CV_RandomForestClassifier(params, X, y, n_splits=10):
    """Grid-search ``RandomForestClassifier`` with stratified K-fold cross-validation.

    Every combination of ``params.n_estimators`` x ``params.criterion`` x
    ``params.max_depth`` is scored by the mean accuracy, F1, precision and
    recall over the folds.

    Args:
        params: Object exposing the iterables ``n_estimators``, ``criterion``
            and ``max_depth``.
        X: Feature table indexed positionally with ``.iloc``.
        y: Target column indexed positionally with ``.iloc``.
        n_splits: Number of stratified folds (default 10).

    Returns:
        Dict with keys ``"best_acc"``, ``"best_f1"``, ``"best_precision"`` and
        ``"best_recall"``; each value is ``(mean_score, classifier)`` for the
        combination that maximised that metric (``(-inf, None)`` if the grid is
        empty). The classifier is the instance as left after the last fold,
        i.e. it is fitted on the final fold's training split only.
    """
    best_scores = {"acc": -np.inf, "f1": -np.inf, "precision": -np.inf, "recall": -np.inf}
    best_models = dict.fromkeys(best_scores)

    for n_estimators in params.n_estimators:
        for criterion in params.criterion:
            for max_depth in params.max_depth:
                clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
                # Re-seed the global NumPy RNG so every combination is evaluated
                # under the same random conditions.
                np.random.seed(0)
                cv = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

                totals = {name: 0 for name in best_scores}
                for train, test in cv.split(X, y):
                    clf.fit(X.iloc[train], y.iloc[train])
                    y_pred = clf.predict(X.iloc[test])
                    y_test = y.iloc[test]
                    totals["acc"] += metrics.accuracy_score(y_test, y_pred)
                    totals["f1"] += metrics.f1_score(y_test, y_pred)
                    totals["precision"] += metrics.precision_score(y_test, y_pred)
                    totals["recall"] += metrics.recall_score(y_test, y_pred)

                for name, total in totals.items():
                    mean_score = total / n_splits
                    if mean_score > best_scores[name]:
                        best_scores[name] = mean_score
                        best_models[name] = clf
                print(f"Completed for {n_estimators, criterion, max_depth}")

    # Each key carries the score of its own metric (precision and recall used
    # to be swapped here).
    return {"best_acc": (best_scores["acc"], best_models["acc"]),
            "best_f1": (best_scores["f1"], best_models["f1"]),
            "best_precision": (best_scores["precision"], best_models["precision"]),
            "best_recall": (best_scores["recall"], best_models["recall"])}

def train_Grid_CV_XGBClassifier(params, X, y, n_splits=10):
    """Grid-search ``XGBClassifier`` with stratified K-fold cross-validation.

    Every combination of ``params.n_estimators`` x ``params.learning_rate`` x
    ``params.max_depth`` is scored by the mean accuracy, F1, precision and
    recall over the folds.

    Args:
        params: Object exposing the iterables ``n_estimators``,
            ``learning_rate`` and ``max_depth``.
        X: Feature table indexed positionally with ``.iloc``.
        y: Target column indexed positionally with ``.iloc``.
        n_splits: Number of stratified folds (default 10).

    Returns:
        Dict with keys ``"best_acc"``, ``"best_f1"``, ``"best_precision"`` and
        ``"best_recall"``; each value is ``(mean_score, classifier)`` for the
        combination that maximised that metric (``(-inf, None)`` if the grid is
        empty). The classifier is the instance as left after the last fold,
        i.e. it is fitted on the final fold's training split only.
    """
    best_scores = {"acc": -np.inf, "f1": -np.inf, "precision": -np.inf, "recall": -np.inf}
    best_models = dict.fromkeys(best_scores)

    for n_estimators in params.n_estimators:
        for learning_rate in params.learning_rate:
            for max_depth in params.max_depth:
                clf = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
                # Re-seed the global NumPy RNG so every combination is evaluated
                # under the same random conditions.
                np.random.seed(0)
                cv = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

                totals = {name: 0 for name in best_scores}
                for train, test in cv.split(X, y):
                    clf.fit(X.iloc[train], y.iloc[train])
                    y_pred = clf.predict(X.iloc[test])
                    y_test = y.iloc[test]
                    totals["acc"] += metrics.accuracy_score(y_test, y_pred)
                    totals["f1"] += metrics.f1_score(y_test, y_pred)
                    totals["precision"] += metrics.precision_score(y_test, y_pred)
                    totals["recall"] += metrics.recall_score(y_test, y_pred)

                for name, total in totals.items():
                    mean_score = total / n_splits
                    if mean_score > best_scores[name]:
                        best_scores[name] = mean_score
                        best_models[name] = clf
                print(f"Completed for {n_estimators, learning_rate, max_depth}")

    # Each key carries the score of its own metric (precision and recall used
    # to be swapped here).
    return {"best_acc": (best_scores["acc"], best_models["acc"]),
            "best_f1": (best_scores["f1"], best_models["f1"]),
            "best_precision": (best_scores["precision"], best_models["precision"]),
            "best_recall": (best_scores["recall"], best_models["recall"])}

def train_Grid_CV_CatBoostClassifier(params, X, y, n_splits=10):
    """Grid-search ``CatBoostClassifier`` with stratified K-fold cross-validation.

    Every combination of ``params.iterations`` x ``params.learning_rate`` x
    ``params.depth`` is scored by the mean accuracy, F1, precision and recall
    over the folds. CatBoost's own training log is silenced (``verbose=False``).

    Args:
        params: Object exposing the iterables ``iterations``, ``learning_rate``
            and ``depth``.
        X: Feature table indexed positionally with ``.iloc``.
        y: Target column indexed positionally with ``.iloc``.
        n_splits: Number of stratified folds (default 10).

    Returns:
        Dict with keys ``"best_acc"``, ``"best_f1"``, ``"best_precision"`` and
        ``"best_recall"``; each value is ``(mean_score, classifier)`` for the
        combination that maximised that metric (``(-inf, None)`` if the grid is
        empty). The classifier is the instance as left after the last fold,
        i.e. it is fitted on the final fold's training split only.
    """
    best_scores = {"acc": -np.inf, "f1": -np.inf, "precision": -np.inf, "recall": -np.inf}
    best_models = dict.fromkeys(best_scores)

    for iterations in params.iterations:
        for learning_rate in params.learning_rate:
            for depth in params.depth:
                clf = CatBoostClassifier(iterations=iterations, learning_rate=learning_rate, depth=depth, verbose=False)
                # Re-seed the global NumPy RNG so every combination is evaluated
                # under the same random conditions.
                np.random.seed(0)
                cv = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

                totals = {name: 0 for name in best_scores}
                for train, test in cv.split(X, y):
                    clf.fit(X.iloc[train], y.iloc[train])
                    y_pred = clf.predict(X.iloc[test])
                    y_test = y.iloc[test]
                    totals["acc"] += metrics.accuracy_score(y_test, y_pred)
                    totals["f1"] += metrics.f1_score(y_test, y_pred)
                    totals["precision"] += metrics.precision_score(y_test, y_pred)
                    totals["recall"] += metrics.recall_score(y_test, y_pred)

                for name, total in totals.items():
                    mean_score = total / n_splits
                    if mean_score > best_scores[name]:
                        best_scores[name] = mean_score
                        best_models[name] = clf
                print(f"Completed for {iterations, learning_rate, depth}")

    # Each key carries the score of its own metric (precision and recall used
    # to be swapped here).
    return {"best_acc": (best_scores["acc"], best_models["acc"]),
            "best_f1": (best_scores["f1"], best_models["f1"]),
            "best_precision": (best_scores["precision"], best_models["precision"]),
            "best_recall": (best_scores["recall"], best_models["recall"])}

def train_Grid_CV_AdaBoostClassifier(params, X, y, n_splits=10):
    """Grid-search ``AdaBoostClassifier`` with stratified K-fold cross-validation.

    Every combination of ``params.n_estimators`` x ``params.learning_rate`` is
    scored by the mean accuracy, F1, precision and recall over the folds.

    Args:
        params: Object exposing the iterables ``n_estimators`` and
            ``learning_rate``.
        X: Feature table indexed positionally with ``.iloc``.
        y: Target column indexed positionally with ``.iloc``.
        n_splits: Number of stratified folds (default 10).

    Returns:
        Dict with keys ``"best_acc"``, ``"best_f1"``, ``"best_precision"`` and
        ``"best_recall"``; each value is ``(mean_score, classifier)`` for the
        combination that maximised that metric (``(-inf, None)`` if the grid is
        empty). The classifier is the instance as left after the last fold,
        i.e. it is fitted on the final fold's training split only.
    """
    best_scores = {"acc": -np.inf, "f1": -np.inf, "precision": -np.inf, "recall": -np.inf}
    best_models = dict.fromkeys(best_scores)

    for n_estimators in params.n_estimators:
        for learning_rate in params.learning_rate:
            clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
            # Re-seed the global NumPy RNG so every combination is evaluated
            # under the same random conditions.
            np.random.seed(0)
            cv = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

            totals = {name: 0 for name in best_scores}
            for train, test in cv.split(X, y):
                clf.fit(X.iloc[train], y.iloc[train])
                y_pred = clf.predict(X.iloc[test])
                y_test = y.iloc[test]
                totals["acc"] += metrics.accuracy_score(y_test, y_pred)
                totals["f1"] += metrics.f1_score(y_test, y_pred)
                totals["precision"] += metrics.precision_score(y_test, y_pred)
                totals["recall"] += metrics.recall_score(y_test, y_pred)

            for name, total in totals.items():
                mean_score = total / n_splits
                if mean_score > best_scores[name]:
                    best_scores[name] = mean_score
                    best_models[name] = clf
            print(f"Completed for {n_estimators, learning_rate}")

    # Each key carries the score of its own metric (precision and recall used
    # to be swapped here).
    return {"best_acc": (best_scores["acc"], best_models["acc"]),
            "best_f1": (best_scores["f1"], best_models["f1"]),
            "best_precision": (best_scores["precision"], best_models["precision"]),
            "best_recall": (best_scores["recall"], best_models["recall"])}

In [14]:
import collections

In [15]:
import os
import sys
import dill
import pickle

def save_object(file_path, obj):
    """Serialize ``obj`` to ``file_path`` with dill, creating parent directories.

    Args:
        file_path: Destination file; an existing file is overwritten.
        obj: Any object dill can serialize.
    """
    dir_path = os.path.dirname(file_path)

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    with open(file_path, "wb") as file_obj:
        dill.dump(obj, file_obj)
    

    
def load_object(file_path):
    """Load and return an object previously written by ``save_object``.

    Uses dill, the same serializer ``save_object`` writes with, so objects that
    only dill can serialize round-trip correctly. Plain pickle files are
    readable as well.

    Args:
        file_path: Path of the serialized file.

    Returns:
        The deserialized object.
    """
    # NOTE(security): unpickling can execute arbitrary code -- only load files
    # that come from a trusted source.
    with open(file_path, "rb") as file_obj:
        return dill.load(file_obj)

In [16]:
class ModelTrainer:
    """Trains every candidate model family and persists the overall winners.

    Five families are searched in a fixed order (neural network, random forest,
    XGBoost, CatBoost, AdaBoost). The model with the highest cross-validated
    accuracy and the one with the highest cross-validated F1 score are written
    to ``config.save_path_best_acc`` and ``config.save_path_best_f1``.
    """

    # Number of stratified folds used when cross-validating the neural network.
    _NN_FOLDS = 10

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        """Run all searches on ``config.data_path`` and save the two best models.

        The CSV must contain a ``diabetes`` column, which is used as the target;
        every other column is a feature. A neural-network winner is stored as
        ``model.pth`` (a state dict), any other winner as ``model.pkl``.
        """
        cfg = self.config
        df = pd.read_csv(cfg.data_path)

        X, y = df.drop(columns=['diabetes']), df['diabetes']

        X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)
        y_tensor = torch.tensor(y.to_numpy(), dtype=torch.float32).reshape(-1, 1)

        # (start message, end message, search to run). The order matters: on a
        # tie the earlier family keeps the title because comparisons are strict.
        searches = (
            ("Training Neural Network", "Finshed Training Neural Network",
             lambda: self._train_nn(cfg.neural_network_params, X_tensor, y_tensor)),
            ("Training Random Forest", "Finished training Random Forest",
             lambda: train_Grid_CV_RandomForestClassifier(cfg.random_forest_params, X, y)),
            ("Training XGBClassifier", "Finished training XGBClassifier",
             lambda: train_Grid_CV_XGBClassifier(cfg.xg_boost_params, X, y)),
            ("Training CatBoostClassifier", "Finished training CatBoostClassifier",
             lambda: train_Grid_CV_CatBoostClassifier(cfg.cat_boost_params, X, y)),
            ("Training AdaBoostClassifier", "Finished training AdaBoostClassifier",
             lambda: train_Grid_CV_AdaBoostClassifier(cfg.ada_boost_params, X, y)),
        )

        best_acc, best_acc_model = -np.inf, None
        best_f1, best_f1_model = -np.inf, None

        for start_message, done_message, run_search in searches:
            print(start_message)
            results = run_search()
            acc_score, acc_model = results["best_acc"]
            f1_score, f1_model = results["best_f1"]
            if acc_score > best_acc:
                best_acc, best_acc_model = acc_score, acc_model
            if f1_score > best_f1:
                best_f1, best_f1_model = f1_score, f1_model
            print(done_message)

        # The F1 artifact must hold the F1 winner (it used to receive the
        # accuracy winner), and both locations come from the configuration
        # rather than from hard-coded paths.
        self._save_model(best_acc_model, cfg.save_path_best_acc)
        self._save_model(best_f1_model, cfg.save_path_best_f1)

    @staticmethod
    def _save_model(model, directory):
        """Write ``model`` into ``directory`` as ``model.pth`` or ``model.pkl``.

        A state dict (``OrderedDict``) is saved with ``torch.save``; anything
        else goes through ``save_object``.
        """
        if isinstance(model, collections.OrderedDict):
            torch.save(model, os.path.join(directory, "model.pth"))
        else:
            save_object(file_path=os.path.join(directory, "model.pkl"), obj=model)

    @staticmethod
    def _train_nn(params, X_tensor, y_tensor):
        """Grid-search the ``Deep`` network with stratified 10-fold cross-validation.

        Args:
            params: Object exposing the iterables ``epochs``, ``lrs`` and
                ``batch_sizes``.
            X_tensor: Feature tensor, one row per sample.
            y_tensor: Target tensor, one row per sample.

        Returns:
            Dict with keys ``"best_acc"``, ``"best_f1"``, ``"best_precision"``
            and ``"best_recall"``; each value is ``(mean_score, state_dict)``
            for the grid point with the highest fold-averaged score. The state
            dict is the one produced by the last fold of that grid point.
        """
        # Silence library warnings for the whole search (this is a global,
        # process-wide setting).
        warnings.filterwarnings('ignore')

        n_splits = ModelTrainer._NN_FOLDS
        best_weights_for_acc, best_weights_for_f1, best_weights_for_precision, best_weights_for_recall = None, None, None, None
        best_avg_acc, best_avg_f1, best_avg_precision, best_avg_recall = - np.inf, - np.inf, - np.inf, - np.inf
        device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

        for epochs in params.epochs:
            for lr in params.lrs:
                for batch_size in params.batch_sizes:
                    print(epochs, lr, batch_size)
                    # Seed the global NumPy RNG: the splitter has no
                    # random_state of its own, so this makes the folds
                    # identical for every grid point.
                    np.random.seed(0)
                    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True)
                    sum_results = np.zeros((4, 4))
                    for train, test in kfold.split(X_tensor, y_tensor):
                        # Same initial weights for every fold and grid point.
                        torch.manual_seed(0)
                        model = Deep().apply(weights_init)
                        results, weights_for_acc, weights_for_f1, weights_for_precision, weights_for_recall = model_train(
                            device, model,
                            X_tensor[train], y_tensor[train],
                            X_tensor[test], y_tensor[test],
                            epochs, lr, batch_size)
                        sum_results = np.add(sum_results, np.array(results))
                    # Row i, column 0 holds the tracked metric of row i
                    # (acc, f1, precision, recall), averaged over the folds.
                    mean_results = sum_results / n_splits
                    if mean_results[0][0] > best_avg_acc:
                        best_avg_acc = mean_results[0][0]
                        best_weights_for_acc = weights_for_acc
                    if mean_results[1][0] > best_avg_f1:
                        best_avg_f1 = mean_results[1][0]
                        best_weights_for_f1 = weights_for_f1
                    if mean_results[2][0] > best_avg_precision:
                        best_avg_precision = mean_results[2][0]
                        best_weights_for_precision = weights_for_precision
                    if mean_results[3][0] > best_avg_recall:
                        best_avg_recall = mean_results[3][0]
                        best_weights_for_recall = weights_for_recall

        # Each key carries the score of its own metric (precision and recall
        # used to be swapped here).
        return {"best_acc": (best_avg_acc, best_weights_for_acc),
                "best_f1": (best_avg_f1, best_weights_for_f1),
                "best_precision": (best_avg_precision, best_weights_for_precision),
                "best_recall": (best_avg_recall, best_weights_for_recall)}
    



In [17]:
# Run the model-training stage end to end. Exceptions propagate unchanged, so
# no try/except wrapper that merely re-raises is needed around these calls.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(model_trainer_config)
model_trainer.train()

[2023-07-22 10:34:32,870: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-07-22 10:34:32,872: INFO: common: yaml file: params.yaml loaded successfully]
[2023-07-22 10:34:32,873: INFO: common: created directory at: artifacts]
[2023-07-22 10:34:32,873: INFO: common: created directory at: artifacts/model_trainer]
[2023-07-22 10:34:32,874: INFO: common: created directory at: artifacts/model_trainer/best_acc_model]
[2023-07-22 10:34:32,874: INFO: common: created directory at: artifacts/model_trainer/best_f1_model]
Training Neural Network
50 1e-05 2500
Finshed Training Neural Network
Training Random Forest
Completed for (50, 'gini', 10)
Finished training Random Forest
Training XGBClassifier
Completed for (8, 0.1, 3)
Finished training XGBClassifier
Training CatBoostClassifier
Completed for (30, 0.01, 6)
Finished training CatBoostClassifier
Training AdaBoostClassifier
Completed for (8, 0.1)
Finished training AdaBoostClassifier
