In [1]:
import os

In [2]:
%pwd

'/Users/divyanshu9871gmail.com/Desktop/ml-ops-holiday-package-prediction/notebooks'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/Users/divyanshu9871gmail.com/Desktop/ml-ops-holiday-package-prediction'

In [5]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

In [6]:
train = pd.read_csv("./artifacts/data_transformation/train.csv")
test = pd.read_csv("./artifacts/data_transformation/test.csv")
train.shape, test.shape

((3421, 18), (1467, 18))

In [7]:
X_train = train.drop("ProdTaken", axis=1)
y_train = train["ProdTaken"]
X_test = test.drop("ProdTaken", axis=1)
y_test = test["ProdTaken"]

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop="first")

preprocessor = joblib.load("./artifacts/data_transformation/preprocessor.joblib")

In [9]:
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
X_train.shape, X_test.shape

((3421, 26), (1467, 26))

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

In [12]:
def compute_metrics(y_train, y_train_pred, y_test, y_test_pred):
    """
    Compute performance metrics for training and test predictions.

    Parameters:
    - y_train (array-like): True labels for training data.
    - y_train_pred (array-like): Predicted labels for training data.
    - y_test (array-like): True labels for test data.
    - y_test_pred (array-like): Predicted labels for test data.

    Returns:
    - metrics (dict): Dictionary containing metrics for both training and test data.
    """
    metrics = {
        "Training Set": {
            "Accuracy": accuracy_score(y_train, y_train_pred),
            "F1 Score": f1_score(y_train, y_train_pred, average="weighted"),
            "Precision": precision_score(y_train, y_train_pred, average="weighted"),
            "Recall": recall_score(y_train, y_train_pred, average="weighted"),
            "Roc Auc Score": roc_auc_score(y_train, y_train_pred),
        },
        "Test Set": {
            "Accuracy": accuracy_score(y_test, y_test_pred),
            "F1 Score": f1_score(y_test, y_test_pred, average="weighted"),
            "Precision": precision_score(y_test, y_test_pred, average="weighted"),
            "Recall": recall_score(y_test, y_test_pred, average="weighted"),
            "Roc Auc Score": roc_auc_score(y_test, y_test_pred),
        },
    }

    return metrics

In [13]:
# Define your models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boost": GradientBoostingClassifier(),
    "Adaboost": AdaBoostClassifier(),
    "Xgboost": XGBClassifier(),
    "SVC": SVC(),
    "Gaussian Naive Bayes": GaussianNB(),
}

# Fit and evaluate each model
for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")

    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute metrics
    metrics = compute_metrics(y_train, y_train_pred, y_test, y_test_pred)

    # Print metrics
    print("Model performance for Training set")
    for metric_name, value in metrics["Training Set"].items():
        print(f"- {metric_name}: {value:.4f}")

    print("----------------------------------")

    print("Model performance for Test set")
    for metric_name, value in metrics["Test Set"].items():
        print(f"- {metric_name}: {value:.4f}")

    print("=" * 35)
    print("\n")

Evaluating model: Logistic Regression
Model performance for Training set
- Accuracy: 0.8451
- F1 Score: 0.8188
- Precision: 0.8292
- Recall: 0.8451
- Roc Auc Score: 0.6373
----------------------------------
Model performance for Test set
- Accuracy: 0.8419
- F1 Score: 0.8177
- Precision: 0.8225
- Recall: 0.8419
- Roc Auc Score: 0.6371


Evaluating model: Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 Score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9018
- F1 Score: 0.9022
- Precision: 0.9027
- Recall: 0.9018
- Roc Auc Score: 0.8427


Evaluating model: Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 Score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9202
- F1 Score: 0.9132
- Precision: 0.9220
- Recall: 0.9202
- Roc Auc Score: 0.80

In [14]:
random_forest_params = {
    "max_depth": [5, 8, 15, None, 10],
    "max_features": [5, 7, "auto", 8],
    "min_samples_split": [2, 8, 15, 20],
    "n_estimators": [100, 200, 500, 1000],
}
xgboost_params = {
    "learning_rate": [0.1, 0.01],
    "max_depth": [5, 8, 12, 20, 30],
    "n_estimators": [100, 200, 300],
    "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4],
}
decision_tree_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best", "random"],
    "max_depth": [1, 2, 3, 4, 5],
    "max_features": ["auto", "sqrt", "log2"],
}

In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
randomcv_models = [
    ("Random Forest", RandomForestClassifier(), random_forest_params),
    ("Xgboost", XGBClassifier(), xgboost_params),
    ("Decision Tree", DecisionTreeClassifier(), decision_tree_params),
]

In [17]:
model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
    )
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=15, max_features=auto, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END max_depth=15, max_features=auto, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END max_depth=15, max_features=auto, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=8, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=None, max_features=8, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=None, max_features=auto, min_samples_split=15, n_estimators=1000; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_split=15, n_estimators=1000; total time=   0.0s
[CV] END max_depth=None, max_features=8, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=None, max_features=auto, min_samples_split=15, n_estimators=1000; total time=   0.0s
[CV] END max_depth=10, max_f

In [18]:
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=1000, min_samples_split=2, max_features=8, max_depth=15
    ),
    "Xgboost": XGBClassifier(
        n_estimators=300, max_depth=8, learning_rate=0.1, colsample_bytree=0.5
    ),
    "Decision Tree": DecisionTreeClassifier(
        splitter="random", max_features="sqrt", max_depth=4, criterion="log_loss"
    ),
}

for model_name, model in models.items():
    print(f"Evaluating model: {model_name}")

    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute metrics
    metrics = compute_metrics(y_train, y_train_pred, y_test, y_test_pred)

    # Print metrics
    print("Model performance for Training set")
    for metric_name, value in metrics["Training Set"].items():
        print(f"- {metric_name}: {value:.4f}")

    print("----------------------------------")

    print("Model performance for Test set")
    for metric_name, value in metrics["Test Set"].items():
        print(f"- {metric_name}: {value:.4f}")

    print("=" * 35)
    print("\n")

Evaluating model: Random Forest
Model performance for Training set
- Accuracy: 0.9994
- F1 Score: 0.9994
- Precision: 0.9994
- Recall: 0.9994
- Roc Auc Score: 0.9985
----------------------------------
Model performance for Test set
- Accuracy: 0.9250
- F1 Score: 0.9195
- Precision: 0.9254
- Recall: 0.9250
- Roc Auc Score: 0.8175


Evaluating model: Xgboost
Model performance for Training set
- Accuracy: 1.0000
- F1 Score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9421
- F1 Score: 0.9390
- Precision: 0.9423
- Recall: 0.9421
- Roc Auc Score: 0.8604


Evaluating model: Decision Tree
Model performance for Training set
- Accuracy: 0.8120
- F1 Score: 0.7287
- Precision: 0.8474
- Recall: 0.8120
- Roc Auc Score: 0.5023
----------------------------------
Model performance for Test set
- Accuracy: 0.8125
- F1 Score: 0.7304
- Precision: 0.7238
- Recall: 0.8125
- Roc Auc Score: 0.5010




In [19]:
xgboost_classfier = XGBClassifier(
    n_estimators=300, max_depth=8, learning_rate=0.1, colsample_bytree=0.5
)
xgboost_classfier.fit(X_train, y_train)

In [20]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    trained_model_name: str
    preprocessor_model_path: Path
    model_name: str
    model_params: dict
    target_column: str

In [21]:
from src.constants import *
from src.constants import MODEL_CONFIG_FILE_PATH
from src.utils.common import read_yaml, create_directories


In [22]:
yaml_path = Path("./model_config.yaml")
yaml_data = read_yaml(yaml_path)

print(yaml_data)

[2024-09-09 01:15:18,708: INFO: common: yaml file: model_config.yaml loaded successfully]
{'model': 'Xgboost', 'params': {'n_estimators': 300, 'max_depth': 12, 'learning_rate': 0.1, 'colsample_bytree': 0.5}, 'supported_models': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Gradient Boost', 'Adaboost', 'Xgboost', 'SVC', 'Gaussian Naive Bayes']}


In [23]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        model_config_filepath=MODEL_CONFIG_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):

        self.config = read_yaml(config_filepath)
        self.model_config = read_yaml(model_config_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        model_name = self.model_config.model
        params = self.model_config.params
        schema = self.schema.TARGET_COLUMN
        
        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            trained_model_name=config.trained_model_name,
            preprocessor_model_path = config.preprocessor_model_path,
            model_name=model_name,
            model_params=params,
            target_column=schema.name,
        )

        return model_trainer_config

In [24]:
from src.logger import logger
import joblib

In [25]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        models = {
            "Logistic Regression": LogisticRegression,
            "Decision Tree": DecisionTreeClassifier,
            "Random Forest": RandomForestClassifier,
            "Gradient Boost": GradientBoostingClassifier,
            "Adaboost": AdaBoostClassifier,
            "Xgboost": XGBClassifier,
            "SVC": SVC,
            "Gaussian Naive Bayes": GaussianNB,
        }

        # Load training data
        train_data = pd.read_csv(self.config.train_data_path)
        X_train = train_data.drop([self.config.target_column], axis=1)
        y_train = train_data[[self.config.target_column]]

        # Load preprocessor and transform data
        preprocessor = joblib.load(self.config.preprocessor_model_path)
        X_train = preprocessor.transform(X_train)

        # Load model parameters and initialize model
        model_name = self.config.model_name
        if model_name not in models:
            raise ValueError(f"Model '{model_name}' is not supported.")

        model_class = models[model_name]
        model_params = self.config.model_params

        # Instantiate the model with parameters
        prod_model = model_class(**model_params)

        # Fit the model
        prod_model.fit(X_train, y_train)

        # Save the trained model
        joblib.dump(
            prod_model,
            os.path.join(self.config.root_dir, self.config.trained_model_name),
        )

In [26]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    model_trainer_config.train()
except Exception as e:
    raise e

[2024-09-09 01:15:18,815: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-09 01:15:18,818: INFO: common: yaml file: model_config.yaml loaded successfully]
[2024-09-09 01:15:18,822: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-09 01:15:18,823: INFO: common: created directory at: artifacts]
[2024-09-09 01:15:18,824: INFO: common: created directory at: artifacts/model_trainer]
