In [1]:
import os
os.chdir("../")

In [2]:
!pip install imbalanced-learn --q


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from src.utils.utils import *
from src.constants import *
from dataclasses import dataclass
from pathlib import Path
from src.logging import logging

In [4]:
read_yaml(PARAMS_FILE_PATH).model_trainer

[2025-08-06 15:03:34,436: INFO: utils: yaml file: params.yaml loaded successfully]


ConfigBox({'n_estimators': 200, 'class_weight': 'balanced', 'random_state': 42})

In [19]:
@dataclass
class ModelTrainerConfig:
    root_dir: Path
    data_file: Path
    train_dir: Path
    test_dir: Path
    model_save_dir: Path

In [20]:
class ConfigurationManager:
    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        self.config_path = read_yaml(config_path)
        self.params_path = read_yaml(config_path)
        
        create_directories([self.config_path.artifacts_root])
    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config_path.model_trainer
        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_file=config.data_file,
            train_dir=config.train_dir,
            test_dir=config.test_dir,
            model_save_dir=config.model_save_dir
        )

        return model_trainer_config


In [21]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig, params_path=PARAMS_FILE_PATH):
        self.config = config
        self.params = read_yaml(params_path).model_trainer
    
    def _resample_data(self, X_train, y_train):
        smote = SMOTE(random_state=self.params["random_state"])
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        return X_train_resampled, y_train_resampled
    
    def _evaluate_model(self, y_true, y_pred):
        try:
            accuracy = accuracy_score(y_true, y_pred)
            f1score = f1_score(y_true, y_pred)
            precision = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            metrics = {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1score": f1score,
            }
            return metrics
        except Exception as e:
            raise e
    
    def train(self, train_df: pd.DataFrame, test_df: pd.DataFrame):
        X_train, y_train = train_df.drop("y", axis=1), train_df["y"]
        X_test, y_test = test_df.drop("y", axis=1), test_df["y"]

        X_train, y_train = self._resample_data(X_train, y_train)
        model = RandomForestClassifier(n_estimators=self.params["n_estimators"],
                                       random_state=self.params["random_state"],
                                       class_weight=self.params["class_weight"])
        logging.info("Model training started")
        model.fit(X_train, y_train)
        logging.info("Model training completed")
        y_pred = model.predict(X_test)
        metrics = self._evaluate_model(y_test, y_pred)
        logging.info("Evaluated model")
        #print(metrics)

        save_path = Path(self.config.model_save_dir)
        os.makedirs(save_path.parent, exist_ok=True)
        save_pickle(model, save_path)
        

In [8]:
df = pd.read_csv("artifacts/data_transformation/data/processed_data.csv")
train_df = pd.read_csv("artifacts/data_transformation/data/train.csv")
test_df = pd.read_csv("artifacts/data_transformation/data/test.csv")

In [9]:
len(df), len(train_df), len(test_df), len(train_df) + len(test_df)

(45211, 36168, 9043, 45211)

In [10]:
X_train, y_train = train_df.drop("y", axis=1), train_df["y"]
X_test, y_test = test_df.drop("y", axis=1), test_df["y"]

In [11]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [12]:
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)

In [14]:
accuracy_score(y_test, y_pred)

0.8957204467543957

In [15]:
f1_score(y_test, y_pred)

0.5858585858585859

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      7985
           1       0.55      0.63      0.59      1058

    accuracy                           0.90      9043
   macro avg       0.75      0.78      0.76      9043
weighted avg       0.90      0.90      0.90      9043



In [17]:
print(confusion_matrix(y_test, y_pred))

[[7433  552]
 [ 391  667]]


In [22]:
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(model_trainer_config)
model_trainer.train(train_df, test_df)

[2025-08-06 15:05:21,157: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2025-08-06 15:05:21,160: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2025-08-06 15:05:21,161: INFO: utils: Created Directory at: artifacts]
[2025-08-06 15:05:21,162: INFO: utils: Created Directory at: artifacts/model_trainer]
[2025-08-06 15:05:21,164: INFO: utils: yaml file: params.yaml loaded successfully]
[2025-08-06 15:05:21,275: INFO: 678424577: Model training started]
[2025-08-06 15:05:36,303: INFO: 678424577: Model training completed]
[2025-08-06 15:05:36,567: INFO: 678424577: Evaluated model]
[2025-08-06 15:05:36,721: INFO: utils: Model saved at file path: artifacts\model_trainer\model\model.pkl]
