In [1]:
import os
import pandas as pd
import joblib
from pathlib import Path
from dataclasses import dataclass
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlProject import logger
from mlProject.utils.common import read_yaml, create_directories
from mlProject.constants import *

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle handed to ``ModelTrainer``.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the YAML-backed config, params and schema objects. ``frozen=True``
    means fields cannot be reassigned after construction.

    NOTE(review): dataclass annotations are not enforced at runtime and the
    values are passed through from the loaded YAML unchanged, so the path
    fields may actually hold plain strings rather than ``Path`` objects --
    confirm before relying on ``Path`` methods.
    """
    # Directory the trained model is written into (joined with ``model_name``).
    root_dir: Path
    # CSV file read with ``pd.read_csv`` to obtain the training rows.
    train_data_path: Path
    # CSV file read with ``pd.read_csv`` to obtain the evaluation rows.
    test_data_path: Path
    # File name of the persisted model inside ``root_dir``.
    model_name: str
    # Column dropped from the features and used as the regression target.
    target_column: str
    # The remaining fields are forwarded verbatim to ``RandomForestRegressor``.
    n_estimators: int
    # NOTE(review): scikit-learn also accepts ``None`` here -- confirm params.yaml never uses it.
    max_depth: int
    min_samples_split: int
    min_samples_leaf: int
    # NOTE(review): scikit-learn also accepts numeric values here -- confirm against params.yaml.
    max_features: str
    bootstrap: bool
    random_state: int

class ConfigurationManager:
    """Loads the project's YAML files and turns them into typed config objects.

    The three files (config, params, schema) are read once in the constructor
    and kept on the instance; the artifacts root directory named in the config
    file is created at the same time.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        # Read in config -> params -> schema order so the log sequence is stable.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )
        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Returns:
            ModelTrainerConfig populated from the ``model_trainer`` section of
            the config file, the random-forest section of the params file and
            ``TARGET_COLUMN`` from the schema file.

        Raises:
            ValueError: no random-forest section exists in the params file.
            FileNotFoundError: the train or test data path does not exist.

        Any exception raised while assembling is logged and then re-raised.
        The trainer's ``root_dir`` is created as a side effect.
        """
        config = self.config.model_trainer
        try:
            # Accept any of the supported spellings of the params section name.
            for section_name in ('RandomForest', 'random_forest', 'randomforest'):
                if hasattr(self.params, section_name):
                    params = getattr(self.params, section_name)
                    break
            else:
                raise ValueError("RandomForest parameters not found in params.yaml")

            target = self.schema.TARGET_COLUMN

            # Fail early if either dataset is missing.
            train_found = os.path.exists(config.train_data_path)
            if not train_found:
                raise FileNotFoundError(f"Train data path not found: {config.train_data_path}")
            test_found = os.path.exists(config.test_data_path)
            if not test_found:
                raise FileNotFoundError(f"Test data path not found: {config.test_data_path}")

            create_directories([config.root_dir])

            settings = dict(
                root_dir=config.root_dir,
                train_data_path=config.train_data_path,
                test_data_path=config.test_data_path,
                model_name=config.model_name,
                target_column=target,
                n_estimators=params.n_estimators,
                max_depth=params.max_depth,
                min_samples_split=params.min_samples_split,
                min_samples_leaf=params.min_samples_leaf,
                max_features=params.max_features,
                bootstrap=params.bootstrap,
                random_state=params.random_state,
            )
            return ModelTrainerConfig(**settings)
        except Exception as err:
            logger.error(f"Error in getting model trainer config: {err}")
            raise

def encode_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with every object-dtype column encoded as numbers.

    The input frame is left untouched, so the function can be re-run on the
    same raw frame and always produces the same result.

    Encoding rules, applied per object-dtype column to its non-null values:
      * only 'M'/'F'    -> M=1, F=0
      * only 'Y'/'N'    -> Y=1, N=0
      * only 'Yes'/'No' -> Yes=1, No=0
      * anything else   -> ``pd.factorize`` codes (numbered in order of first
        appearance in this frame)

    Missing values stay NaN in the mapped columns, whereas ``pd.factorize``
    encodes them as -1.

    Args:
        df: Frame whose object-dtype columns should be encoded.

    Returns:
        A new DataFrame with the same columns; non-object columns are copied
        through unchanged.
    """
    # Known two-valued vocabularies and their 1/0 codes, tried in this order.
    binary_maps = (
        {'M': 1, 'F': 0},
        {'Y': 1, 'N': 0},
        {'Yes': 1, 'No': 0},
    )

    # Work on a copy so the caller's frame is never mutated.
    encoded = df.copy()
    for col in encoded.select_dtypes(include=['object']).columns:
        unique_vals = set(encoded[col].dropna().unique())
        mapping = next(
            (candidate for candidate in binary_maps if unique_vals.issubset(candidate)),
            None,
        )
        if mapping is not None:
            encoded[col] = encoded[col].map(mapping)
        else:
            encoded[col] = pd.factorize(encoded[col])[0]
    return encoded

class ModelTrainer:
    """Fits, evaluates and saves a ``RandomForestRegressor`` from a ``ModelTrainerConfig``."""

    def __init__(self, config: ModelTrainerConfig):
        """Store the configuration; no I/O happens until ``train`` is called."""
        self.config = config

    def evaluate_model(self, y_true, y_pred):
        """Compute and log regression metrics for one set of predictions.

        Args:
            y_true: Observed target values.
            y_pred: Predicted target values, aligned with ``y_true``.

        Returns:
            dict with the keys "Mean Squared Error", "Mean Absolute Error"
            and "R2 Score".
        """
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

        metrics = {
            "Mean Squared Error": mse,
            "Mean Absolute Error": mae,
            "R2 Score": r2
        }

        logger.info(f"Model Evaluation Metrics: {metrics}")
        return metrics

    @staticmethod
    def _encode_train_test_together(train_data: pd.DataFrame, test_data: pd.DataFrame):
        """Encode both frames with one shared category-to-code assignment.

        ``encode_categorical`` derives its codes from the values present in
        the frame it is given, so calling it separately on train and test can
        assign different numbers to the same category. Encoding the stacked
        frame once and splitting it back guarantees identical codes.

        Raises:
            ValueError: the two frames do not have the same set of columns
                (stacking them would otherwise silently introduce NaN columns).
        """
        if set(train_data.columns) != set(test_data.columns):
            raise ValueError(
                "Train and test data must have the same columns; "
                f"difference: {sorted(set(train_data.columns) ^ set(test_data.columns), key=str)}"
            )

        n_train = len(train_data)
        combined = pd.concat([train_data, test_data], ignore_index=True, sort=False)
        combined = encode_categorical(combined)

        # Rows keep their order, so the first n_train rows are the training set.
        train_encoded = combined.iloc[:n_train].reset_index(drop=True)
        test_encoded = combined.iloc[n_train:].reset_index(drop=True)
        return train_encoded, test_encoded

    def train(self):
        """Run the full training stage and persist the fitted model.

        Steps: read the train/test CSVs, encode categorical columns with a
        shared encoding, split off the target column, fit the forest with the
        configured hyperparameters, log metrics for both sets and dump the
        model with joblib to ``root_dir/model_name``.

        Returns:
            dict of metrics computed on the test set (see ``evaluate_model``).

        Raises:
            Any exception from loading, fitting or saving is logged and
            re-raised unchanged.
        """
        try:
            logger.info("Loading training and test data")
            train_data = pd.read_csv(self.config.train_data_path)
            test_data = pd.read_csv(self.config.test_data_path)

            logger.info("Encoding categorical features")
            train_data, test_data = self._encode_train_test_together(train_data, test_data)

            logger.info("Splitting data into features and target")
            X_train = train_data.drop(self.config.target_column, axis=1)
            y_train = train_data[self.config.target_column]
            X_test = test_data.drop(self.config.target_column, axis=1)
            y_test = test_data[self.config.target_column]

            logger.info("Initializing Random Forest model")
            model = RandomForestRegressor(
                n_estimators=self.config.n_estimators,
                max_depth=self.config.max_depth,
                min_samples_split=self.config.min_samples_split,
                min_samples_leaf=self.config.min_samples_leaf,
                max_features=self.config.max_features,
                bootstrap=self.config.bootstrap,
                random_state=self.config.random_state,
                verbose=1
            )

            logger.info("Training model...")
            model.fit(X_train, y_train)
            logger.info("Model training completed")

            logger.info("Evaluating model performance")
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)

            # Training-set metrics are only logged; the test-set metrics are returned.
            logger.info("Training set metrics:")
            self.evaluate_model(y_train, train_pred)

            logger.info("Test set metrics:")
            test_metrics = self.evaluate_model(y_test, test_pred)

            model_path = os.path.join(self.config.root_dir, self.config.model_name)
            joblib.dump(model, model_path)
            logger.info(f"Model saved at: {model_path}")

            return test_metrics

        except Exception as e:
            logger.error(f"Error during model training: {e}")
            raise

# Main execution: build the configuration, train the model and log the test metrics.
# The captured output below shows this guarded block running when the cell is executed.
if __name__ == "__main__":
    try:
        logger.info("Starting model training pipeline")

        # Load the YAML files and derive the trainer settings
        # (this also creates the artifact directories).
        config_manager = ConfigurationManager()
        model_trainer_config = config_manager.get_model_trainer_config()

        # Fit, evaluate and save the model; `metrics` holds the test-set metrics.
        model_trainer = ModelTrainer(config=model_trainer_config)
        metrics = model_trainer.train()

        logger.info(f"Model training completed successfully with metrics: {metrics}")

    except Exception as e:
        # Log at pipeline level, then re-raise so the failure is still visible.
        logger.error(f"Error in model training pipeline: {e}")
        raise


  from pandas.core import (


[2025-05-21 14:45:30,486: INFO: 719446690: Starting model training pipeline]
[2025-05-21 14:45:30,591: INFO: common: yaml file: D:\DataScience\Ml flow project\ml-project-withmlflow\config\config.yaml loaded successfully]
[2025-05-21 14:45:30,598: INFO: common: yaml file: D:\DataScience\Ml flow project\ml-project-withmlflow\params.yaml loaded successfully]
[2025-05-21 14:45:30,603: INFO: common: yaml file: D:\DataScience\Ml flow project\ml-project-withmlflow\schema.yaml loaded successfully]
[2025-05-21 14:45:30,610: INFO: common: created directory at: artifacts]
[2025-05-21 14:45:30,611: INFO: common: created directory at: artifacts/model_trainer]
[2025-05-21 14:45:30,613: INFO: 719446690: Loading training and test data]
[2025-05-21 14:45:30,670: INFO: 719446690: Encoding categorical features]
[2025-05-21 14:45:30,687: INFO: 719446690: Splitting data into features and target]
[2025-05-21 14:45:30,693: INFO: 719446690: Initializing Random Forest model]
[2025-05-21 14:45:30,694: INFO: 719

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [2]:
# Re-run the training stage end to end with a freshly loaded configuration.
# Exceptions propagate unchanged; the classes above already log them before
# re-raising, so no extra try/except wrapper is needed here.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()

# Keep the trainer under its own name so `model_trainer_config` stays a config object.
model_trainer = ModelTrainer(config=model_trainer_config)

# Assign the result so the cell does not echo the metrics dict as cell output.
metrics = model_trainer.train()

[2025-05-21 14:45:31,027: INFO: common: yaml file: D:\DataScience\Ml flow project\ml-project-withmlflow\config\config.yaml loaded successfully]
[2025-05-21 14:45:31,036: INFO: common: yaml file: D:\DataScience\Ml flow project\ml-project-withmlflow\params.yaml loaded successfully]
[2025-05-21 14:45:31,054: INFO: common: yaml file: D:\DataScience\Ml flow project\ml-project-withmlflow\schema.yaml loaded successfully]
[2025-05-21 14:45:31,056: INFO: common: created directory at: artifacts]
[2025-05-21 14:45:31,059: INFO: common: created directory at: artifacts/model_trainer]
[2025-05-21 14:45:31,060: INFO: 719446690: Loading training and test data]
[2025-05-21 14:45:31,073: INFO: 719446690: Encoding categorical features]
[2025-05-21 14:45:31,084: INFO: 719446690: Splitting data into features and target]
[2025-05-21 14:45:31,088: INFO: 719446690: Initializing Random Forest model]
[2025-05-21 14:45:31,088: INFO: 719446690: Training model...]
[2025-05-21 14:45:31,272: INFO: 719446690: Model t

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


[2025-05-21 14:45:31,445: INFO: 719446690: Model saved at: artifacts/model_trainer\model.joblib]
