In [1]:
import os

In [2]:
# Query the current working directory, then move one level up.
# NOTE(review): chdir("..") is relative to wherever the kernel currently is,
# so re-running this cell climbs one more directory each time — run it once
# per kernel session.
%pwd
os.chdir("..")

In [3]:
# Display the working directory that is in effect after the chdir in the previous cell.
%pwd

'c:\\Users\\user\\Desktop\\End-to-End-ML-project-MLflow'

In [4]:
# NOTE(review): machine-specific absolute path — this cell raises on any other
# machine or checkout location. The %pwd output of the previous cell already
# shows this same directory, so this chdir looks redundant; confirm before
# removing or replacing it with a path derived from the notebook location.
os.chdir(r"C:\Users\user\Desktop\End-to-End-ML-project-MLflow")


In [10]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    """Everything the trainer needs to tune and evaluate one candidate model.

    Attributes:
        root_dir: Directory the trained model and its best-parameter JSON
            are written to.
        trained_data_path: CSV file holding the training split.
        test_data_path: CSV file holding the test split.
        model_name: Key identifying which candidate model to train.
        params: Hyperparameter search space for that model
            (parameter name -> candidate values).
        target_column: Name of the column to predict.
        evaluation_metric: Metric name used for tuning and for the reported
            test score. Defaults to ``"r2"``.
    """

    root_dir: Path
    trained_data_path: Path
    test_data_path: Path
    model_name: str
    params: dict
    target_column: str
    evaluation_metric: str = "r2"


In [11]:
from my_project.constants import *
from my_project.utils.common import read_yaml, create_directories
from pathlib import Path

class ConfigurationManager:
    """Reads the project's YAML files and derives per-model trainer configs."""

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
        schema_filepath: Path = SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the hyperparameter YAML.
            schema_filepath: Path of the data-schema YAML.
        """
        # The parsed YAML objects are kept as-is; they are accessed by
        # attribute everywhere below.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = Path(self.config.artifacts_root)
        create_directories([artifacts_root])

    def get_model_trainer_configs(self) -> dict[str, ModelTrainerConfig]:
        """Build one ``ModelTrainerConfig`` per entry under ``params.models``.

        The trainer's root directory is created as a side effect.

        Returns:
            Mapping of model name -> ``ModelTrainerConfig``. All entries share
            the same paths, target column and evaluation metric; only the
            model name and its hyperparameter grid differ.
        """
        trainer_section = self.config.model_trainer
        target = self.schema.target_column
        metric_name = self.params.model_evaluation.evaluation_metric

        create_directories([Path(trainer_section.root_dir)])

        return {
            name: ModelTrainerConfig(
                root_dir=Path(trainer_section.root_dir),
                trained_data_path=Path(trainer_section.trained_data_path),
                test_data_path=Path(trainer_section.test_data_path),
                model_name=name,
                params=grid,
                target_column=target,
                evaluation_metric=metric_name,
            )
            for name, grid in self.params.models.items()
        }


In [12]:
import pandas as pd
import os
import json
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib
from my_project import logging
from functools import reduce
from operator import mul

class ModelTrainer:
    """Tune, evaluate and persist the single regression model named in a config.

    The config object must expose the attributes read below:
    ``trained_data_path``, ``test_data_path``, ``target_column``, ``params``,
    ``model_name``, ``evaluation_metric`` and ``root_dir``.
    """

    # Short metric names accepted in the config -> scoring strings accepted by
    # scikit-learn's search. scikit-learn exposes error metrics negated so the
    # search can always maximise; the short names "mse"/"mae" are not valid
    # scoring strings on their own.
    _SKLEARN_SCORING = {
        "r2": "r2",
        "mse": "neg_mean_squared_error",
        "mae": "neg_mean_absolute_error",
    }

    def __init__(self, config):
        """Store the config and register the candidate models and metrics.

        Args:
            config: Trainer configuration (see the class docstring for the
                attributes that are read).
        """
        self.config = config

        # Candidate models, keyed by the names expected in config.model_name.
        self.models = {
            "elasticnet": ElasticNet(),
            "randomforest": RandomForestRegressor(),
            "xgbregressor": XGBRegressor(objective="reg:squarederror"),
        }

        # Metric functions used to score predictions on the test split.
        self.metrics = {
            "r2": r2_score,
            "mse": mean_squared_error,
            "mae": mean_absolute_error,
        }

    def _resolve_scoring(self, scoring):
        """Translate a short metric name into a scikit-learn scoring value.

        Known short names ("r2", "mse", "mae") are mapped through
        ``_SKLEARN_SCORING``; any other string, and any non-string value
        (e.g. a callable scorer), is returned unchanged.
        """
        if isinstance(scoring, str):
            return self._SKLEARN_SCORING.get(scoring, scoring)
        return scoring

    def _count_total_param_combinations(self, params):
        """Return the product of the candidate-list lengths in ``params``.

        Every value in ``params`` must support ``len()``. An empty mapping
        yields 1.
        """
        sizes = [len(v) for v in params.values()]
        return reduce(mul, sizes, 1)

    def tune_model(self, model, params, x_train, y_train, scoring, n_iter=20):
        """Run a randomized hyperparameter search and return the winner.

        Args:
            model: Estimator to tune.
            params: Mapping of parameter name -> list of candidate values.
            x_train: Training features.
            y_train: Training target.
            scoring: Short metric name ("r2", "mse", "mae") or any value that
                scikit-learn accepts for ``scoring``.
            n_iter: Upper bound on sampled combinations; it is capped at the
                size of the grid.

        Returns:
            Tuple ``(best_estimator, best_params)`` from the search.
        """
        total_combinations = self._count_total_param_combinations(params)
        # Asking for more samples than the grid holds makes sklearn warn.
        n_iter = min(n_iter, total_combinations)

        search = RandomizedSearchCV(
            model,
            params,
            n_iter=n_iter,
            cv=3,
            # Short names such as "mse" are rejected by scikit-learn, so map
            # them to the corresponding scoring string first.
            scoring=self._resolve_scoring(scoring),
            n_jobs=-1,
            random_state=42,
            error_score="raise"
        )
        search.fit(x_train, y_train)
        return search.best_estimator_, search.best_params_

    def initiate_model_trainer(self):
        """Load the data, tune the configured model, evaluate and save it.

        Writes ``<model_name>.pkl`` and ``<model_name>_best_params.json``
        into ``config.root_dir``.

        Returns:
            The test-set score computed with the configured metric. The raw
            metric value is returned, so for "mse" and "mae" lower is better.

        Raises:
            KeyError: If ``config.model_name`` is not a registered model.
        """
        logging.info("Loading training and test data")
        train_df = pd.read_csv(self.config.trained_data_path)
        test_df = pd.read_csv(self.config.test_data_path)

        target_column = self.config.target_column
        x_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
        x_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

        logging.info("Training and tuning models")
        params = self.config.params
        model_name = self.config.model_name
        if model_name not in self.models:
            # Same exception type as a plain dict lookup, but with a message
            # that tells the user which names are valid.
            raise KeyError(
                f"Unknown model '{model_name}'; expected one of {sorted(self.models)}"
            )
        model = self.models[model_name]

        # Tune model
        tuned_model, tuned_params = self.tune_model(
            model, params, x_train, y_train,
            scoring=self.config.evaluation_metric,
            n_iter=30
        )

        # Make predictions and evaluate
        preds = tuned_model.predict(x_test)
        metric_name = self.config.evaluation_metric
        metric_fn = self.metrics.get(metric_name)
        if metric_fn is None:
            # Keep the r2 fallback, but make it visible instead of silent.
            logging.info(
                f"Unknown evaluation metric '{metric_name}'; reporting r2 on the test set instead"
            )
            metric_fn = r2_score
        score = metric_fn(y_test, preds)

        logging.info(f"{self.config.model_name} -> Score: {score:.4f}, Best Params: {tuned_params}")

        # Save model
        model_path = os.path.join(self.config.root_dir, f"{self.config.model_name}.pkl")
        joblib.dump(tuned_model, model_path)

        # Save best params
        params_file = os.path.join(self.config.root_dir, f"{self.config.model_name}_best_params.json")
        with open(params_file, "w") as f:
            json.dump({"model_name": self.config.model_name, **tuned_params}, f, indent=4)

        logging.info(f"Model saved at: {model_path}")
        logging.info(f"Params saved at: {params_file}")

        return score


In [13]:
# Metric names for which a smaller score means a better model. Any other
# metric name (e.g. "r2") is treated as higher-is-better.
LOWER_IS_BETTER_METRICS = {"mse", "mae"}

config = ConfigurationManager()
model_trainer_configs = config.get_model_trainer_configs()

# Train every configured model; a failing model is reported and skipped so
# the remaining ones still run.
results = []
for model_name, trainer_config in model_trainer_configs.items():
    try:
        model_trainer = ModelTrainer(config=trainer_config)
        score = model_trainer.initiate_model_trainer()
        results.append((model_name, score))
    except Exception as e:
        print(f"⚠️ Skipping {model_name} due to error: {e}")

# Print summary
print("\n📊 Model Results:")
for name, score in results:
    print(f"{name:15} -> {score:.4f}")

# Without this guard, picking the best of an empty list fails with the
# uninformative "max() arg is an empty sequence".
if not results:
    raise ValueError("No model was trained successfully; see the skip messages above.")

# Pick best: error metrics are minimised, everything else is maximised.
# All trainer configs are built with the same evaluation metric, so the
# first one is representative.
metric_name = next(iter(model_trainer_configs.values())).evaluation_metric
pick_best = min if metric_name in LOWER_IS_BETTER_METRICS else max
best_model, best_score = pick_best(results, key=lambda x: x[1])
print(f"\n✅ Best model: {best_model} with score {best_score:.4f}")


2025-09-05 11:54:33,944 - my_project - INFO - YAML file 'config\config.yaml' read successfully.
2025-09-05 11:54:33,963 - my_project - INFO - YAML file 'params.yaml' read successfully.
2025-09-05 11:54:33,976 - my_project - INFO - YAML file 'schema.yaml' read successfully.
2025-09-05 11:54:33,982 - my_project - INFO - Directory created: 'artifacts'
2025-09-05 11:54:33,982 - my_project - INFO - Directory created: 'artifacts\model_trainer'
2025-09-05 11:54:33,995 - root - INFO - Loading training and test data


2025-09-05 11:54:34,090 - root - INFO - Training and tuning models
2025-09-05 11:54:47,514 - root - INFO - elasticnet -> Score: 0.4139, Best Params: {'random_state': 42, 'l1_ratio': 0.1, 'alpha': 0.001}
2025-09-05 11:54:47,534 - root - INFO - Model saved at: artifacts\model_trainer\elasticnet.pkl
2025-09-05 11:54:47,541 - root - INFO - Params saved at: artifacts\model_trainer\elasticnet_best_params.json
2025-09-05 11:54:47,545 - root - INFO - Loading training and test data
2025-09-05 11:54:47,582 - root - INFO - Training and tuning models
2025-09-05 11:55:23,821 - root - INFO - randomforest -> Score: 0.5315, Best Params: {'random_state': 42, 'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}
2025-09-05 11:55:24,327 - root - INFO - Model saved at: artifacts\model_trainer\randomforest.pkl
2025-09-05 11:55:24,331 - root - INFO - Params saved at: artifacts\model_trainer\randomforest_best_params.json
2025-09-05 11:55:24,331 - root - 