In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\premium-price-prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ambig\\jupiter_notebook\\Projects\\premium-price-prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path
@dataclass
class ModelEvaluationConfig:
    root_dir : Path
    test_filepath : Path
    train_filepath : Path
    model_path : Path
    

In [6]:
from src.Premium_Price_Prediction.constants import *
from src.Premium_Price_Prediction.utils.common import read_yaml , create_directories,save_object
import warnings
warnings.filterwarnings("ignore")

In [7]:
class ConfigurationManager:
    def __init__(self, 
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])
        
    def get_model_evaluation(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation
        create_directories([config.root_dir])
        
        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            train_filepath=config.train_filepath,
            test_filepath=config.test_filepath,
            model_path=config.model_path,
        )
        return model_evaluation_config


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from typing import Dict
from src.Premium_Price_Prediction import logger
from joblib import dump
import os

class ModelEvaluation():
    def __init__(self, config: ModelEvaluationConfig):
        self.train_filepath = config.train_filepath
        self.test_filepath = config.test_filepath
        self.target_column = 'annual_premium_amount'
        self.model_path = config.model_path
        logger.info("ModelEvaluation initialized with train filepath: %s, test filepath: %s", self.train_filepath, self.test_filepath)

    def evaluate_models(self, X_train, y_train, X_test, y_test, models, param):
        """Evaluate multiple regression models and return their R2 scores."""
        try:
            report = {}
            logger.info("Starting model evaluation for %d models.", len(models))

            for model_name, model in models.items():
                logger.info("Evaluating model: %s", model_name)
                para = param[model_name]

                gs = RandomizedSearchCV(model, para, cv=3)
                gs.fit(X_train, y_train)

                model.set_params(**gs.best_params_)
                model.fit(X_train, y_train)

                y_train_pred = model.predict(X_train)
                y_test_pred = model.predict(X_test)

                train_model_score = r2_score(y_train, y_train_pred)
                test_model_score = r2_score(y_test, y_test_pred)

                report[model_name] = test_model_score
                logger.info(f"{model_name} - Train R2: {train_model_score:.4f}, Test R2: {test_model_score:.4f}")

            logger.info("Model evaluation completed.")
            return report

        except Exception as e:
            logger.error(f"Model Evaluation Error: {str(e)}")
            raise

    def evaluate_model(self):
        """Load data, train models, and evaluate their performance."""
        try:
            logger.info("Loading training and testing data.")
            # Load train and test data
            train_df = pd.read_csv(self.train_filepath)
            test_df = pd.read_csv(self.test_filepath)

            logger.info("Combining training and testing data.")
            # Combine train and test data
            df = pd.concat([train_df, test_df], axis=0)

            # Sample data if necessary (consider the dataset size)
            df = df.sample(10)  # Be cautious with sampling; adjust as necessary

            # Split into features and target
            X = df.drop(self.target_column, axis=1)
            y = df[self.target_column]

            logger.info("Splitting data into training and testing sets.")
            # Split data into train and test sets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Define models and hyperparameters
            models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }

            params = {
                "Decision Tree": {
                    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                },
                "Random Forest": {
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Gradient Boosting": {
                    'learning_rate': [.1, .01, .05, .001],
                    'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Linear Regression": {},
                "XGBRegressor": {
                    'learning_rate': [.1, .01, .05, .001],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "AdaBoost Regressor": {
                    'learning_rate': [.1, .01, 0.5, .001],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                }
            }

            # Evaluate models and get model report
            model_report = self.evaluate_models(X_train, y_train, X_test, y_test, models, params)

            # Get the best model
            best_model_score = max(model_report.values())
            best_model_name = max(model_report, key=model_report.get)
            best_model = models[best_model_name]

            if best_model_score < 0.6:
                raise ValueError("No suitable model found with R2 score above 0.6")

            logger.info(f"Best model: {best_model_name} with R2 score: {best_model_score:.4f}")

            # Save the best model
            self.save_object(file_path=self.model_path, obj=best_model)

            # Final prediction and R2 score on test set
            predicted = best_model.predict(X_test)
            final_r2 = r2_score(y_test, predicted)
            logger.info(f"Final R2 score on test set: {final_r2:.4f}")
            return final_r2

        except Exception as e:
            logger.error(f"Error in model evaluation: {str(e)}")
            raise

    def save_object(self, file_path, obj):
        """Save the model object to the specified file path."""
        if not os.path.exists(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))
        dump(obj, file_path)
        logger.info(f"Model saved to {file_path}")


In [11]:
config = ConfigurationManager()
model_evalution_config = config.get_model_evaluation()

model_evaluation = ModelEvaluation(model_evalution_config)
model_evaluation.evaluate_model()

[2024-11-04 16:22:39,755: INFO: common: 30] YAML file : config\config.yaml loaded successfully
[2024-11-04 16:22:39,757: INFO: common: 30] YAML file : params.yaml loaded successfully
[2024-11-04 16:22:39,760: INFO: common: 30] YAML file : schema.yaml loaded successfully
[2024-11-04 16:22:39,761: INFO: common: 50] Directory artifacts created successfully.
[2024-11-04 16:22:39,762: INFO: common: 50] Directory artifacts/model_evaluation created successfully.
[2024-11-04 16:22:39,763: INFO: 4070258086: 20] ModelEvaluation initialized with train filepath: artifacts/feature_engineering/train_data.csv, test filepath: artifacts/feature_engineering/test_data.csv
[2024-11-04 16:22:39,763: INFO: 4070258086: 57] Loading training and testing data.
[2024-11-04 16:22:39,944: INFO: 4070258086: 62] Combining training and testing data.
[2024-11-04 16:22:39,962: INFO: 4070258086: 73] Splitting data into training and testing sets.
[2024-11-04 16:22:39,965: INFO: 4070258086: 26] Starting model evaluation f

0.8116206631785703