In [2]:
# set the path
import os 
path= os.getcwd()

if path.endswith('notebooks'):
    os.chdir("../")

In [5]:
# import dependencies
import mlflow
import numpy as np 
import pandas as pd 
import joblib
from pathlib import Path
from src.Home_Premium_Prediction.utils import create_directories, read_yaml
from src.Home_Premium_Prediction.constants import CONFIG_FILE_PATH
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor

class ModelTrainingConfig:
    def __init__(self, model_training_dir: Path, train_data: Path, test_data: Path,
                 model_path: Path, preprocessor_path: Path):
        self.model_training_dir = model_training_dir
        self.train_data = train_data
        self.test_data = test_data
        self.model_path = model_path
        self.preprocessor_path = preprocessor_path



class ModelTrainingConfigManager:
    def __init__(self, config_file=CONFIG_FILE_PATH):
        self.config_file = read_yaml(config_file)

    def get_model_training_config(self) -> ModelTrainingConfig:
        create_directories([self.config_file['model_training']['model_training_dir']])

        return ModelTrainingConfig(
            model_training_dir=self.config_file['model_training']['model_training_dir'],
            train_data=self.config_file['model_training']['train_data'],
            test_data=self.config_file['model_training']['test_data'],
            preprocessor_path=self.config_file['model_training']['preprocessor_path'],
            model_path=self.config_file['model_training']['model_path']
        )



class ModelTraining:

    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        
        # Define base models
        self.base_models = [
            ('catboost', CatBoostRegressor(verbose=0, iterations=50, thread_count=-1)),
            ('linear', LinearRegression())
        ]
        
        # Stacked model
        self.__stacked_model()

    def __stacked_model(self):
        self.stacked_model = StackingRegressor(
            estimators=self.base_models,
            final_estimator=LinearRegression()
        )

    def read_data(self):
        # Load raw data
        raw_train = pd.read_csv(self.config.train_data)
        raw_test = pd.read_csv(self.config.test_data)

        # Separate target column
        self.y_train = raw_train["Premium"]
        self.X_train = raw_train.drop(columns=["Premium"])

        self.y_test = raw_test["Premium"]
        self.X_test = raw_test.drop(columns=["Premium"])

        # Load preprocessor and transform
        preprocessor = joblib.load(self.config.preprocessor_path)
        self.X_train = preprocessor.transform(self.X_train)
        self.X_test = preprocessor.transform(self.X_test)


    def train_model(self):
        # Train the stacked model
        self.stacked_model.fit(self.X_train, self.y_train)

        # Make predictions on the test set
        self.y_pred = self.stacked_model.predict(self.X_test)

        # Calculate performance metrics
        mse = mean_squared_error(self.y_test, self.y_pred)
        mae = mean_absolute_error(self.y_test, self.y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(self.y_test, self.y_pred)

        # Print performance metrics
        print(f"Model MSE: {mse}")
        print(f"Model MAE: {mae}")
        print(f"Model RMSE: {rmse}")
        print(f"Model R²: {r2}")

        # Save the trained model
        joblib.dump(self.stacked_model, self.config.model_path)
        print(f"Model saved to {self.config.model_path}")

        return mse, mae, rmse, r2

    def log_to_mlflow(self, mse, mae, rmse, r2):
        # Set MLflow experiment and log parameters, metrics, and model
        mlflow.set_experiment("home_premium_prediction")

        with mlflow.start_run():
            mlflow.log_param("base_models", [model[0] for model in self.base_models])
            mlflow.log_param("final_estimator", "LinearRegression")
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2", r2)
            mlflow.sklearn.log_model(self.stacked_model, "stacked_model")
            mlflow.log_artifact(self.config.model_training_dir)

    def run(self):
        # Read the data
        self.read_data()
        
        # Train the model and get the metrics
        mse, mae, rmse, r2 = self.train_model()
        
        # Log the model and metrics to MLflow
        self.log_to_mlflow(mse, mae, rmse, r2)


# The main entry point for the script
if __name__ == "__main__":
    try:
        # Initialize the config manager and retrieve config
        config_manager = ModelTrainingConfigManager(config_file=CONFIG_FILE_PATH)
        config = config_manager.get_model_training_config()

        # Initialize and run the model training
        model_trainer = ModelTraining(config=config)
        model_trainer.run()
    except Exception as e:
        print(e)

created directory at: artifacts/model_training




Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
