In [None]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Set up logging: configure the root handler once (timestamp - level - message)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Use a module-scoped logger instead of the root logger; records still
# propagate to the root handler configured above, so the output is unchanged.
logger = logging.getLogger(__name__)

class PostPredictionAnalysis:
    """
    Sales-regression workflow built around a Random Forest.

    The methods are designed to be called in this order:
    ``load_data`` -> ``preprocess_data`` -> ``train_model`` ->
    ``evaluate_model`` -> ``plot_feature_importance`` ->
    ``estimate_confidence_interval`` -> ``plot_confidence_intervals``.
    Computation steps raise ``RuntimeError`` when a prerequisite step has not
    been run; plotting steps only log an error in that case.
    """

    def __init__(self, data_path, loss_function='mse'):
        """
        Initializes the PostPredictionAnalysis with the dataset and loss function.

        :param data_path: The path to the dataset (a CSV file).
        :param loss_function: The loss function to use for model evaluation. Options are 'mse' or 'mae'.
        """
        self.data_path = data_path
        self.df = None
        self.model = None
        self.loss_function = loss_function
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.feature_importances = None
        self.predictions = None
        self.confidence_intervals = None

    def load_data(self):
        """
        Loads the dataset from ``self.data_path`` into ``self.df``.

        :raises Exception: Any error raised by ``pandas.read_csv`` is logged and
            re-raised, so a failed load is not mistaken for a successful one.
        """
        try:
            logger.info(f"Loading data from {self.data_path}...")
            self.df = pd.read_csv(self.data_path)
            logger.info("Data loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            # Nothing downstream can work without data; surface the real cause here
            # rather than an AttributeError on None in the next step.
            raise

    def preprocess_data(self):
        """
        Preprocess the data: fill missing values with 0, one-hot encode the
        categorical features and create an 80/20 train/test split.

        The 'Sales' column is the target; 'Sales' and 'Date' are excluded from
        the features.

        :raises RuntimeError: If no data has been loaded yet.
        """
        if self.df is None:
            raise RuntimeError("No data loaded. Call load_data() first.")

        logger.info("Preprocessing data...")
        self.df.fillna(0, inplace=True)  # Simple imputation
        X = self.df.drop(columns=['Sales', 'Date'])  # Features
        y = self.df['Sales']  # Target

        # One-hot encoding for categorical variables
        X = pd.get_dummies(X, drop_first=True)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        logger.info("Data preprocessing completed.")

    def train_model(self):
        """
        Train the regression model using Random Forest Regressor.

        :raises RuntimeError: If the training split has not been created yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("Training data is not available. Call preprocess_data() first.")

        logger.info("Training the model...")
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.model.fit(self.X_train, self.y_train)
        logger.info("Model trained successfully.")

    def evaluate_model(self):
        """
        Evaluate the model on the test split using the selected loss function.

        The test-set predictions are stored in ``self.predictions``.

        :return: The computed loss, or ``None`` when ``self.loss_function`` is
            neither 'mse' nor 'mae' (an error is logged in that case).
        :raises RuntimeError: If the model or the test split is missing.
        """
        if self.model is None or self.X_test is None or self.y_test is None:
            raise RuntimeError(
                "Model or test data is not available. "
                "Call preprocess_data() and train_model() first."
            )

        logger.info(f"Evaluating the model using {self.loss_function}...")

        # Predictions
        y_pred = self.model.predict(self.X_test)
        self.predictions = y_pred

        if self.loss_function == 'mse':
            loss = mean_squared_error(self.y_test, y_pred)
            logger.info(f"Mean Squared Error: {loss}")
        elif self.loss_function == 'mae':
            loss = mean_absolute_error(self.y_test, y_pred)
            logger.info(f"Mean Absolute Error: {loss}")
        else:
            logger.error("Invalid loss function. Use 'mse' or 'mae'.")
            return None
        return loss

    def plot_feature_importance(self):
        """ Plot feature importance from the trained model """
        logger.info("Plotting feature importance...")
        if self.model is None:
            logger.error("Model is not trained. Please train the model first.")
            return

        self.feature_importances = self.model.feature_importances_
        feature_importance_df = pd.DataFrame({
            # The model was fitted on the one-hot encoded frame, so its
            # importances line up with X_train's columns, not with the raw
            # columns of self.df.
            'Feature': self.X_train.columns,
            'Importance': self.feature_importances
        })
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
        plt.xlabel('Importance')
        plt.title('Feature Importance')
        plt.show()

    def estimate_confidence_interval(self, n_bootstrap=1000, alpha=0.05):
        """
        Estimate confidence intervals for the test-set predictions using
        bootstrap sampling.

        For each bootstrap round the training data is resampled with
        replacement, a fresh copy of the model is fitted on the resample and
        used to predict the test set. The interval bounds are the ``alpha/2``
        and ``1 - alpha/2`` percentiles of those predictions. The trained
        ``self.model`` is left untouched. Runtime grows linearly with
        ``n_bootstrap`` because every round fits a full model.

        :param n_bootstrap: Number of bootstrap rounds (must be >= 1).
        :param alpha: Significance level in [0, 1]; 0.05 gives a 95% interval.
        :raises RuntimeError: If the model or the data splits are missing.
        :raises ValueError: If ``n_bootstrap`` or ``alpha`` is out of range.
        """
        if self.model is None or self.X_train is None or self.X_test is None:
            raise RuntimeError(
                "Model or data splits are not available. "
                "Call preprocess_data() and train_model() first."
            )
        # Validate up front so a bad argument does not surface only after the
        # expensive bootstrap loop has finished.
        if n_bootstrap < 1:
            raise ValueError("n_bootstrap must be at least 1.")
        if not 0 <= alpha <= 1:
            raise ValueError("alpha must be between 0 and 1.")

        logger.info("Estimating confidence intervals...")

        # The intervals describe the test-set predictions; compute them if
        # evaluate_model() has not been run yet.
        if self.predictions is None:
            self.predictions = self.model.predict(self.X_test)

        # One seeded generator yields a different (but reproducible) seed per
        # round; a fixed seed would make every resample identical.
        rng = np.random.RandomState(42)
        bootstrap_predictions = np.empty((n_bootstrap, len(self.X_test)))
        for i in range(n_bootstrap):
            seed = int(rng.randint(0, 2**31 - 1))
            X_resampled, y_resampled = resample(self.X_train, self.y_train, random_state=seed)
            # Fit an unfitted copy so the trained self.model is not overwritten.
            bootstrap_model = clone(self.model).set_params(random_state=seed)
            bootstrap_model.fit(X_resampled, y_resampled)
            bootstrap_predictions[i] = bootstrap_model.predict(self.X_test)

        # Calculate confidence intervals (lower and upper bounds)
        lower_bound = np.percentile(bootstrap_predictions, alpha/2 * 100, axis=0)
        upper_bound = np.percentile(bootstrap_predictions, (1 - alpha/2) * 100, axis=0)

        self.confidence_intervals = pd.DataFrame({
            'Prediction': self.predictions,
            'Lower Bound': lower_bound,
            'Upper Bound': upper_bound
        })

        logger.info(f"Confidence intervals estimated at the {100*(1-alpha)}% confidence level.")

    def plot_confidence_intervals(self):
        """ Plot the confidence intervals of the predictions """
        logger.info("Plotting confidence intervals...")
        if self.confidence_intervals is None:
            logger.error("Confidence intervals not estimated. Please run estimate_confidence_interval first.")
            return

        plt.figure(figsize=(10, 6))
        plt.plot(self.confidence_intervals['Prediction'], label='Predictions', color='blue')
        plt.fill_between(range(len(self.confidence_intervals)),
                         self.confidence_intervals['Lower Bound'],
                         self.confidence_intervals['Upper Bound'],
                         color='gray', alpha=0.3, label='Confidence Interval')
        plt.title('Predictions with Confidence Intervals')
        plt.xlabel('Index')
        plt.ylabel('Sales')
        plt.legend()
        plt.show()


# Script entry point: build the analysis object and run each stage in sequence
if __name__ == "__main__":
    # Dataset location, resolved against the current working directory (update with your path)
    data_path = os.path.abspath("../cleaned_data/primary_data.csv")
    # 'mae' is the other supported loss_function value
    model = PostPredictionAnalysis(data_path=data_path, loss_function='mse')

    # Stages are listed in execution order
    pipeline_steps = (
        model.load_data,
        model.preprocess_data,
        model.train_model,
        model.evaluate_model,
        model.plot_feature_importance,
        model.estimate_confidence_interval,
        model.plot_confidence_intervals,
    )
    for run_step in pipeline_steps:
        run_step()

2025-01-10 20:52:08,417 - INFO - Loading data from /mnt/c/Users/Nas/Contacts/Desktop/AIM/kaim-week-4/kaim-week-4/cleaned_data/primary_data.csv...
2025-01-10 20:52:15,024 - INFO - Data loaded successfully!
2025-01-10 20:52:15,025 - INFO - Preprocessing data...
2025-01-10 20:52:16,287 - INFO - Data preprocessing completed.
2025-01-10 20:52:16,290 - INFO - Training the model...
