# In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import logging

# Set up logging.
# basicConfig attaches a handler to the root logger; the format intentionally
# omits the logger name, so output looks the same whichever logger emits it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Use a module-scoped logger rather than the root logger itself, so that any
# later configuration of `logger` does not leak into third-party libraries.
# Records still propagate to the root handler configured above.
logger = logging.getLogger(__name__)

class ModelBuilder:
    """
    Loads a cleaned CSV, prepares features and trains a random-forest regressor.

    The workflow is ``load_data`` -> ``preprocess_data`` -> ``train_and_evaluate``;
    ``run`` executes all three in order. Failures are reported through the
    module logger instead of being raised, so a failed stage leaves the later
    stages with nothing to do rather than crashing them.
    """

    def __init__(self, base_dir, data_filename="primary_data.csv"):
        """
        Initializes the ModelBuilder object with paths for the primary data.

        :param base_dir: The base directory where the script is located.
        :param data_filename: The name of the primary data file.
        """
        self.base_dir = os.path.abspath(base_dir)
        self.data_folder = os.path.join(self.base_dir, "../cleaned_data")  # Define cleaned data folder
        self.data_file = os.path.join(self.data_folder, data_filename)  # Path to primary_data.csv
        self.df = None
        # Initialised here so train_and_evaluate can test them for None even
        # when loading or preprocessing never produced any data.
        self.X = None
        self.y = None
        # Fitted pipeline from the most recent successful training run.
        self.pipeline = None

    def load_data(self):
        """
        Load the primary data using the defined path.

        On any read error the problem is logged and ``self.df`` is left as
        ``None`` so that later stages skip their work.
        """
        try:
            logger.info(f"Loading data from {self.data_file}...")
            self.df = pd.read_csv(self.data_file)
            logger.info("Data loaded successfully!")
        except Exception as e:
            # Best-effort by design: report and continue. Reset the frame so a
            # failed reload can never be mistaken for freshly loaded data.
            self.df = None
            logger.error(f"Error loading data: {e}")

    def preprocess_data(self):
        """
        Preprocess data by handling missing values and keeping numeric features.

        Rows without a ``Sales`` value are dropped, the ``Date`` and ``Store``
        columns are removed when present, missing numeric values are filled
        with the column mean, and the result is split into ``self.X``
        (numeric/boolean feature columns) and ``self.y`` (the ``Sales`` column).
        When no data is loaded or ``Sales`` is absent, ``self.X`` and
        ``self.y`` are left as ``None``.
        """
        # Invalidate earlier results so a failed run cannot train on stale data.
        self.X = None
        self.y = None

        if self.df is None:
            return

        if "Sales" not in self.df.columns:
            logger.error("Target column 'Sales' not found in the data.")
            return

        # Drop rows with missing target values (Sales)
        frame = self.df.dropna(subset=["Sales"])

        # Drop unnecessary columns; errors="ignore" tolerates files that lack
        # them and makes a second call to this method harmless.
        frame = frame.drop(columns=["Date", "Store"], errors="ignore")

        # Fill missing numerical values with the column mean. numeric_only=True
        # keeps text columns from raising a TypeError while averaging.
        # NOTE: the means are computed before the train/test split.
        frame = frame.fillna(frame.mean(numeric_only=True))

        # 'Sales' is the target column; everything else is a candidate feature.
        features = frame.drop(columns=["Sales"])

        # The scaler and the regressor need numeric input, so text-like columns
        # are left out and reported instead of failing during fit.
        usable = features.select_dtypes(include=["number", "bool"])
        skipped = [col for col in features.columns if col not in usable.columns]
        if skipped:
            logger.warning(f"Ignoring non-numeric feature columns: {skipped}")

        self.df = frame
        self.X = usable
        self.y = frame["Sales"]

        logger.info("Data preprocessing completed!")

    def build_pipeline(self):
        """
        Build a pipeline for the regression model.

        :return: An unfitted Pipeline that standardizes the features and then
                 fits a RandomForestRegressor (100 trees, random_state=42).
        """
        # Define the steps in the pipeline
        steps = [
            ('scaler', StandardScaler()),  # Standardize features
            ('model', RandomForestRegressor(n_estimators=100, random_state=42))  # Random Forest Regressor
        ]

        # Create the pipeline
        return Pipeline(steps)

    def train_and_evaluate(self):
        """
        Train the model using the pipeline and evaluate performance.

        :return: The mean squared error on the held-out 20% test split, or
                 ``None`` when there is no usable preprocessed data.
        """
        if self.X is None or self.y is None:
            logger.error("Data is not preprocessed properly.")
            return None

        # A split needs at least one row on each side and at least one feature.
        if len(self.X) < 2 or self.X.shape[1] == 0:
            logger.error("Not enough data to train and evaluate the model.")
            return None

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )

        # Build the pipeline
        pipeline = self.build_pipeline()

        # Train the model
        logger.info("Training the model...")
        pipeline.fit(X_train, y_train)

        # Make predictions
        y_pred = pipeline.predict(X_test)

        # Calculate performance (mean squared error)
        mse = mean_squared_error(y_test, y_pred)
        logger.info(f"Model performance (MSE): {mse:.4f}")

        # Keep the fitted pipeline so callers can reuse it for predictions.
        self.pipeline = pipeline
        return mse

    def run(self):
        """ Run the entire process of loading, preprocessing, training, and evaluation. """
        logger.info("Starting model building process...")

        self.load_data()
        self.preprocess_data()
        self.train_and_evaluate()

        logger.info("Model building process completed.")

# Entry point: build the model against data located relative to the current
# working directory (the cleaned data folder is resolved from it).
if __name__ == "__main__":
    ModelBuilder(os.getcwd()).run()

# 2025-01-10 20:32:24,716 - INFO - Starting model building process...
# 2025-01-10 20:32:24,720 - INFO - Loading data from /mnt/c/Users/Nas/Contacts/Desktop/AIM/kaim-week-4/kaim-week-4/notebooks/../cleaned_data/primary_data.csv...
# 2025-01-10 20:32:33,613 - INFO - Data loaded successfully!
