In [1]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
import logging
os.chdir(r'c:\Users\ermias.tadesse\10x\Rossmann-Sales-Forecasting-ML\log')
# Configure logging
logging.basicConfig(filename='store_sales.log', 
                    level=logging.INFO, 
                    format='%(asctime)s:%(levelname)s:%(message)s')
print(os.getcwd())  # This prints the current working directory
os.chdir(r'c:\Users\ermias.tadesse\10x\Rossmann-Sales-Forecasting-ML')  # Set the working directory to the project root
from src.data_loader import DataLoader
os.chdir(r'c:\Users\ermias.tadesse\10x\Rossmann-Sales-Forecasting-ML')  # Set the working directory to the project root

c:\Users\ermias.tadesse\10x\Rossmann-Sales-Forecasting-ML\log


In [2]:
data_loader = DataLoader(data_path='data')
train_df, test_df, store_df, sample_submission_df = data_loader.load_data()
logging.info("Loading train_data, test_data, store_data, and sample_submission_data.")

  self.train = pd.read_csv(f"{self.data_path}/train.csv")


Data Loaded Successfully


## Preprocessing

##### Extract features from the Date column.
##### Handle categorical columns.
##### Handle missing values.
##### Scale the data.

In [3]:
# Convert the 'Date' column to datetime
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])

# Feature engineering from 'Date' column
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df['DayOfWeek'] = train_df['Date'].dt.dayofweek
train_df['IsWeekend'] = train_df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)

test_df['Year'] = test_df['Date'].dt.year
test_df['Month'] = test_df['Date'].dt.month
test_df['Day'] = test_df['Date'].dt.day
test_df['DayOfWeek'] = test_df['Date'].dt.dayofweek
test_df['IsWeekend'] = test_df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)

# Example additional feature: 'IsBeginningOfMonth'
train_df['IsBeginningOfMonth'] = train_df['Day'].apply(lambda x: 1 if x <= 10 else 0)
test_df['IsBeginningOfMonth'] = test_df['Day'].apply(lambda x: 1 if x <= 10 else 0)

# Handle missing values in 'Open' column for the test data
test_df['Open'].fillna(1, inplace=True)  

# Select features to include in the model
numeric_features = ['Day', 'Month', 'Year', 'Customers', 'Promo', 'IsWeekend', 'IsBeginningOfMonth']
categorical_features = ['Store', 'DayOfWeek', 'StateHoliday', 'SchoolHoliday']

# Define the target column for the train data
target_column = 'Sales'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Open'].fillna(1, inplace=True)


### Preprocessing
Convert the Date column to datetime format.
Extract new features from the Date column, such as Year, Month, Day, IsWeekend, and IsBeginningOfMonth.
Handle missing values in the Open column for the test dataset.
Prepare lists of numeric and categorical features.

## Building Models with sklearn Pipelines
Now we'll set up a pipeline that includes:

##### Imputation for missing values.
##### Scaling for numerical features.
##### One-hot encoding for categorical features.
##### A RandomForest Regressor model.

In [7]:
# Preprocessing for numerical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split the train data into features and target
X_train = train_df[numeric_features + categorical_features]
y_train = train_df[target_column]

# Convert categorical features to strings to ensure uniform data types
X_train.loc[:, categorical_features] = X_train[categorical_features].astype(str)

# Fit the model
model_pipeline.fit(X_train, y_train)

# Prepare the test data (features only)
X_test = test_df[numeric_features + categorical_features]

# Make predictions on the test data
predictions = model_pipeline.predict(X_test)

# Check the predictions
print(predictions)


  X_train.loc[:, categorical_features] = X_train[categorical_features].astype(str)
  X_train.loc[:, categorical_features] = X_train[categorical_features].astype(str)
  X_train.loc[:, categorical_features] = X_train[categorical_features].astype(str)


### Sklearn Pipeline
A pipeline is created to preprocess both numeric and categorical features.
I define a RandomForestRegressor model to fit the preprocessed data.
The pipeline is used to fit the model on the training data and make predictions on the test data.