In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import sys
import os
import joblib
from datetime import datetime


In [2]:
pd.set_option('display.max_columns', None)
df_train = pd.read_csv(r"C:\Users\ephre\Documents\Rossmann-Pharmaceuticals-Finance-1\Data\Preprocessed_Data\preprocessed_dataset.csv")
df_test = pd.read_csv(r"C:\Users\ephre\Documents\Rossmann-Pharmaceuticals-Finance-1\Data\Test_Data\cleaned_Test_dataset.csv")

In [3]:
df_train.sample(5)

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,IsHoliday,Trend,Seasonal,Sales_MA,DuringPromo,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Month,WeekOfYear,DayOfMonth,IsWeekend,HolidayImpact,Sales_Lag_1,Sales_Lag_7,PromoDuration,PromoEffectiveness,SalesPerCustomer,Customer_Lag_1,Customer_Lag_7,CompetitorDistanceInteraction
178841,601,-1.000476,-0.638407,-0.762569,1,0,0,0,0,-0.21754,-1.527313,-0.325406,0,3,0,0.36317,-1.013639,0.817038,1,-0.86412,-0.740722,1,-1.457193,-1.359039,0.489007,0,0,-0.426976,-0.72102,-1.008237,-0.638407,1.08837,-0.37283,-0.56448,-0.334661
318742,718,-1.501129,1.060328,0.654278,1,1,0,1,0,1.264749,-1.460699,1.174509,1,0,0,-0.560541,-0.386184,0.967824,0,-0.06357,0.104612,1,0.346724,0.511626,1.399377,0,1,0.456938,1.109933,-0.26936,1.060328,0.466467,0.062129,0.755474,-0.061193
370107,1063,-0.499823,-0.39113,-0.196261,1,0,0,0,0,0.236895,-0.407239,0.41652,0,0,2,0.107592,0.554999,0.515467,0,-0.06357,0.104612,0,-0.254581,-0.111929,1.399377,0,0,0.325247,-0.251658,-0.072326,-0.39113,-0.736785,0.057823,-0.402985,-0.325729
669711,1038,-0.499823,-0.195801,-0.372829,1,0,0,0,0,-0.108278,-0.922839,0.071698,0,3,0,1.539862,0.868726,0.666253,0,-0.06357,0.104612,1,0.346724,0.303774,-0.648956,0,0,0.101606,-0.543613,0.331594,-0.195801,0.645929,0.45187,-0.661377,-0.338275
330490,230,1.002138,0.344989,0.626285,1,0,0,0,0,-0.047347,1.852188,-0.030102,0,3,2,0.163378,0.554999,0.666253,0,-0.06357,0.104612,1,0.346724,0.303774,-0.421364,1,0,0.43408,-0.153473,-0.220101,0.344989,-0.822515,0.14826,-0.170432,-0.313575


In [4]:
# Split features (X) and target (y) from training data
X_train = df_train.drop(columns=['Sales'])
y_train = df_train['Sales']

# In the test dataset, exclude 'Sales' (since it's not available)
X_test = df_test.drop(columns=['Id'])


In [5]:
# Build the pipeline
pipeline = Pipeline([ 
    ('model', RandomForestRegressor())    # Random Forest for Regression
])

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model using the pipeline
pipeline.fit(X_train, y_train)

# Make predictions on validation data
y_pred = pipeline.predict(X_val)


In [6]:
# Evaluate the model performance
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 5.520312788022022e-06
R-squared: 0.9999914483474794


In [7]:
# Define parameter grid
param_grid = {
    'model__n_estimators': [100, 125],  # Number of trees
    'model__max_depth': [5, 10]        # Depth of trees
}

# GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best model after tuning
best_pipeline = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Evaluate best model on validation set
y_pred_best = best_pipeline.predict(X_val)
mse_best = mean_squared_error(y_val, y_pred_best)
r2_best = r2_score(y_val, y_pred_best)

print(f"Best Model Mean Squared Error: {mse_best}")
print(f"Best Model R-squared: {r2_best}")


In [None]:
# Make predictions on the test set
y_test_pred = pipeline.predict(X_test)

# Add predictions to the test dataset
df_test['Sales_Predicted'] = y_test_pred

# Save the predictions to CSV
df_test.to_csv(r"C:\Users\ephre\Documents\Rossmann-Pharmaceuticals-Finance-1\Predictions\predicted_sales.csv", index=False)


In [None]:
timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S-%f")[:-3] 

In [None]:
# Define the folder where the models will be saved
model_dir = r"C:\Users\ephre\Documents\Rossmann-Pharmaceuticals-Finance-1\Models"
os.makedirs(model_dir, exist_ok=True)

# Model filename with timestamp
model_filename = f"random_forest_model_{timestamp}.pkl"

# Train the pipeline (assuming it is already built as shown earlier)
pipeline.fit(X_train, y_train)

# Serialize and save the model
joblib.dump(pipeline, os.path.join(model_dir, model_filename))

print(f"Model saved as {model_filename}")
