In [33]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib

In [34]:
# Import util packages
import sys
import os

# Make the parent directory importable so the project-level `utils` module
# resolves. The membership check keeps the cell idempotent: re-running it no
# longer appends the same entry to sys.path again and again.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from utils import open_file

In [48]:
# Read the cleaned train and test files through the shared `open_file` helper
# (train first, then test).
train_data, test_data = [
    open_file(csv_name) for csv_name in ('cleaned_train.csv', 'cleaned_test.csv')
]

  return pd.read_csv(file_path)


In [49]:
# Display the first 5 rows of the train data.
# A bare trailing expression uses the notebook's rich DataFrame rendering
# instead of the line-wrapped plain-text dump that print() produces.
train_data.head()

   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263        555     1      1            0   
1      2          5  2015-07-31   6064        625     1      1            0   
2      3          5  2015-07-31   8314        821     1      1            0   
3      4          5  2015-07-31  13995       1498     1      1            0   
4      5          5  2015-07-31   4822        559     1      1            0   

   SchoolHoliday  Year  Month  Day  Weekday  IsWeekend  
0              1  2015      7   31        4      False  
1              1  2015      7   31        4      False  
2              1  2015      7   31        4      False  
3              1  2015      7   31        4      False  
4              1  2015      7   31        4      False  


In [50]:
# Load the per-store attribute table via the shared `open_file` helper.
# It is joined onto the sales rows by the "Store" column in the merge step.
store_data = open_file('rossmann_data/store.csv')

In [51]:
# Preview the first rows of the store table to check its columns and missing values.
store_data.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [52]:
# Merge Data
# Attach the store attributes to every sales row, joining on the store id.
# validate="many_to_one" makes pandas raise a MergeError if "Store" is not
# unique in store_data; without it a duplicated store row would silently
# multiply the matching sales rows.
# NOTE(review): how="inner" (the pandas default, now spelled out) drops sales
# rows whose Store has no entry in store_data -- confirm that is intended.
data = train_data.merge(store_data, on="Store", how="inner", validate="many_to_one")

In [53]:
# Fill Missing Values
# Step 1: a missing CompetitionDistance is replaced by ten times the largest
#         observed distance (presumably standing in for "no competitor nearby"
#         -- TODO confirm).
# Step 2: every remaining NaN, in any column, becomes 0. This includes text
#         columns such as PromoInterval, which end up holding the integer 0.
data = data.fillna(
    {"CompetitionDistance": data["CompetitionDistance"].max() * 10}
).fillna(0)

In [61]:
# Preview the merged frame to confirm the store columns were attached and filled.
data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,...,IsWeekend,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,2015,...,False,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,2,5,2015-07-31,6064,625,1,1,0,1,2015,...,False,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-31,8314,821,1,1,0,1,2015,...,False,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-31,13995,1498,1,1,0,1,2015,...,False,c,c,620.0,9.0,2009.0,0,0.0,0.0,0
4,5,5,2015-07-31,4822,559,1,1,0,1,2015,...,False,a,a,29910.0,4.0,2015.0,0,0.0,0.0,0


In [54]:
# Feature Engineering
# Parse Date into a datetime column, then derive the calendar features from it.
# assign() evaluates its keyword arguments in order, so each lambda can read
# the columns produced by the ones before it.
data = data.assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    Year=lambda frame: frame["Date"].dt.year,
    Month=lambda frame: frame["Date"].dt.month,
    Weekday=lambda frame: frame["Date"].dt.weekday,
    # dt.weekday counts from Monday=0, so 5 and 6 are Saturday and Sunday
    IsWeekend=lambda frame: frame["Weekday"] >= 5,
)

In [67]:
# Select Features and Target
# Target: the Sales column.
y = data["Sales"]
# Features: everything except the target itself, the Customers count and the
# raw Date column (presumably Customers is excluded because it is not known
# ahead of time -- TODO confirm).
X = data.drop(["Sales", "Customers", "Date"], axis=1)

In [68]:
# Split Data
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [73]:
# Preprocessing Pipeline
# Numeric columns are standardised; categorical columns are one-hot encoded.
# NOTE(review): ColumnTransformer's default remainder setting drops every
# column not listed here, so Store, Open, Promo, SchoolHoliday, etc. never
# reach the regressor -- confirm this feature set is intentional.
numeric_features = ["CompetitionDistance", "Year", "Month", "Weekday"]
categorical_features = ["StoreType", "Assortment"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        # handle_unknown="ignore" encodes a category that was not seen during
        # fit as all zeros instead of raising, so the saved pipeline does not
        # crash at prediction time on a new StoreType/Assortment value.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

In [74]:
# Model Pipeline
# Chain the column preprocessing and the random forest so that fit() and
# predict() always apply the same transformations to the frame they receive.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    # n_jobs=-1 lets the forest use all available cores for fitting and
    # predicting; the number of trees and the seed are unchanged.
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Show the last rows of the training features that will be fed to the pipeline
X_train.tail()

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,Weekday,IsWeekend,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
259178,426,7,0,0,0,0,2014,12,7,6,True,a,a,250.0,0.0,0.0,0,0.0,0.0,0
365838,510,5,1,0,0,1,2014,8,15,4,False,a,c,8260.0,0.0,0.0,0,0.0,0.0,0
131932,363,6,1,0,0,0,2015,4,4,5,True,a,a,250.0,9.0,2009.0,0,0.0,0.0,0
671155,711,4,1,1,0,0,2013,11,7,3,False,d,a,17110.0,3.0,2007.0,1,5.0,2010.0,"Jan,Apr,Jul,Oct"
121958,424,1,1,1,0,0,2015,4,13,0,False,d,c,1250.0,0.0,0.0,1,40.0,2011.0,"Jan,Apr,Jul,Oct"


In [75]:
# Train Model
# Fit `model` on the training split (features X_train, target y_train).
model.fit(X_train, y_train)

In [76]:
# Validate Model
# Predict on the held-out validation features and report the mean absolute
# error against the true validation targets.
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"Validation MAE: {mae}")

Validation MAE: 1614.7132623420944


In [79]:
# Save Model
# Create the target directory first so the dump does not fail when
# ../models does not exist yet (e.g. on a fresh checkout).
os.makedirs("../models", exist_ok=True)
# Persist the fitted pipeline (preprocessing + regressor) as a single artifact.
# The path is a plain string: the original f-string had no placeholders.
joblib.dump(model, "../models/sales_model.pkl")

['../models/sales_model.pkl']