In [135]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# LOADING DATASET

In [138]:
train = pd.read_csv("train.csv")
stores = pd.read_csv("stores.csv")
features = pd.read_csv("features.csv")

In [139]:
train.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True


In [140]:
stores.head(2)

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307


In [141]:
features.head(2)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True


# MERGING THE DATASET

In [143]:
df = pd.merge(train, features, on=["Store", "Date"], how="left")


In [144]:
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,False
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,True


In [145]:
df.shape

(421570, 15)

In [146]:
df = pd.merge(df, stores, on="Store", how="left")


In [147]:
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,True,A,151315


In [148]:
df.shape

(421570, 17)

In [149]:
df.isnull().sum()

Store                0
Dept                 0
Date                 0
Weekly_Sales         0
IsHoliday_x          0
Temperature          0
Fuel_Price           0
MarkDown1       270889
MarkDown2       310322
MarkDown3       284479
MarkDown4       286603
MarkDown5       270138
CPI                  0
Unemployment         0
IsHoliday_y          0
Type                 0
Size                 0
dtype: int64

In [150]:
df["Date"] = pd.to_datetime(df["Date"])
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Week"] = df["Date"].dt.isocalendar().week.astype(int)
df["Day"] = df["Date"].dt.day

# CONVERT BOOLEAN HOLIDAY TO INTEGER

In [153]:
df["IsHoliday_y"] = df["IsHoliday_y"].astype(int)


# SORT DATE BY DATE

In [167]:
df=df.sort_values("Date")

In [168]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 421570 entries, 0 to 421569
Data columns (total 21 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          421570 non-null  datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   IsHoliday_x   421570 non-null  bool          
 5   Temperature   421570 non-null  float64       
 6   Fuel_Price    421570 non-null  float64       
 7   MarkDown1     150681 non-null  float64       
 8   MarkDown2     111248 non-null  float64       
 9   MarkDown3     137091 non-null  float64       
 10  MarkDown4     134967 non-null  float64       
 11  MarkDown5     151432 non-null  float64       
 12  CPI           421570 non-null  float64       
 13  Unemployment  421570 non-null  float64       
 14  IsHoliday_y   421570 non-null  int32         
 15  Type          421570 n

In [171]:
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,MarkDown5,CPI,Unemployment,IsHoliday_y,Type,Size,Year,Month,Week,Day
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,...,,211.096358,8.106,0,A,151315,2010,2,5,5
277665,29,5,2010-02-05,15552.08,False,24.36,2.788,,,,...,,131.527903,10.064,0,B,93638,2010,2,5,5


In [173]:
df=df.drop(columns=['IsHoliday_x'])

In [175]:
df=df.rename(columns={'IsHoliday_y':'IsHoliday'})

# HANDLING MISSING VALUES

In [178]:
markdown_cols = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
df[markdown_cols] = df[markdown_cols].fillna(0)

In [180]:
print(df[markdown_cols].isnull().sum())

MarkDown1    0
MarkDown2    0
MarkDown3    0
MarkDown4    0
MarkDown5    0
dtype: int64


# Checking Dupicates

In [183]:
duplicates = df[df.duplicated(keep=False)]
print(duplicates)


Empty DataFrame
Columns: [Store, Dept, Date, Weekly_Sales, Temperature, Fuel_Price, MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5, CPI, Unemployment, IsHoliday, Type, Size, Year, Month, Week, Day]
Index: []


In [185]:
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Type,Size,Year,Month,Week,Day
0,1,1,2010-02-05,24924.5,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,A,151315,2010,2,5,5
277665,29,5,2010-02-05,15552.08,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,0,B,93638,2010,2,5,5
277808,29,6,2010-02-05,3200.22,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,0,B,93638,2010,2,5,5
277951,29,7,2010-02-05,10820.05,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,0,B,93638,2010,2,5,5
278094,29,8,2010-02-05,20055.64,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,0,B,93638,2010,2,5,5


In [187]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 421570 entries, 0 to 421569
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          421570 non-null  datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   Temperature   421570 non-null  float64       
 5   Fuel_Price    421570 non-null  float64       
 6   MarkDown1     421570 non-null  float64       
 7   MarkDown2     421570 non-null  float64       
 8   MarkDown3     421570 non-null  float64       
 9   MarkDown4     421570 non-null  float64       
 10  MarkDown5     421570 non-null  float64       
 11  CPI           421570 non-null  float64       
 12  Unemployment  421570 non-null  float64       
 13  IsHoliday     421570 non-null  int32         
 14  Type          421570 non-null  object        
 15  Size          421570 n

In [189]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# 1. Group by Store and Date to get weekly sales per store
df = df.groupby(['Store', 'Date'], as_index=False).agg({
    'Weekly_Sales': 'sum',
    'IsHoliday': 'first',
    'Temperature': 'mean',
    'Fuel_Price': 'mean',
    'MarkDown1': 'sum',
    'MarkDown2': 'sum',
    'MarkDown3': 'sum',
    'MarkDown4': 'sum',
    'MarkDown5': 'sum',
    'CPI': 'mean',
    'Unemployment': 'mean',
    'Type': 'first',
    'Size': 'first',
    'Year': 'first',
    'Month': 'first',
    'Week': 'first',
    'Day': 'first'
})

In [191]:
df.head(10)

Unnamed: 0,Store,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Year,Month,Week,Day
0,1,2010-02-05,1643690.9,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2010,2,5,5
1,1,2010-02-12,1641957.44,1,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315,2010,2,6,12
2,1,2010-02-19,1611968.17,0,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315,2010,2,7,19
3,1,2010-02-26,1409727.59,0,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315,2010,2,8,26
4,1,2010-03-05,1554806.68,0,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315,2010,3,9,5
5,1,2010-03-12,1439541.59,0,57.79,2.667,0.0,0.0,0.0,0.0,0.0,211.380643,8.106,A,151315,2010,3,10,12
6,1,2010-03-19,1472515.79,0,54.58,2.72,0.0,0.0,0.0,0.0,0.0,211.215635,8.106,A,151315,2010,3,11,19
7,1,2010-03-26,1404429.92,0,51.45,2.732,0.0,0.0,0.0,0.0,0.0,211.018042,8.106,A,151315,2010,3,12,26
8,1,2010-04-02,1594968.28,0,62.27,2.719,0.0,0.0,0.0,0.0,0.0,210.82045,7.808,A,151315,2010,4,13,2
9,1,2010-04-09,1545418.53,0,65.86,2.77,0.0,0.0,0.0,0.0,0.0,210.622857,7.808,A,151315,2010,4,14,9


In [193]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         6435 non-null   int64         
 1   Date          6435 non-null   datetime64[ns]
 2   Weekly_Sales  6435 non-null   float64       
 3   IsHoliday     6435 non-null   int32         
 4   Temperature   6435 non-null   float64       
 5   Fuel_Price    6435 non-null   float64       
 6   MarkDown1     6435 non-null   float64       
 7   MarkDown2     6435 non-null   float64       
 8   MarkDown3     6435 non-null   float64       
 9   MarkDown4     6435 non-null   float64       
 10  MarkDown5     6435 non-null   float64       
 11  CPI           6435 non-null   float64       
 12  Unemployment  6435 non-null   float64       
 13  Type          6435 non-null   object        
 14  Size          6435 non-null   int64         
 15  Year          6435 non-null   int32   

## Linear Regression

In [195]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

features = ['Store' ,'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHoliday','MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5','Size']  # choose based on EDA
X = df[features]
y = df['Weekly_Sales'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("R-squared:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefficients)

R-squared: 0.6979258816177084
RMSE: 311952.81068547204
               Coefficient
Store         -6799.002336
Temperature    1431.517364
Fuel_Price   -41024.770799
CPI           -1759.600666
Unemployment -11965.967785
IsHoliday     46299.800135
MarkDown1         0.005679
MarkDown2         0.005363
MarkDown3         0.121414
MarkDown4         0.021132
MarkDown5         0.087272
Size              6.767176


## Using Non Linear Regression Analysis

## 1)RandomForestRegressor

In [197]:
from sklearn.ensemble import RandomForestRegressor

In [199]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Checkpoint: Cell is running")


Checkpoint: Cell is running


In [201]:
y_pred = model.predict(X_test)

In [203]:
print("R-squared_RFR:", r2_score(y_test, y_pred))
print("RMSE_RFR:", np.sqrt(mean_squared_error(y_test, y_pred)))


R-squared_RFR: 0.9421874261410849
RMSE_RFR: 136471.91858354726


In [205]:
# Feature importance
importances = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["Importance"])
print(importances.sort_values("Importance", ascending=False))

              Importance
Size            0.701606
Store           0.146059
CPI             0.072483
MarkDown3       0.020607
Unemployment    0.019936
Fuel_Price      0.014901
Temperature     0.014842
MarkDown4       0.002459
MarkDown1       0.002170
MarkDown5       0.002069
IsHoliday       0.001717
MarkDown2       0.001151


## 2)HistGradientBoostingRegresssor

In [207]:
from sklearn.ensemble import HistGradientBoostingRegressor
model = HistGradientBoostingRegressor( max_iter=200,
    learning_rate=0.05,
    max_depth=8,
    random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [209]:
print("R-squared_HGBR:", r2_score(y_test, y_pred))
print("RMSE_HGBR:", np.sqrt(mean_squared_error(y_test, y_pred))) 

R-squared_HGBR: 0.9533744105633755
RMSE_HGBR: 122558.74699811089


## 3)Decision Tree Regressor

In [211]:
from sklearn.tree import DecisionTreeRegressor

In [213]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [215]:
print("R-squared:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R-squared: 0.8950093171376846
RMSE: 183910.92335178892


## 4) Bayesian Ridge

In [218]:
from sklearn.linear_model import BayesianRidge

In [220]:
model = BayesianRidge()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [222]:
print("R-squared:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


R-squared: 0.6619282234631638
RMSE: 330017.2207403384
