In [6]:
import pandas as pd

# LOADING DATASET

In [9]:
train = pd.read_csv("train.csv")
stores = pd.read_csv("stores.csv")
features = pd.read_csv("features.csv")

In [10]:
train.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True


In [11]:
stores.head(2)

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307


In [12]:
features.head(2)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True


# MERGING THE DATASET

In [18]:
df = pd.merge(train, features, on=["Store", "Date"], how="left")


In [20]:
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,False
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,True


In [21]:
df.shape

(421570, 15)

In [24]:
df = pd.merge(df, stores, on="Store", how="left")


In [26]:
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,True,A,151315


In [28]:
df.shape

(421570, 17)

In [30]:
df.isnull().sum()

Store                0
Dept                 0
Date                 0
Weekly_Sales         0
IsHoliday_x          0
Temperature          0
Fuel_Price           0
MarkDown1       270889
MarkDown2       310322
MarkDown3       284479
MarkDown4       286603
MarkDown5       270138
CPI                  0
Unemployment         0
IsHoliday_y          0
Type                 0
Size                 0
dtype: int64

# CONVERT DATE TO DATETIME FORMAT

In [33]:
df["Date"] = pd.to_datetime(df["Date"])
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Week"] = df["Date"].dt.isocalendar().week.astype(int)
df["Day"] = df["Date"].dt.day

# CONVERT BOOLEAN HOLIDAY TO INTEGER

In [36]:
df["IsHoliday_x"] = df["IsHoliday_x"].astype(int)


# SORT DATE BY DATE

In [39]:
df=df.sort_values("Date")

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 421570 entries, 0 to 421569
Data columns (total 21 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          421570 non-null  datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   IsHoliday_x   421570 non-null  int32         
 5   Temperature   421570 non-null  float64       
 6   Fuel_Price    421570 non-null  float64       
 7   MarkDown1     150681 non-null  float64       
 8   MarkDown2     111248 non-null  float64       
 9   MarkDown3     137091 non-null  float64       
 10  MarkDown4     134967 non-null  float64       
 11  MarkDown5     151432 non-null  float64       
 12  CPI           421570 non-null  float64       
 13  Unemployment  421570 non-null  float64       
 14  IsHoliday_y   421570 non-null  bool          
 15  Type          421570 n

In [42]:
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,MarkDown5,CPI,Unemployment,IsHoliday_y,Type,Size,Year,Month,Week,Day
0,1,1,2010-02-05,24924.5,0,42.31,2.572,,,,...,,211.096358,8.106,False,A,151315,2010,2,5,5
277665,29,5,2010-02-05,15552.08,0,24.36,2.788,,,,...,,131.527903,10.064,False,B,93638,2010,2,5,5


# HANDLING MISSING VALUES

In [46]:
markdown_cols = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
df[markdown_cols] = df[markdown_cols].fillna(0)

In [48]:
print(df[markdown_cols].isnull().sum())

MarkDown1    0
MarkDown2    0
MarkDown3    0
MarkDown4    0
MarkDown5    0
dtype: int64


# ONE HOT ENCODE CATEGORICAL VARIABLE

In [51]:
df = pd.get_dummies(df, columns=["Type"], drop_first=True)

In [53]:
df.sample(5)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,CPI,Unemployment,IsHoliday_y,Size,Year,Month,Week,Day,Type_B,Type_C
89853,10,18,2011-03-11,713.72,0,64.22,3.63,0.0,0.0,0.0,...,128.3995,8.744,False,126512,2011,3,10,11,True,False
174169,18,58,2011-07-29,4830.0,0,73.85,3.898,0.0,0.0,0.0,...,135.963935,8.89,False,120653,2011,7,30,29,True,False
69263,8,4,2011-06-24,18770.01,0,83.94,3.594,0.0,0.0,0.0,...,218.497948,6.297,False,155078,2011,6,25,24,False,False
354777,38,8,2011-05-13,9145.74,0,70.93,4.202,0.0,0.0,0.0,...,129.089,13.736,False,39690,2011,5,19,13,False,True
82530,9,29,2010-12-17,3749.28,0,49.7,2.869,0.0,0.0,0.0,...,214.933494,6.56,False,125833,2010,12,50,17,True,False


# Checking Dupicates

In [56]:
duplicates = df[df.duplicated(keep=False)]
print(duplicates)


Empty DataFrame
Columns: [Store, Dept, Date, Weekly_Sales, IsHoliday_x, Temperature, Fuel_Price, MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5, CPI, Unemployment, IsHoliday_y, Size, Year, Month, Week, Day, Type_B, Type_C]
Index: []

[0 rows x 22 columns]


In [58]:
df = df.drop('IsHoliday_y', axis=1)
print(df['IsHoliday_x'].value_counts())


IsHoliday_x
0    391909
1     29661
Name: count, dtype: int64


In [60]:
df.head(2)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,MarkDown5,CPI,Unemployment,Size,Year,Month,Week,Day,Type_B,Type_C
0,1,1,2010-02-05,24924.5,0,42.31,2.572,0.0,0.0,0.0,...,0.0,211.096358,8.106,151315,2010,2,5,5,False,False
277665,29,5,2010-02-05,15552.08,0,24.36,2.788,0.0,0.0,0.0,...,0.0,131.527903,10.064,93638,2010,2,5,5,True,False


# Multiple Linear Regression

In [63]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [65]:
features = ['Store', 'Dept','Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHoliday_x','MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5','Size']  # choose based on EDA
X = df[features]
y = df['Weekly_Sales']


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
model = LinearRegression()
model.fit(X_train, y_train)

In [71]:
y_pred = model.predict(X_test)
print("R-squared:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R-squared: 0.08759603547429728
RMSE: 21781.890536091018


In [73]:
coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefficients)

              Coefficient
Store          -85.673696
Dept           111.404928
Temperature     27.365432
Fuel_Price    -571.483842
CPI            -23.877200
Unemployment  -165.852633
IsHoliday_x    671.587689
MarkDown1       -0.015255
MarkDown2        0.008864
MarkDown3        0.117037
MarkDown4        0.036621
MarkDown5        0.083810
Size             0.086061


# Using Non Linear Regression Analysis

## 1)RandomForestRegressor

In [76]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Checkpoint: Cell is running")

# Predict
y_pred = model.predict(X_test)


# Evaluate
print("R-squared:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

# Feature importance
importances = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["Importance"])
print(importances.sort_values("Importance", ascending=False))

Checkpoint: Cell is running
R-squared: 0.953191829596667
RMSE: 4933.587356243589
              Importance
Dept            0.619270
Size            0.191296
Store           0.068479
CPI             0.035835
Temperature     0.022490
MarkDown3       0.019891
Fuel_Price      0.016202
Unemployment    0.014484
IsHoliday_x     0.003775
MarkDown4       0.002456
MarkDown5       0.002292
MarkDown2       0.001794
MarkDown1       0.001737


## 2)HistGradientBoostingRegresssor

In [78]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

model = HistGradientBoostingRegressor( max_iter=200,
    learning_rate=0.05,
    max_depth=8,
    random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("R-squared:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R-squared: 0.8844304102720357
RMSE: 7752.178696033755


In [79]:
from sklearn.inspection import permutation_importance
import pandas as pd

# Compute permutation importance
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)

# Convert to DataFrame
importances_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': result.importances_mean
})

# Sort and display
importances_df = importances_df.sort_values(by='Importance', ascending=False)
print(importances_df)

         Feature  Importance
1           Dept    1.485540
12          Size    0.442309
0          Store    0.109912
4            CPI    0.029981
9      MarkDown3    0.017259
2    Temperature    0.006151
5   Unemployment    0.004277
3     Fuel_Price    0.003582
6    IsHoliday_x    0.002028
10     MarkDown4    0.001551
11     MarkDown5    0.000383
8      MarkDown2    0.000353
7      MarkDown1    0.000032


In [80]:
%%capture
!pip install lightgbm

## 3)lightGBM

In [82]:
import lightgbm as lgb
from lightgbm import early_stopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

features = ['Store', 'Dept','Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHoliday_x','MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5','Size']  # choose based on EDA
X = df[features]
y = df['Weekly_Sales']

# Sample splitting (replace X, y with your data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

# Train the model
model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# Predict
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R-squared: {r2:.4f}")

# Feature importance (analogous to coefficients)
importances = model.feature_importance()
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance_df)


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 5728.02
RMSE: 5728.0209
R-squared: 0.9369

Feature Importances:
         Feature  Importance
1           Dept       12973
0          Store        3962
12          Size        3643
4            CPI        2280
2    Temperature        1767
3     Fuel_Price        1595
5   Unemployment        1315
9      MarkDown3         849
8      MarkDown2         386
10     MarkDown4         366
11     MarkDown5         302
6    IsHoliday_x         287
7      MarkDown1         275
