In [2]:
# 1. Import Libraries
# ======================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings("ignore")


In [58]:
# 2. Load Dataset
# ======================================
df = pd.read_csv("sales_forecasting.csv")

In [8]:
print(df)
df.head(10)

            Date  store_id  product_id  units_sold   price  promo_flag  \
17182 2020-07-01         1        1001          23  207.54           0   
43242 2020-08-30         1        1001         111  485.27           0   
467   2021-04-12         1        1001          15  345.92           0   
17659 2021-10-21         1        1001          27  250.21           0   
30954 2022-08-12         1        1001         152  157.74           0   
...          ...       ...         ...         ...     ...         ...   
43643 2021-10-05        50        1100          85  400.61           1   
21830 2022-04-10        50        1100          63  189.38           0   
38865 2022-05-15        50        1100         168  103.22           0   
16873 2022-05-23        50        1100         157   87.85           1   
29881 2022-05-31        50        1100          13  112.17           0   

        revenue  
17182   4773.42  
43242  53864.97  
467     5188.80  
17659   6755.67  
30954  23976.48  
...

Unnamed: 0,Date,store_id,product_id,units_sold,price,promo_flag,revenue
17182,2020-07-01,1,1001,23,207.54,0,4773.42
43242,2020-08-30,1,1001,111,485.27,0,53864.97
467,2021-04-12,1,1001,15,345.92,0,5188.8
17659,2021-10-21,1,1001,27,250.21,0,6755.67
30954,2022-08-12,1,1001,152,157.74,0,23976.48
19100,2020-04-10,1,1002,90,109.87,0,9888.3
34161,2020-06-10,1,1002,106,417.17,1,44220.02
2179,2020-06-28,1,1002,126,295.31,0,37209.06
2224,2020-08-12,1,1002,155,230.62,1,35746.1
20237,2020-08-25,1,1002,8,410.69,0,3285.52


In [98]:
# If Date is not datetime
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['store_id', 'product_id', 'Date'])

In [100]:
# 3. Basic Feature Engineering
# ======================================
# Extract date features
df['day']   = df['Date'].dt.day
df['month'] = df['Date'].dt.month
df['year']  = df['Date'].dt.year
df['week']  = df['Date'].dt.isocalendar().week.astype(int)
df['dayofweek'] = df['Date'].dt.dayofweek  # 0=Monday

AttributeError: Can only use .dt accessor with datetimelike values

In [12]:
# Lag features (previous demand)
df['lag_1'] = df.groupby(['store_id', 'product_id'])['units_sold'].shift(1)
df['lag_7'] = df.groupby(['store_id', 'product_id'])['units_sold'].shift(7)

In [14]:
# Rolling averages
df['roll_7']  = df.groupby(['store_id', 'product_id'])['units_sold'].shift(1).rolling(7).mean()
df['roll_30'] = df.groupby(['store_id', 'product_id'])['units_sold'].shift(1).rolling(30).mean()

In [None]:
print(df.shape)
print(df.head())

In [64]:
df['roll_3'] = df['units_sold'].rolling(3).mean()
df['roll_7'] = df['units_sold'].rolling(7).mean()
df = df.dropna()

In [72]:
# Create features – keep only what data can support
df['lag_1'] = df['units_sold'].shift(1)
df['lag_7'] = df['units_sold'].shift(7)
df['roll_7'] = df['units_sold'].rolling(7).mean()

# Drop missing rows
df = df.dropna()

X = df[['lag_1', 'lag_7', 'roll_7']]
y = df['units_sold']

print("Rows left after feature engineering:", len(df))

from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

Rows left after feature engineering: 49980


In [81]:
#5. Linear Regression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
print(lr_pred)

In [None]:
# 6. MODEL 2 — Random Forest
# ======================================
rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [None]:
print(rf_pred)
print(rf.fit)

In [103]:
# 9. Save Predictions
# ======================================
output = pd.DataFrame({
    "Date": df.iloc[X_test.index]['Date'],
    "store_id": df.iloc[X_test.index]['store_id'],
    "product_id": df.iloc[X_test.index]['product_id'],
    "actual_units_sold": y_test,
    "predicted_units_sold": rf_pred
})

output.to_csv("sales_forecast_output.csv", index=False)

print("\nForecast saved as: sales_forecast_output.csv")



Forecast saved as: sales_forecast_output.csv


In [105]:
# 10. Save Model
# ======================================
import joblib
joblib.dump(rf, "sales_forecast_model.pkl")
print("Model saved as: sales_forecast_model.pkl")

Model saved as: sales_forecast_model.pkl
