In [27]:
import pandas as pd

In [28]:
df=pd.read_csv('data\Retail_Dataset2.csv', parse_dates=['Date'])

In [29]:
df.head()

Unnamed: 0,Product_id,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Open,Promo,StateHoliday,SchoolHoliday,Petrol_price
0,786725,Product_0033,Whse_S,Category_005,2016-01-03,16000,1,0,0,0,91
1,786743,Product_1825,Whse_S,Category_006,2016-01-03,50000,1,0,0,0,85
2,786967,Product_0551,Whse_S,Category_030,2016-01-03,3000,1,0,0,0,85
3,786856,Product_0556,Whse_S,Category_030,2016-01-03,1000,1,0,0,0,93
4,899538,Product_1844,Whse_A,Category_018,2016-01-03,7,1,0,0,0,95


In [30]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')

In [31]:
import numpy as np 
df['Order_Demand'] = df['Order_Demand'].clip(lower=0)  # Just in case
df['Order_Demand'] = np.log1p(df['Order_Demand'])  # log(1 + x)

In [32]:
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df['dayofweek'] = df['Date'].dt.dayofweek
df['is_weekend'] = df['dayofweek'].isin([5,6]).astype(int)


In [33]:
df['lag_1'] = df['Order_Demand'].shift(1)
df['rolling_mean_3'] = df['Order_Demand'].shift(1).rolling(3).mean()

In [34]:
df['lag_7'] = df['Order_Demand'].shift(7)
df['rolling_mean_7'] = df['Order_Demand'].shift(1).rolling(window=7).mean()
df['rolling_std_7'] = df['Order_Demand'].shift(1).rolling(window=7).std()


In [38]:
df = df.dropna(subset=['lag_1', 'rolling_mean_3','lag_7','rolling_mean_7','rolling_std_7'])

In [39]:
print(df[['lag_1', 'rolling_mean_3','lag_7','rolling_mean_7','rolling_std_7']].isnull().sum())
print(df['Product_id'].value_counts().head(10))  # Some IDs may only occur once or twice

lag_1             0
rolling_mean_3    0
lag_7             0
rolling_mean_7    0
rolling_std_7     0
dtype: int64
Product_id
899537     1
972379     1
975900     1
1025317    1
1033714    1
1022621    1
869375     1
1026792    1
1000798    1
986149     1
Name: count, dtype: int64


In [40]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Product_Code', 'Warehouse', 'Product_Category', 'StateHoliday']

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))


In [41]:
df['Open'] = df['Open'].astype(int)
df['Promo'] = df['Promo'].astype(int)
df['SchoolHoliday'] = df['SchoolHoliday'].astype(int)

In [42]:
features = [
    'Product_Code', 'Warehouse', 'Product_Category', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'Petrol_price',
    'year', 'month', 'day', 'dayofweek', 'is_weekend',
    'lag_1', 'rolling_mean_3'
]

X = df[features]
y = df['Order_Demand']

In [43]:
print("Start Date:", df['Date'].min())
print("End Date:", df['Date'].max())


Start Date: 2016-01-03 00:00:00
End Date: 2016-11-23 00:00:00


In [None]:
split_date = '2016-10-01'  # Last ~2 months used as test data

X_train = X[df['Date'] < split_date]
X_test = X[df['Date'] >= split_date]
y_train = y[df['Date'] < split_date]
y_test = y[df['Date'] >= split_date]

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")


Train size: (140076, 15), Test size: (29128, 15)


In [15]:
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 1.1457086303577855
RMSE: 1.5375501735662427
R2 Score: 0.7192494271546147




In [46]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=5,
    loss_function='RMSE',
    eval_metric='R2',
    cat_features=cat_cols,
    verbose=100
)
model.fit(X_train, y_train)

0:	learn: 0.0534959	total: 102ms	remaining: 1m 42s
100:	learn: 0.7221330	total: 7.49s	remaining: 1m 6s
200:	learn: 0.7296896	total: 16.7s	remaining: 1m 6s
300:	learn: 0.7333377	total: 24.9s	remaining: 57.9s
400:	learn: 0.7358067	total: 33.8s	remaining: 50.4s
500:	learn: 0.7382398	total: 43.9s	remaining: 43.7s
600:	learn: 0.7405885	total: 54.6s	remaining: 36.2s
700:	learn: 0.7428293	total: 1m 5s	remaining: 28.1s
800:	learn: 0.7444735	total: 1m 16s	remaining: 19.1s
900:	learn: 0.7467006	total: 1m 28s	remaining: 9.7s
999:	learn: 0.7488817	total: 1m 40s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1bd355b1d30>

In [47]:
y_pred = model.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 1.1119514389536622
RMSE: 1.4838365302827043
R2 Score: 0.7385225896101839


