In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("final_ml_dataset.csv", parse_dates=["sale_date"])

assert "target" in df.columns, "Target column missing"
assert df.isnull().sum().sum() == 0, "Dataset contains missing values"

df = df.sort_values("sale_date").reset_index(drop=True)

print("Rows:", df.shape[0])
print("Columns:", df.shape[1])


Rows: 631500
Columns: 29


In [3]:
TARGET = "target"

DROP_COLS = [
    "sale_date",
    "target"
]

X = df.drop(columns=DROP_COLS)
y = df[TARGET]


In [4]:
tscv = TimeSeriesSplit(n_splits=5)


In [None]:
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1
)


In [6]:
mae_scores = []
rmse_scores = []

for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    mae_scores.append(mean_absolute_error(y_val, preds))
    rmse_scores.append(np.sqrt(mean_squared_error(y_val, preds)))

print("Mean MAE:", np.mean(mae_scores))
print("Mean RMSE:", np.mean(rmse_scores))


KeyboardInterrupt: 

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631500 entries, 0 to 631499
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   sale_date                 631500 non-null  datetime64[ns]
 1   quantity_sold             631500 non-null  int64         
 2   day_of_week               631500 non-null  int64         
 3   week_of_year              631500 non-null  int64         
 4   month                     631500 non-null  int64         
 5   is_weekend                631500 non-null  int64         
 6   lag_1                     631500 non-null  float64       
 7   lag_7                     631500 non-null  float64       
 8   lag_14                    631500 non-null  float64       
 9   lag_28                    631500 non-null  float64       
 10  rolling_7_mean            631500 non-null  float64       
 11  rolling_14_mean           631500 non-null  float64       
 12  ro

In [33]:
df["sale_date"] = pd.to_datetime(df["sale_date"], errors="coerce")


In [34]:
split_date = df["sale_date"].quantile(0.8)

train_df = df[df["sale_date"] <= split_date]
test_df  = df[df["sale_date"] > split_date]

X_train = train_df.drop(columns=[target, "sale_date"])
y_train = train_df[target]

X_test = test_df.drop(columns=[target, "sale_date"])
y_test = test_df[target]


In [35]:
from sklearn.preprocessing import StandardScaler

scale_cols = [c for c in scale_cols if c in X_train.columns]

scaler = StandardScaler()
X_train.loc[:, scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test.loc[:, scale_cols] = scaler.transform(X_test[scale_cols])


In [36]:
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

model.fit(X_train, y_train_log)

preds_log = model.predict(X_test)
preds = np.expm1(preds_log)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1863
[LightGBM] [Info] Number of data points in the train set: 505500, number of used features: 15
[LightGBM] [Info] Start training from score 3.753363


In [37]:
X_train = X_train.drop(columns=["quantity_sold"])
X_test  = X_test.drop(columns=["quantity_sold"])


In [38]:
baseline_pred = X_test["lag_7"]


In [39]:
import lightgbm as lgb

model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1688
[LightGBM] [Info] Number of data points in the train set: 505500, number of used features: 14
[LightGBM] [Info] Start training from score 48.723687


In [40]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

preds = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, preds))
print("RMSE:", mean_squared_error(y_test, preds))


MAE: 8.08477226581544
RMSE: 115.14351078697915


In [41]:
print("Baseline MAE:", mean_absolute_error(y_test, baseline_pred))


Baseline MAE: 15.48163492063492


In [42]:
df[["quantity_sold", "target"]].head(10)


Unnamed: 0,quantity_sold,target
0,13,20.0
1,10,16.0
2,12,18.0
3,10,12.0
4,9,11.0
5,12,16.0
6,9,10.0
7,9,15.0
8,7,22.0
9,10,30.0


In [43]:
df[["quantity_sold", "target"]].tail(10)


Unnamed: 0,quantity_sold,target
631490,111,98.0
631491,92,92.0
631492,113,73.0
631493,75,58.0
631494,95,83.0
631495,98,77.0
631496,98,90.0
631497,96,95.0
631498,118,89.0
631499,98,70.0
