In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
from xgboost import XGBRegressor

In [3]:
# Load the cleaned transaction export. InvoiceDate is parsed to datetime so it
# can be used as a resampling index later on.
# StockCode is pinned to str: the codes are alphanumeric identifiers
# (e.g. "85123A"), and leaving the type to inference can produce a column that
# mixes ints and strings, which would split otherwise-identical codes when
# they are counted or compared.
df = pd.read_csv(
    "Data/clean_data_1.csv",
    parse_dates=['InvoiceDate'],
    dtype={'StockCode': str},
)

  df = pd.read_csv("Data/clean_data_1.csv", parse_dates=['InvoiceDate'])


In [None]:
# Row-level revenue: units on the row multiplied by the per-unit price.
df['Revenue'] = df['Quantity'].mul(df['UnitPrice'])

# Pick the product that appears on the largest number of rows.
code_counts = df['StockCode'].value_counts()
stock_code = code_counts.idxmax()
print(f"Forecasting for StockCode: {stock_code}")


Forecasting for StockCode: 85123A


In [13]:
# Keep only the rows of the selected product, then roll them up into one row
# per calendar month (labelled with the month-end date), summing units and
# revenue.
# The month-end frequency is passed as an offset object instead of the 'M'
# string alias: pandas deprecated that alias in 2.2 (renamed to 'ME'), whereas
# the offset object produces the same monthly bins on older and newer versions.
df_prod = df[df['StockCode'] == stock_code]
ts = (
    df_prod.set_index('InvoiceDate')
    .resample(pd.offsets.MonthEnd())
    .agg({'Quantity': 'sum', 'Revenue': 'sum'})
    .reset_index()
)

# Time-based features derived from the month-end timestamp of each row:
#   month_num  - month counter measured from January of the first year in ts
#   month_sin / month_cos - month-of-year mapped onto a 12-step circle
invoice_year = ts['InvoiceDate'].dt.year
invoice_month = ts['InvoiceDate'].dt.month
month_angle = 2 * np.pi * invoice_month / 12

ts['month_num'] = (invoice_year - invoice_year.min()) * 12 + invoice_month
ts['month_sin'] = np.sin(month_angle)
ts['month_cos'] = np.cos(month_angle)

# Train/test split: the final three monthly rows are held out for testing,
# everything before them is used for fitting.
feature_cols = ['month_num', 'month_sin', 'month_cos']
train_ts, test_ts = ts.iloc[:-3], ts.iloc[-3:]

X_train, X_test = train_ts[feature_cols], test_ts[feature_cols]
y_train_qty, y_test_qty = train_ts['Quantity'], test_ts['Quantity']
y_train_rev, y_test_rev = train_ts['Revenue'], test_ts['Revenue']

# Candidate regressors, keyed by the label shown in the results table.
# These are unfitted templates; the ensemble models share random_state=42.
xgb_params = dict(
    random_state=42,
    n_estimators=100,
    learning_rate=0.1,
    objective='reg:squarederror',
)
base_models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(**xgb_params),
}

def _holdout_rmse(estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit an unfitted clone of `estimator` and return its RMSE on the evaluation rows.

    The estimator passed in is never fitted itself; only its clone is.
    """
    fitted = clone(estimator)
    fitted.fit(X_fit, y_fit)
    return np.sqrt(mean_squared_error(y_eval, fitted.predict(X_eval)))


# Score every candidate on both targets (quantity first, then revenue),
# each time starting from a fresh clone of the template estimator.
results = {}
for name, base_model in base_models.items():
    results[name] = {
        'RMSE_Quantity': _holdout_rmse(base_model, X_train, y_train_qty, X_test, y_test_qty),
        'RMSE_Revenue': _holdout_rmse(base_model, X_train, y_train_rev, X_test, y_test_rev),
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\nModel evaluation (RMSE):")
print(results_df)


Model evaluation (RMSE):
                  RMSE_Quantity  RMSE_Revenue
LinearRegression    4259.498684   6344.636927
RandomForest        1667.305916   4797.690451
GradientBoosting    1634.450746   4389.533610
XGBoost             1642.416132   5118.525901


In [14]:
# Select, for each target, the model with the lowest hold-out RMSE.
# (The rest of this cell refits those models on the full series and
# forecasts the next 3 months.)
best_by_metric = results_df.idxmin()
best_qty_name = best_by_metric['RMSE_Quantity']
best_rev_name = best_by_metric['RMSE_Revenue']
print(f"\nBest model for Quantity: {best_qty_name}")
print(f"Best model for Revenue: {best_rev_name}")

# Refit the selected estimators on every monthly row (training + hold-out).
feature_cols = ['month_num', 'month_sin', 'month_cos']
X_full = ts[feature_cols]
y_full_qty, y_full_rev = ts['Quantity'], ts['Revenue']

# clone() yields an unfitted copy, so the templates in base_models stay untouched.
best_qty = clone(base_models[best_qty_name])
best_qty.fit(X_full, y_full_qty)
best_rev = clone(base_models[best_rev_name])
best_rev.fit(X_full, y_full_rev)

# Build the three month-end dates that follow the last month in ts.
# A MonthEnd offset object is used for the frequency instead of the 'M'
# string alias, which pandas deprecated in 2.2 (renamed to 'ME'); the offset
# object generates the same dates on older and newer versions.
month_end = pd.offsets.MonthEnd()
last_date = ts['InvoiceDate'].max()
future_dates = pd.date_range(last_date + month_end, periods=3, freq=month_end)
future_df = pd.DataFrame({'InvoiceDate': future_dates})

# Recreate the model features for the future rows. month_num is anchored to
# the first year of the historical series (ts), not of future_df, so the
# counter continues from where the training data left off.
future_year = future_df['InvoiceDate'].dt.year
future_month = future_df['InvoiceDate'].dt.month
future_angle = 2 * np.pi * future_month / 12

future_df['month_num'] = (future_year - ts['InvoiceDate'].dt.year.min()) * 12 + future_month
future_df['month_sin'] = np.sin(future_angle)
future_df['month_cos'] = np.cos(future_angle)

# Score the three future rows with the refit models and attach the forecasts.
X_future = future_df.loc[:, ['month_num', 'month_sin', 'month_cos']]
qty_forecast = best_qty.predict(X_future)
rev_forecast = best_rev.predict(X_future)
future_df['Predicted_Quantity'] = qty_forecast
future_df['Predicted_Revenue'] = rev_forecast

print("\nForecast for next 3 months:")
report_cols = ['InvoiceDate', 'Predicted_Quantity', 'Predicted_Revenue']
print(future_df[report_cols])


Best model for Quantity: GradientBoosting
Best model for Revenue: GradientBoosting

Forecast for next 3 months:
  InvoiceDate  Predicted_Quantity  Predicted_Revenue
0  2012-01-31         2286.945963       10201.402788
1  2012-02-29          945.805675        3950.005964
2  2012-03-31         1030.360933        4172.283851
