In [11]:
import pandas as pd

# Load only 100k rows with relevant columns
df = pd.read_parquet(
    DATA_DIR / "features_rossmann.parquet",
    columns=[
        "Sales", "Store", "DayOfWeek", "Promo", "SchoolHoliday", "StateHoliday",
        "SalesLag1", "SalesLag7", "SalesMA7", "Year", "Month", "Week", "WeekOfYear", "IsWeekend"
    ]
).iloc[:100_000]  # Load only the first 100,000 rows

df = df.dropna(subset=["Sales", "SalesLag1", "SalesLag7", "SalesMA7"])
df["StateHoliday"] = df["StateHoliday"].astype("category").cat.codes


In [12]:
# Optimize data types before dropna
df["Store"] = df["Store"].astype("int32")
df["DayOfWeek"] = df["DayOfWeek"].astype("int8")
df["Promo"] = df["Promo"].astype("int8")
df["SchoolHoliday"] = df["SchoolHoliday"].astype("int8")
df["StateHoliday"] = df["StateHoliday"].astype("category").cat.codes
df["SalesLag1"] = df["SalesLag1"].astype("float32")
df["SalesLag7"] = df["SalesLag7"].astype("float32")
df["SalesMA7"] = df["SalesMA7"].astype("float32")
df["Year"] = df["Year"].astype("int16")
df["Month"] = df["Month"].astype("int8")
df["Week"] = df["Week"].astype("int8")
df["WeekOfYear"] = df["WeekOfYear"].astype("int8")
df["IsWeekend"] = df["IsWeekend"].astype("int8")


In [13]:
# Load and filter to a few stores
df = pd.read_parquet(DATA_DIR / "features_rossmann.parquet")
df = df[df["Store"].isin([1, 2, 3, 4, 5])]


In [35]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib

# -----------------------------
# ✅ Step 1: Path Setup
# -----------------------------
PROJECT_ROOT = Path(r"C:\Users\Arushi Sharma\Documents\retail_demand_forecasting")
DATA_DIR = PROJECT_ROOT / "data"
MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# -----------------------------
# ✅ Step 2: Load Subset of Data
# -----------------------------
columns_to_use = [
    "Sales", "Store", "DayOfWeek", "Promo", "SchoolHoliday", "StateHoliday",
    "SalesLag1", "SalesLag7", "SalesMA7", "Year", "Month", "WeekOfYear", "IsWeekend"
]
df = pd.read_parquet(DATA_DIR / "features_rossmann.parquet", columns=columns_to_use).iloc[:100_000]

# -----------------------------
# ✅ Step 3: Preprocessing
# -----------------------------
df = df.dropna(subset=["Sales", "SalesLag1", "SalesLag7", "SalesMA7"])  # Drop rows with missing values

df["Store"] = df["Store"].astype("int32")
df["DayOfWeek"] = df["DayOfWeek"].astype("int8")
df["Promo"] = df["Promo"].astype("int8")
df["SchoolHoliday"] = df["SchoolHoliday"].astype("int8")
df["StateHoliday"] = df["StateHoliday"].astype("category").cat.codes
df["SalesLag1"] = df["SalesLag1"].astype("float32")
df["SalesLag7"] = df["SalesLag7"].astype("float32")
df["SalesMA7"] = df["SalesMA7"].astype("float32")
df["Year"] = df["Year"].astype("int16")
df["Month"] = df["Month"].astype("int8")
df["WeekOfYear"] = df["WeekOfYear"].astype("int8")
df["IsWeekend"] = df["IsWeekend"].astype("int8")

# -----------------------------------------
# ✅ Step 4: Train-Test Split (NO PRICE)
# -----------------------------------------
FEATURES = [
    "Store", "DayOfWeek", "Promo", "SchoolHoliday", "StateHoliday",
    "SalesLag1", "SalesLag7", "SalesMA7", "Year", "Month", "WeekOfYear", "IsWeekend"
]
TARGET = "Sales"

X = df[FEATURES]
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------------------
# ✅ Step 5: Hyperparameter Tuning
# -----------------------------------------
param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0]
}

xgb = XGBRegressor(random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

# -----------------------------------------
# ✅ Step 6: Evaluate & Save
# -----------------------------------------
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Best Parameters:", random_search.best_params_)
print(f"Tuned Model RMSE: {rmse:.2f}")

# Save trained model and features
joblib.dump(best_model, MODEL_DIR / "xgb_best_model.pkl")
joblib.dump(FEATURES, MODEL_DIR / "xgb_best_model_features.pkl")

print("✅ Model and feature list saved successfully.")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'subsample': 1.0, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Tuned Model RMSE: 881.90
✅ Model and feature list saved successfully.


In [28]:
import shap
import matplotlib.pyplot as plt
from pathlib import Path

# Optional: Set plot directory
PLOT_DIR = Path("plots")
PLOT_DIR.mkdir(exist_ok=True)

# Create SHAP explainer for tuned XGBoost model
explainer = shap.Explainer(best_model, X_train)

# Calculate SHAP values for test data
shap_values = explainer(X_test)

# Plot and save summary plot
plt.figure()
shap.summary_plot(shap_values, X_test, show=False)  # Don't auto-show, so we can save
plt.savefig(PLOT_DIR / "shap_xgb_best_model.png", bbox_inches='tight', dpi=300)
plt.close()

print("✅ SHAP plot saved to:", PLOT_DIR / "shap_xgb_best_model.png")




✅ SHAP plot saved to: plots\shap_xgb_best_model.png


In [29]:
plt.figure()
shap.plots.bar(shap_values, show=False)
plt.savefig(PLOT_DIR / "shap_bar_xgb_best_model.png", bbox_inches='tight', dpi=300)
plt.close()


In [30]:
# 🧪 Price Simulation: Sales Forecast Under Price Changes

import matplotlib.pyplot as plt

# ✅ Use the same features used for training
expected_features = X_train.columns.tolist()

# Ensure "AvgPrice" exists
if "AvgPrice" not in X.columns:
    print("❌ 'AvgPrice' column not found in dataset.")
else:
    variants = {
        "Price -20%": 0.80,
        "Price -10%": 0.90,
        "Price +10%": 1.10,
        "Price +20%": 1.20,
    }

    baseline_preds = best_model.predict(X_test)

    # Plot setup
    plt.figure(figsize=(12, 6))
    plt.plot(baseline_preds[:100], label="Original", linewidth=2)

    for label, factor in variants.items():
        X_sim = X_test.copy()
        X_sim["AvgPrice"] = X_sim["AvgPrice"] * factor
        y_sim = best_model.predict(X_sim)
        plt.plot(y_sim[:100], label=label, linestyle="--")

    plt.title("🧪 Forecast Comparison: Price Simulation")
    plt.xlabel("Sample Index")
    plt.ylabel("Predicted Sales")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


❌ 'AvgPrice' column not found in dataset.


In [31]:
import pandas as pd

summary = []

for label, factor in variants.items():
    X_sim = X_test.copy()
    X_sim["AvgPrice"] = X_sim["AvgPrice"] * factor
    y_sim = best_model.predict(X_sim)
    
    avg_delta = (y_sim - baseline_preds).mean()
    summary.append({"Scenario": label, "Avg Change in Sales": avg_delta})

pd.DataFrame(summary)


NameError: name 'variants' is not defined