In [None]:
import pandas as pd 
import matplotlib.pyplot as plt

# Read the sample sales CSV; the "date" column is parsed to datetimes on load.
df = pd.read_csv("../data/sample_sales.csv", parse_dates=["date"])
print(df.head())

# One line per product on a single 10x5 axes, in order of first appearance.
fig, ax = plt.subplots(figsize=(10, 5))
for product in df['product'].unique():
    # Filter once and reuse the subset for both the x and y values.
    product_rows = df.loc[df['product'] == product]
    ax.plot(product_rows['date'], product_rows['units_sold'], label=f'Product: {product}')
ax.set_title("Sales Trend per Product")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
ax.legend()
ax.grid()
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Load data from the same relative location the first cell reads, rather than
# a machine-specific absolute path.
df = pd.read_csv("../data/sample_sales.csv", parse_dates=["date"])

# Sort BEFORE building any lag feature: groupby(...).shift(1) returns the
# previous *row* of the same product, which is only the previous observation
# in time when rows are in chronological order within each product.
df = df.sort_values(by=["product", "date"])

units_by_product = df.groupby("product")["units_sold"]

# Lag feature: previous observation's sales for the same product
# (the previous day, assuming one row per product per day -- TODO confirm).
df["sales_lag_1"] = units_by_product.shift(1)

# Rolling average features over the 3 / 7 observations preceding each row.
# shift(1) keeps the current row's value out of its own average, and running
# the whole computation inside transform() guarantees a window never spans
# two products, independent of row order.
for window in (3, 7):
    df[f"rolling_avg_{window}"] = units_by_product.transform(
        lambda s, w=window: s.shift(1).rolling(window=w).mean()
    )

# Promo flag as categorical (optional but helps with models like XGBoost)
df["promo_flag"] = df["promo"].astype("category")

# Day of the week
df["day_of_week"] = df["date"].dt.dayofweek  # 0 = Monday, 6 = Sunday

# Weekend indicator (1 for Saturday/Sunday, else 0)
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

# Price elasticity proxy: % change in sales per % change in price
df["price_change"] = df.groupby("product")["price"].pct_change()
df["sales_change"] = units_by_product.pct_change()
df["price_elasticity"] = df["sales_change"] / df["price_change"]

# Drop every row that has a NaN (from lag/rolling/pct_change warm-up) or an
# infinity (from dividing by a zero price_change) in a single pass.
# NOTE(review): any row whose price equals the previous row's price has
# price_change == 0, so its elasticity is inf/NaN and the row is dropped here;
# consider imputation if that removes too much data.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Preview
print(df.head())


In [None]:
# Report the earliest and latest dates present in the current df.
earliest_date, latest_date = df["date"].min(), df["date"].max()
print("Date range in df:", earliest_date, "to", latest_date)


In [None]:
from sklearn.metrics import mean_absolute_percentage_error
from datetime import timedelta

# Sort data
df = df.sort_values(["product", "date"])

# Set forecast horizon (last N days as test)
forecast_days = 7

# Per-row cutoff: each product's own latest date minus the horizon. Computing
# it with a grouped transform replaces the loop that grew train_df/test_df
# with repeated pd.concat calls (quadratic copying) and yields the same rows
# in the same order.
cutoff_by_row = df.groupby("product")["date"].transform("max") - timedelta(days=forecast_days)

# Rows on or before the product's cutoff train; rows after it are held out.
train_df = df[df["date"] <= cutoff_by_row].copy()
test_df = df[df["date"] > cutoff_by_row].copy()

# Baseline model: predict sales = previous day's sales (lag-1)
test_df["baseline_pred"] = test_df["sales_lag_1"]

# Evaluate using MAPE
mape = mean_absolute_percentage_error(test_df["units_sold"], test_df["baseline_pred"])
print(f"Baseline Forecast MAPE: {mape:.2%}")

In [None]:
# Same date-range report as earlier, taken from a single min/max aggregation.
first_date, last_date = df["date"].agg(["min", "max"])
print("Date range in df:", first_date, "to", last_date)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error

# Chronological 80/20 split. df is ordered by product and then by date, so the
# split date has to be read from a date-sorted view: indexing df itself at the
# 80% row returns a date belonging to whichever product sits at that position,
# which is not the 80th-percentile date.
dates_sorted = df["date"].sort_values()
split_date = dates_sorted.iloc[int(len(dates_sorted) * 0.8)]

train_df = df[df["date"] < split_date].copy()
test_df = df[df["date"] >= split_date].copy()

# Lag-based baseline
test_df["baseline_pred"] = test_df["sales_lag_1"]

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Encode product labels. The encoder is fitted on the products of train and
# test together: fitting on train only raised
# "ValueError: y contains previously unseen labels" when a product (e.g.
# 'Shampoo') appeared only in the test split.
all_products = pd.concat([train_df["product"], test_df["product"]])
le = LabelEncoder().fit(all_products)

train_df["product_enc"] = le.transform(train_df["product"])
test_df["product_enc"] = le.transform(test_df["product"])

# Define features and target
# NOTE(review): "sales_change" and "price_elasticity" are computed in the
# feature-engineering cell from the current row's units_sold, i.e. from the
# target itself, so they leak the target into the features. Consider lagging
# them by one row per product before trusting the scores below.
feature_cols = [
    "price", "promo", "promo_flag", "day_of_week", "is_weekend",
    "sales_lag_1", "rolling_avg_3", "rolling_avg_7",
    "price_change", "sales_change", "price_elasticity", "product_enc"
]

target = "units_sold"

# Drop NAs
train_df = train_df.dropna(subset=feature_cols + [target])
test_df = test_df.dropna(subset=feature_cols + [target])

# Safety check: RandomForestRegressor.fit raises on a 0-row frame, so fail
# early with an actionable message instead.
if train_df.empty:
    raise ValueError(
        "train_df is empty after dropping NAs. "
        "Please check your data and feature engineering steps"
    )

# Create the prediction column up front so it exists (as NaN) even for
# products that end up being skipped below.
test_df["rf_pred"] = float("nan")

# Train separate models per product
rf_models = {}
for product in train_df["product"].unique():
    train_p = train_df[train_df["product"] == product]
    test_p = test_df[test_df["product"] == product]

    if train_p.empty or test_p.empty:
        print(f"Skipping {product} due to empty train or test set.")
        continue

    # Train model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(train_p[feature_cols], train_p[target])
    rf_models[product] = rf

    # Predict, writing back only into this product's test rows
    test_df.loc[test_p.index, "rf_pred"] = rf.predict(test_p[feature_cols])

# Evaluate on the rows that actually received a prediction; products skipped
# above keep NaN in rf_pred, which the metric would reject.
scored_df = test_df.dropna(subset=["rf_pred"])
if scored_df.empty:
    raise ValueError(
        "No product had both train and test rows, so there are no "
        "Random Forest predictions to evaluate."
    )
mape_rf = mean_absolute_percentage_error(scored_df[target], scored_df["rf_pred"])
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print(f"Random Forest MAPE: {mape_rf:.2%}")

In [None]:
# Python code to visualize actual vs predicted sales

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error

sns.set(style="whitegrid")

# Make sure date is datetime
test_df["date"] = pd.to_datetime(test_df["date"])

# One entry per prediction column: (column, legend label, marker, metric label).
# Driving both the plot and the metric printout from this table keeps the two
# in sync and avoids repeating the same branch per model.
prediction_series = [
    ("baseline_pred", "Predicted (Lag-1)", "x", "Baseline (Lag-1)"),
    ("rf_pred", "Random Forest", "^", "Random Forest"),
]

# Plot per product
for product in test_df["product"].unique():
    product_test = test_df[test_df["product"] == product].copy()

    plt.figure(figsize=(10, 5))
    plt.plot(product_test["date"], product_test["units_sold"], label="Actual Units Sold", marker="o")

    for column, legend_label, marker, _ in prediction_series:
        if column in product_test.columns:
            plt.plot(product_test["date"], product_test[column], label=legend_label, marker=marker)

    plt.title(f"Actual vs Predicted Sales for {product}")
    plt.xlabel("Date")
    plt.ylabel("Units Sold")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Print MAPE after the plot. The value is kept in a loop-local name so the
    # overall `mape_rf` computed in the training cell is not overwritten by
    # the last product's score.
    for column, _, _, metric_label in prediction_series:
        if column not in product_test.columns:
            continue
        # Score only rows that have both an actual and a prediction; the
        # metric raises on NaN input.
        scored_rows = product_test.dropna(subset=["units_sold", column])
        if scored_rows.empty:
            print(f"{product} - {metric_label} MAPE: n/a (no predictions)")
            continue
        product_mape = mean_absolute_percentage_error(scored_rows["units_sold"], scored_rows[column])
        print(f"{product} - {metric_label} MAPE: {product_mape:.2%}")


In [None]:
# Feature Importance Visualization

# Without any fitted model there is nothing to plot; fail with a clear message
# instead of an obscure plotting error on an empty frame.
if not rf_models:
    raise ValueError(
        "rf_models is empty: train the per-product Random Forest models "
        "before plotting feature importances."
    )

# Long-format table with one row per (product, feature), built with a single
# concat rather than growing a DataFrame inside the loop.
importances_df = pd.concat(
    [
        pd.DataFrame({
            "feature": feature_cols,
            "importance": model.feature_importances_,
            "product": product,
        })
        for product, model in rf_models.items()
    ],
    axis=0,
)

# Plotting feature importance per product
plt.figure(figsize=(12, 6))
sns.barplot(data=importances_df, x="importance", y="feature", hue="product")
plt.title("Feature Importance by Product")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.legend(title="Product")
plt.tight_layout()
# Also written to the current working directory as a 300-dpi PNG.
plt.savefig("feature_importance_by_product.png", dpi=300)
plt.show()


Based on the feature importance plot we just generated (from the Random Forest models per product), we can interpret which features are driving predictions for each product as follows:

🔑 Top Features Driving Predictions
🧴 Shampoo:
sales_lag_1 – 📊 Most influential: sales from the previous day are strong predictors.

price_elasticity – 🧮 Indicates customers are sensitive to price changes.

sales_change – 📈 Recent trend in sales (percentage change in units sold from the previous day to the current day).

day_of_week, is_weekend – 🗓️ Suggests buying pattern shifts on certain days (e.g., weekends).

promo – 📣 Promotions matter but less than above.

🧼 Soap:
rolling_avg_7 – 🧠 Customers seem to have steady buying habits over time.

sales_lag_1 – Still a good predictor but not the most important.

is_weekend – 📅 Consumers possibly buy more on weekends.

price_elasticity – Moderate effect — possibly less price sensitive.

promo, sales_change – Minor contributors.

🪥 Toothpaste:
sales_lag_1 – 🥇 Most important, again confirming daily consistency.

rolling_avg_7 – Shows habitual buying over a week.

sales_change – Captures small demand spikes/dips.

price_elasticity – Also important, indicating price sensitivity.

promo, day_of_week – Lesser but non-trivial.

🧠 What Does This Mean?
Lag features (especially sales_lag_1) dominate — users tend to repeat purchase patterns.

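Price sensitivity (price_elasticity) is consistently influential → You can optimize pricing. Caveat: price_elasticity and sales_change are both computed from the same day's units_sold (the value being predicted), so part of their importance reflects target leakage rather than genuine price sensitivity.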

Temporal patterns (day of week, weekend flags) are useful for scheduling promotions.

In [None]:
# Data Augmentation

def augment_data(df_product, n=30, random_state=None):
    """Build a noisy, resampled copy of one product's rows.

    Rows are drawn uniformly with replacement from ``df_product``; every
    int64/float64 column except ``units_sold`` is then scaled by
    ``1 + N(0, 0.05)`` noise, and ``units_sold`` gets an integer offset in
    [-2, 2] and is clipped at 0.

    Parameters
    ----------
    df_product : pandas.DataFrame
        Source rows; must contain a ``units_sold`` column and at least one row.
    n : int, default 30
        Number of rows to return. May be smaller or larger than
        ``len(df_product)``.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for reproducible output; ``None`` draws fresh
        entropy on every call.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the same columns as ``df_product`` and a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``df_product`` has no rows to sample from.
    """
    if len(df_product) == 0:
        raise ValueError("augment_data needs at least one row to sample from")

    rng = np.random.default_rng(random_state)

    # Sample row positions directly. The previous pre-tiling step
    # (concat of n // len(df) copies) did not change the sampling distribution
    # and crashed with an empty concat whenever len(df_product) > n.
    picks = rng.integers(0, len(df_product), size=n)
    df_aug = df_product.iloc[picks].reset_index(drop=True)

    # Add small noise to numerical columns except 'units_sold'
    numeric_cols = [
        col for col in df_aug.columns
        if df_aug[col].dtype in ['int64', 'float64'] and col != 'units_sold'
    ]
    for col in numeric_cols:
        noise = rng.normal(0, 0.05, size=len(df_aug))  # Gaussian noise, std 0.05 (~5%)
        df_aug[col] = df_aug[col] * (1 + noise)

    # Add integer noise to 'units_sold' too; the upper bound 3 is exclusive,
    # so offsets range over -2..+2. Clip so sales never go negative.
    units_noise = rng.integers(-2, 3, size=len(df_aug))
    df_aug['units_sold'] = np.clip(df_aug['units_sold'] + units_noise, 0, None)

    return df_aug

# Apply augmentation per product. The groups are iterated explicitly so each
# sub-frame handed to augment_data still carries its 'product' column;
# DataFrameGroupBy.apply is deprecated (pandas 2.2+) for functions that
# operate on the grouping column, which the value_counts below relies on.
df_augmented = pd.concat(
    [augment_data(product_rows, n=30) for _, product_rows in df.groupby('product')],
    ignore_index=True,
)

print(df_augmented['product'].value_counts())


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np

# Encode categorical variables again for df_augmented
le = LabelEncoder()
df_augmented['product_enc'] = le.fit_transform(df_augmented['product'])

# Re-derive the calendar features from the date column (parsed once).
augmented_dates = pd.to_datetime(df_augmented['date'])
df_augmented['day_of_week'] = augmented_dates.dt.dayofweek
df_augmented['month'] = augmented_dates.dt.month

# Report any model feature that is absent from the augmented frame.
missing = [col for col in feature_cols if col not in df_augmented.columns]
print("❌ Missing columns:", missing)

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Best estimator found for each product, keyed by product name.
best_models = {}

for product in df_augmented['product'].unique():
    print(f"\n🔍 Tuning model for: {product}")

    df_product = df_augmented[df_augmented['product'] == product]

    X = df_product[feature_cols]
    y = df_product['units_sold']

    rf = RandomForestRegressor(random_state=42)
    # 3-fold CV over the grid; the score is negated MAE, hence the sign flip
    # when printing below.
    grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X, y)

    print("✅ Best Params:", grid_search.best_params_)
    print("📉 Best MAE:", -grid_search.best_score_)

    best_models[product] = grid_search.best_estimator_

# Compute RMSE.
# NOTE(review): test_df["rf_pred"] holds predictions from the per-product
# models fitted in the earlier training cell, not from best_models, so this
# number does not reflect the tuning above.
# Rows without a prediction are excluded because the metric rejects NaN.
scored_df = test_df.dropna(subset=["units_sold", "rf_pred"])
rmse = np.sqrt(mean_squared_error(scored_df["units_sold"], scored_df["rf_pred"]))
print(f"Random Forest RMSE: {rmse:.2f}")

# NOTE(review): the cell originally ended with an unfinished "X_sim = "
# assignment, which is a SyntaxError and prevented the whole cell from
# running. It is removed until the intended value is defined.


In [None]:
import os
import joblib

# Models directory, relative to the notebook like the "../data" path used for
# loading, instead of a machine-specific absolute path.
model_dir = os.path.join("..", "models")

# Create the directory if it does not exist yet (no error if it does).
os.makedirs(model_dir, exist_ok=True)

# One pickle per tuned per-product model.
for product, model in best_models.items():
    joblib.dump(model, os.path.join(model_dir, f"best_rf_model_{product}.pkl"))

# The feature column order the models were fitted with.
joblib.dump(feature_cols, os.path.join(model_dir, "feature_cols.pkl"))

# The product label encoder currently bound to `le`.
joblib.dump(le, os.path.join(model_dir, "label_encoder_product.pkl"))
