In [3]:
import pandas as pd

# Read the raw November 2019 event log and parse the event timestamps
df = pd.read_csv('../data/raw/2019-Nov.csv')
df['event_time'] = pd.to_datetime(df['event_time'])

# Keep only the rows whose event type is a purchase (independent copy, so the
# column added below never touches `df`)
is_purchase = df['event_type'] == 'purchase'
purchases = df.loc[is_purchase].copy()

# Calendar date of each purchase, without the time-of-day component
purchases['purchase_date'] = purchases['event_time'].dt.date

# Daily demand: number of non-null user_id entries per (product, date) pair,
# with the columns renamed to the 'ds' / 'y' names Prophet expects
daily_demand = (
    purchases
    .groupby(['product_id', 'purchase_date'])['user_id']
    .count()
    .reset_index()
    .rename(columns={'purchase_date': 'ds', 'user_id': 'y'})
)


In [4]:
# Rank product IDs by how many purchase rows they have; keep the five most frequent
purchase_counts = purchases['product_id'].value_counts()
top_products = purchase_counts.index[:5].tolist()

# Work with a single product for now (automation over several products comes later):
# select the best seller's daily rows and remove the product_id column
best_seller_mask = daily_demand['product_id'] == top_products[0]
prod_df = daily_demand.loc[best_seller_mask].drop(columns=['product_id'])


In [9]:
# Rank products by their summed daily demand (`y`) and keep the ten largest
demand_totals = daily_demand.groupby('product_id')['y'].sum()
ranked_totals = demand_totals.sort_values(ascending=False)
top_products = ranked_totals.iloc[:10].index.tolist()


In [10]:
from prophet import Prophet
import os

# Make sure the destination folder for the per-product CSVs is present
output_dir = '../data/processed/forecast_outputs'
os.makedirs(output_dir, exist_ok=True)

# Fit one Prophet model per top product and export its 30-day outlook
for product_id in top_products:
    # Demand history of this product only, with the product_id column removed
    is_current_product = daily_demand['product_id'] == product_id
    prod_df = daily_demand.loc[is_current_product].drop(columns=['product_id'])

    # Train a default-configured Prophet model on that history
    model = Prophet()
    model.fit(prod_df)

    # Extend the timeline by 30 periods and predict over the whole range,
    # tagging every predicted row with the product it belongs to
    future = model.make_future_dataframe(periods=30)
    forecast = model.predict(future).assign(product_id=product_id)

    # Only the final 30 rows (the appended future periods) are exported
    output = forecast.loc[:, ['ds', 'yhat', 'product_id']].iloc[-30:]
    output.to_csv(f'{output_dir}/forecast_{product_id}.csv', index=False)

    print(f"✅ Forecast saved for Product ID {product_id}")


22:46:35 - cmdstanpy - INFO - Chain [1] start processing
22:46:35 - cmdstanpy - INFO - Chain [1] done processing
22:46:35 - cmdstanpy - INFO - Chain [1] start processing
22:46:35 - cmdstanpy - INFO - Chain [1] done processing
22:46:35 - cmdstanpy - INFO - Chain [1] start processing
22:46:35 - cmdstanpy - INFO - Chain [1] done processing


✅ Forecast saved for Product ID 1004856
✅ Forecast saved for Product ID 1004767


22:46:36 - cmdstanpy - INFO - Chain [1] start processing
22:46:36 - cmdstanpy - INFO - Chain [1] done processing
22:46:36 - cmdstanpy - INFO - Chain [1] start processing
22:46:36 - cmdstanpy - INFO - Chain [1] done processing


✅ Forecast saved for Product ID 1005115
✅ Forecast saved for Product ID 4804056


22:46:36 - cmdstanpy - INFO - Chain [1] start processing
22:46:36 - cmdstanpy - INFO - Chain [1] done processing
22:46:36 - cmdstanpy - INFO - Chain [1] start processing
22:46:36 - cmdstanpy - INFO - Chain [1] done processing


✅ Forecast saved for Product ID 1004833
✅ Forecast saved for Product ID 1002544
✅ Forecast saved for Product ID 1004870


22:46:36 - cmdstanpy - INFO - Chain [1] start processing
22:46:36 - cmdstanpy - INFO - Chain [1] done processing
22:46:36 - cmdstanpy - INFO - Chain [1] start processing
22:46:36 - cmdstanpy - INFO - Chain [1] done processing
22:46:36 - cmdstanpy - INFO - Chain [1] start processing


✅ Forecast saved for Product ID 1005100
✅ Forecast saved for Product ID 1004249


22:46:36 - cmdstanpy - INFO - Chain [1] done processing


✅ Forecast saved for Product ID 1005105


In [8]:
# Export the 30-day forecast of the single best-selling product.
#
# `forecast` must not be reused from whatever cell ran last: after the
# per-product loop it holds the predictions of the LAST product in
# `top_products`, so stamping top_products[0] on it would save another product's
# numbers under the top product's ID. Rebuild the forecast for top_products[0]
# here so the cell gives the same result regardless of earlier kernel state.
top_product_id = top_products[0]
top_history = (
    daily_demand[daily_demand['product_id'] == top_product_id]
    .drop(columns='product_id')
)

top_model = Prophet()
top_model.fit(top_history)
forecast = top_model.predict(top_model.make_future_dataframe(periods=30))

# Add product ID back in
forecast['product_id'] = top_product_id

# Save just future dates (the 30 rows appended by make_future_dataframe)
forecast_out = forecast[['ds', 'yhat', 'product_id']].tail(30)
forecast_out.to_csv('../data/processed/demand_forecast_product_1.csv', index=False)
