# Price Scraping Pipeline - Quick Exploration

Just checking that the mock scrapers + ETL pipeline work correctly before wiring up Airflow.
Obviously these are hardcoded prices so the results don't mean much, but the pipeline is the point.

In [None]:
import sys
sys.path.insert(0, "..")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from src.scrapers.retailers import AmazonScraper, WalmartScraper, CPI_CATEGORIES
from src.models.nowcast import InflationNowcaster, NowcastConfig, CPI_WEIGHTS

# Apply a whitegrid theme to every figure drawn in this notebook.
# NOTE(review): the "seaborn-v0_8-*" style names are only shipped by newer
# matplotlib releases — confirm the pinned matplotlib version provides it.
plt.style.use("seaborn-v0_8-whitegrid")
%matplotlib inline

## 1. Run the mock scrapers

Let's pull data from both Amazon and Walmart mock scrapers and see what we get.

In [None]:
# Instantiate both mock scrapers, then flatten every product they return
# (retailer -> category -> product) into a list of plain dicts.
amazon, walmart = AmazonScraper(), WalmartScraper()

all_products = [
    product.model_dump()
    for retailer in (amazon, walmart)
    for category in retailer.get_categories()
    for product in retailer.scrape_category(category)
]

# One row per scraped product; print a quick sanity summary and preview.
df = pd.DataFrame(all_products)
print(f"Total products scraped: {len(df)}")
print(f"Retailers: {df['retailer'].unique()}")
print(f"Categories: {df['category'].unique()}")
df.head(10)

## 2. Basic stats

The mock data obviously isn't real prices, but let's check that the pipeline works.

In [None]:
# Each section is emitted as banner / table (/ blank separator) in one print
# call; sep="\n" reproduces the line-per-print layout.
print(
    "--- Products per retailer ---",
    df.groupby("retailer")["product_id"].count(),
    "",
    sep="\n",
)

# The per-category grouping feeds both the row counts and the price summary.
per_category = df.groupby("category")
print(
    "--- Products per CPI category ---",
    per_category["product_id"].count(),
    "",
    sep="\n",
)
print(
    "--- Price stats by category ---",
    per_category["price"].describe().round(2),
    sep="\n",
)

## 3. Price distributions by category

In [None]:
# One histogram panel per CPI category, laid out side by side.
categories = df["category"].unique()
n_panels = len(categories)

# squeeze=False always returns a 2-D grid of axes, so taking its single row
# works uniformly even when there is only one category.
fig, axes_grid = plt.subplots(
    1,
    n_panels,
    figsize=(4 * n_panels, 4),
    sharey=False,
    squeeze=False,
)
axes = axes_grid[0]

for panel, category in zip(axes, categories):
    prices = df.loc[df["category"] == category, "price"]
    panel.hist(prices, bins=8, edgecolor="black", alpha=0.7)
    panel.set_title(category.title())
    panel.set_xlabel("Price ($)")

# Only the left-most panel carries the shared y-axis label.
axes[0].set_ylabel("Count")
fig.suptitle("Price Distribution by CPI Category", y=1.02, fontsize=13)
plt.tight_layout()
plt.show()

# Grocery shows the widest spread, which is expected:
# bananas at $0.58 vs ground beef at ~$7.

## 4. Simulated price time series

The scrapers add random noise each time you call them, so let's simulate 30 days of daily scraping to get a time series.

In [None]:
# Simulate daily scraping by re-running every mock scraper once per simulated
# day and stamping each observation with that day's date.
n_days = 30  # single source of truth for the loop bound and the summary line

# NOTE(review): this seeds only NumPy's global RNG. If the mock scrapers draw
# their noise from the stdlib `random` module instead, the generated series
# will not be reproducible — confirm against src.scrapers.retailers.
np.random.seed(42)
ts_rows = []
base_date = datetime(2025, 1, 1)

for day in range(n_days):
    current_date = base_date + timedelta(days=day)
    # New scraper instances are created for each simulated day.
    for scraper in [AmazonScraper(), WalmartScraper()]:
        for cat in scraper.get_categories():
            for p in scraper.scrape_category(cat):
                row = p.model_dump()
                # Set the row's timestamp to the simulated day.
                row["timestamp"] = current_date
                ts_rows.append(row)

ts_df = pd.DataFrame(ts_rows)
ts_df["timestamp"] = pd.to_datetime(ts_df["timestamp"])
print(f"Generated {len(ts_df)} price observations over {n_days} days")

In [None]:
# Mean price per (calendar day, category), flattened into a three-column frame.
day_key = ts_df["timestamp"].dt.date
mean_price = ts_df.groupby([day_key, "category"])["price"].mean()
daily_avg = mean_price.reset_index()
daily_avg.columns = ["date", "category", "avg_price"]

# One line per category on a shared axis.
fig, ax = plt.subplots(figsize=(10, 5))
for category in daily_avg["category"].unique():
    series = daily_avg.loc[daily_avg["category"] == category]
    ax.plot(
        series["date"],
        series["avg_price"],
        marker=".",
        label=category,
        linewidth=1.5,
    )

ax.set(
    xlabel="Date",
    ylabel="Average Price ($)",
    title="Daily Average Price by Category (Mock Data)",
)
ax.legend(loc="best", fontsize=9)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Recreation (electronics) sits far above every other category; consider
# normalizing or giving it a separate y-axis later.

## 5. CPI Category Weights

These are the BLS basket weights we use for the weighted price index. Housing dominates at 42.4%.

In [None]:
# One row per basket category, sorted ascending by weight so the largest
# weight ends up as the top bar of the horizontal chart.
weights_df = pd.DataFrame(
    list(CPI_WEIGHTS.items()), columns=["category", "weight"]
).sort_values("weight", ascending=True)

fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(weights_df["category"], weights_df["weight"], color="steelblue", edgecolor="black")
ax.set_xlabel("Weight")
ax.set_title("CPI Basket Weights (BLS Approximation)")

# Write each weight as a percentage just to the right of its bar; the bar
# positions follow the sorted row order (0, 1, 2, ...).
for i, row in enumerate(weights_df.to_dict("records")):
    ax.text(row["weight"] + 0.005, i, f"{row['weight']:.1%}", va="center", fontsize=9)

plt.tight_layout()
plt.show()

## 6. Run a nowcast computation

Let's feed the simulated time series into the `InflationNowcaster` and see what comes out. Since we only have 30 days of mock data with random +/-5% noise, the inflation estimate will basically be noise too.

In [None]:
# The nowcaster expects a 'date' column, so derive one from the timestamps.
ts_df["date"] = ts_df["timestamp"].dt.date

# min_observations is lowered to 3 because the mock data set is small.
config = NowcastConfig(base_period="2025-01-01", smoothing_window=7, min_observations=3)
nowcaster = InflationNowcaster(config)
nowcaster.load_data(ts_df)

# Nowcast as of the last simulated day.
result = nowcaster.compute_nowcast(as_of_date=datetime(2025, 1, 30))

# Headline figures, a blank separator, then the per-category header.
summary_lines = (
    f"Price Index: {result.price_index:.2f}",
    f"Inflation Rate: {result.inflation_rate:.2f}%",
    f"Observations: {result.observation_count}",
    f"Confidence Interval: ({result.confidence_interval[0]:.2f}, {result.confidence_interval[1]:.2f})",
    "",
    "Category breakdown:",
)
print("\n".join(summary_lines))

# Categories without a recorded contribution are reported as 0.
contributions = result.category_contributions
for cat, idx in result.category_indices.items():
    contrib = contributions.get(cat, 0)
    print(f"  {cat:15s} index={idx:6.2f}  contribution={contrib:+.3f}pp")

The numbers are basically random since the mock scrapers just add uniform noise to hardcoded base prices. In production the idea is that real price changes would show up as actual inflation signal.

## 7. Quick ARIMA fit on sample data

Let's generate a longer fake inflation series and see if ARIMA can at least fit it. Using synthetic data with a slight upward trend + seasonality.

In [None]:
from src.models.forecast import InflationForecaster, ForecastConfig

# Build ~3 years of synthetic monthly inflation:
# linear trend (2.0 -> 3.5) + 12-month sine cycle + Gaussian noise.
np.random.seed(123)
n_months = 36
month_idx = np.arange(n_months)
dates = pd.date_range("2022-01-01", periods=n_months, freq="MS")
trend = np.linspace(2.0, 3.5, n_months)
seasonal = 0.3 * np.sin(2 * np.pi * month_idx / 12)
noise = np.random.normal(loc=0, scale=0.15, size=n_months)
inflation_series = trend + seasonal + noise

sample_data = pd.DataFrame({"date": dates, "inflation_rate": inflation_series})

# Fixed-order ARIMA configuration (automatic order selection switched off).
arima_settings = {
    "model_type": "arima",
    "forecast_horizon": 6,
    "auto_order": False,
    "arima_order": (1, 1, 1),
    "seasonal_order": (1, 0, 1, 12),
}
fc_config = ForecastConfig(**arima_settings)
forecaster = InflationForecaster(fc_config)
forecaster.fit(sample_data)

# Produce the forecast and list it month by month.
forecast = forecaster.forecast()
print("Forecast:")
for d, v in zip(forecast.dates, forecast.values):
    print(f"  {d.strftime('%Y-%m')}: {v:.2f}%")

In [None]:
# Overlay the synthetic history, the forecast path, and a shaded band between
# the forecaster's lower and upper confidence bounds.
fig, ax = plt.subplots(figsize=(10, 4))

history_x = sample_data["date"]
history_y = sample_data["inflation_rate"]
ax.plot(history_x, history_y, "b-o", markersize=4, label="Historical")
ax.plot(forecast.dates, forecast.values, "r--s", markersize=4, label="Forecast")
ax.fill_between(
    forecast.dates,
    forecast.confidence_lower,
    forecast.confidence_upper,
    color="red",
    alpha=0.15,
    label="95% CI",
)

ax.set(
    xlabel="Date",
    ylabel="Inflation Rate (%)",
    title="ARIMA(1,1,1) Forecast on Synthetic Data",
)
ax.legend()
plt.tight_layout()
plt.show()

# This only fits noise + trend on fake data; the point is that the
# forecaster module runs end-to-end.

## Summary

Everything seems to work:
- Mock scrapers return valid `PriceData` objects with Pydantic validation
- CPI category mapping works (Amazon/Walmart categories -> BLS categories)
- Nowcaster computes a weighted price index from the scraped data
- ARIMA forecaster fits and produces forecasts with confidence intervals

Next steps: hook this up to the Airflow DAG and add Great Expectations validation checks.