In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =========================

In [4]:
# 1) Config
# =========================
# Base folder that holds the input CSV and receives the generated charts.
# The original location stays the default; set the AD_RANKING_BASE_DIR
# environment variable to run the notebook elsewhere without editing this cell.
BASE_DIR = os.environ.get("AD_RANKING_BASE_DIR", "/Users/zhangnan/Downloads")
CSV_PATH = os.path.join(BASE_DIR, "ad_vs_ranking.csv")
OUT_DIR = os.path.join(BASE_DIR, "assets")

# Create the output folder up front so the later savefig calls cannot fail on a missing directory.
os.makedirs(OUT_DIR, exist_ok=True)

# =========================

In [5]:
# 2) Load CSV
# =========================
# Read the file at CSV_PATH, then trim leading/trailing whitespace from every
# header so later cells can look columns up by their exact names.
df = pd.read_csv(CSV_PATH)
df.columns = df.columns.str.strip()

# =========================

In [6]:
# 3) Clean & types
# =========================
# Coerce the measure columns to numeric; values that cannot be parsed become NaN
# and are removed by the dropna step below.
for measure_col in ("weekly_spend", "ranking_score_weekly"):
    df[measure_col] = pd.to_numeric(df[measure_col], errors="coerce")

# Nullable integer dtype so the week column can hold missing values until they are dropped.
df["week_of_year"] = pd.to_numeric(df["week_of_year"], errors="coerce").astype("Int64")

required_cols = ["store_id", "week_of_year", "weekly_spend", "ranking_score_weekly"]

# One pipeline, same order as before:
#   1. drop rows missing any essential field
#   2. keep only non-negative spend
#   3. add log(1 + spend) for visualization
#   4. sort by store and week so the per-store shift in the baseline figure
#      walks rows in week order
df = (
    df.dropna(subset=required_cols)
      .loc[lambda frame: frame["weekly_spend"] >= 0]
      .assign(log_spend=lambda frame: np.log1p(frame["weekly_spend"]))
      .sort_values(["store_id", "week_of_year"])
      .reset_index(drop=True)
)

# Optional sanity check (left disabled): list the distinct week_of_year values
# to confirm only the expected weeks are present.
# print(sorted(df["week_of_year"].dropna().unique()))

# =========================

In [9]:
# FIG 1: Scatter (log spend vs ranking) + smooth trend
# =========================
fig, ax = plt.subplots()
ax.scatter(df["log_spend"], df["ranking_score_weekly"], alpha=0.35)
ax.set_xlabel("log(1 + weekly_spend)")
ax.set_ylabel("ranking_score_weekly (0–1)")
ax.set_title("Scatter: weekly spend vs ranking score (Jan–Feb, store-week)")

# Trend overlay: centered rolling mean of the ranking score, computed over rows
# ordered by log spend.
trend = (
    df[["log_spend", "ranking_score_weekly"]]
    .sort_values("log_spend")
    .reset_index(drop=True)
)
window = max(50, int(len(trend) * 0.02))  # about 2% of the rows, never fewer than 50
trend["smooth"] = (
    trend["ranking_score_weekly"]
    .rolling(window=window, center=True)
    .mean()
)
ax.plot(trend["log_spend"], trend["smooth"])

fig.tight_layout()
fig1_path = os.path.join(OUT_DIR, "01_scatter_log_spend_vs_ranking.png")
fig.savefig(fig1_path, dpi=200)
plt.close(fig)


In [10]:
# =========================
# FIG 2: Spend buckets (deciles) -> mean ranking
# =========================
# Bin every row into spend quantiles. duplicates="drop" merges bins whose edges
# coincide, so fewer than 10 buckets can come out.
df["spend_decile"] = pd.qcut(df["weekly_spend"], q=10, duplicates="drop")

# Per-bucket summary: mean ranking, mean spend and row count.
decile_groups = df.groupby("spend_decile", observed=True)
bucket = decile_groups.agg(
    mean_ranking=("ranking_score_weekly", "mean"),
    mean_spend=("weekly_spend", "mean"),
    n=("ranking_score_weekly", "size"),
).reset_index()

# qcut bins are ordered low -> high, so a running 1-based index is the bucket rank.
bucket["bucket_idx"] = range(1, len(bucket) + 1)

fig, ax = plt.subplots()
ax.plot(bucket["bucket_idx"], bucket["mean_ranking"], marker="o")
ax.set_xticks(bucket["bucket_idx"])
ax.set_xticklabels([f"D{i}" for i in bucket["bucket_idx"]])
ax.set_xlabel("Spend bucket (deciles; low -> high)")
ax.set_ylabel("Average ranking_score_weekly")
ax.set_title("Spend buckets: average ranking score by spend decile")
fig.tight_layout()

fig2_path = os.path.join(OUT_DIR, "02_spend_buckets_mean_ranking.png")
fig.savefig(fig2_path, dpi=200)
plt.close(fig)

In [11]:
# =========================
# FIG 3: Baseline segmentation by previous-week ranking (low/mid/high)
# =========================
# Ranking of the preceding row within each store. This relies on df already
# being sorted by (store_id, week_of_year), which the cleaning cell does.
# NOTE(review): shift(1) takes the previous *observed* row for the store, so a
# store with a missing week gets a value from an older week — confirm that weeks
# are contiguous per store.
df["prev_week_ranking"] = df.groupby("store_id")["ranking_score_weekly"].shift(1)

# The first row of every store has no predecessor and is excluded here.
seg = df.dropna(subset=["prev_week_ranking"]).copy()

# Tertiles of prev_week_ranking
cohort_names = ["low_baseline", "mid_baseline", "high_baseline"]
seg["baseline_cohort"] = pd.qcut(
    seg["prev_week_ranking"],
    q=[0, 1/3, 2/3, 1.0],
    labels=cohort_names,
)

# Spend buckets are recomputed on seg, so the bin edges differ from those of FIG 2.
seg["spend_decile"] = pd.qcut(seg["weekly_spend"], q=10, duplicates="drop")

# 1-based x position of every bucket in the full, ordered bin list. All cohorts
# share this mapping, so a cohort with no rows in some bucket leaves a gap
# instead of having its remaining points shifted onto the wrong bucket.
decile_order = list(seg["spend_decile"].cat.categories)
decile_pos = {interval: pos for pos, interval in enumerate(decile_order, start=1)}

cohort_bucket = (
    seg.groupby(["baseline_cohort", "spend_decile"], observed=True)
       .agg(mean_ranking=("ranking_score_weekly", "mean"))
       .reset_index()
)

plt.figure()
for cohort_name in cohort_names:
    cohort_rows = (
        cohort_bucket[cohort_bucket["baseline_cohort"] == cohort_name]
        .sort_values("spend_decile")
    )
    x_pos = [decile_pos[interval] for interval in cohort_rows["spend_decile"]]
    plt.plot(x_pos, cohort_rows["mean_ranking"], marker="o", label=cohort_name)

# Ticks cover every bucket, independent of which cohort was drawn last.
tick_positions = list(range(1, len(decile_order) + 1))
plt.xticks(tick_positions, [f"D{i}" for i in tick_positions])
plt.xlabel("Spend bucket (deciles; low -> high)")
plt.ylabel("Average ranking_score_weekly")
plt.title("Baseline segmentation: spend -> ranking by previous-week cohort")
plt.legend()
plt.tight_layout()

fig3_path = os.path.join(OUT_DIR, "03_baseline_cohort_spend_vs_ranking.png")
plt.savefig(fig3_path, dpi=200)
plt.close()

In [12]:
# Write the spend-bucket summary table next to the charts. The path is defined
# here so this cell works when the notebook is run top to bottom on a fresh kernel.
bucket_table_path = os.path.join(OUT_DIR, "02_spend_buckets_table.csv")
bucket.to_csv(bucket_table_path, index=False)

print("Saved charts to:")
print(fig1_path)
print(fig2_path)
print(fig3_path)
print("Also saved bucket summary table to:")
print(bucket_table_path)

Saved charts to:
/Users/zhangnan/Downloads/assets/01_scatter_log_spend_vs_ranking.png
/Users/zhangnan/Downloads/assets/02_spend_buckets_mean_ranking.png
/Users/zhangnan/Downloads/assets/03_baseline_cohort_spend_vs_ranking.png
Also saved bucket summary table to:
/Users/zhangnan/Downloads/assets/02_spend_buckets_table.csv
