# Amazon Reviews — EDA (neg / neu / pos)

Exploratory Data Analysis for the sentiment dataset used in this project.

**What you get here**
- Sanity checks (versions, device)
- Load `train/val/test.parquet`
- Label distribution (per split and overall)
- Text length analysis (words/chars) with plots
- Basic data quality checks (empties/duplicates)

> ⚠️ **Prereq:** run `python src/data.py` first so that `data/train.parquet`, `data/val.parquet`, and `data/test.parquet` exist.


In [None]:
# Imports & versions
from pathlib import Path
import os, json, math, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from IPython.display import display, Markdown

# Local import: only this version report needs the interpreter version.
import platform

# torch is optional for the EDA; it is only used here to report its version
# and whether the MPS backend is available.
try:
    import torch
    mps_ok = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
except Exception:  # not installed, or the install fails while importing
    torch = None
    mps_ok = False

# Report the actual interpreter version under the "Python:" label.
print("Python:", platform.python_version())
print("pandas:", pd.__version__)
print("numpy:", np.__version__)
if torch is None:
    print("torch: not installed in this kernel")
else:
    print("torch:", torch.__version__, "| MPS available:", mps_ok)

# Matplotlib defaults: 7x4 figures, grid on, top/right spines hidden.
plt.rcParams.update({
    "figure.figsize": (7.0, 4.0),
    "axes.grid": True,
    "axes.spines.top": False,
    "axes.spines.right": False,
})


In [None]:
# Paths & presence checks
# ROOT is the parent of the current working directory.
ROOT = Path("..").resolve()
DATA_DIR = ROOT / "data"
REPORT_DIR = ROOT / "reports" / "figures"
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# All three splits must be on disk before any later cell can run.
required = [DATA_DIR / f"{split}.parquet" for split in ("train", "val", "test")]
missing = [p for p in required if not p.exists()]
if not missing:
    print("✅ Parquet files found.")
else:
    print("❌ Missing files:")
    for p in missing:
        print(" -", p)
    print("\nRun first:  python src/data.py")
    # Stop here instead of failing later with a less obvious read error.
    raise SystemExit(0)


In [None]:
# Helpers to locate text/label columns across variants
# Known names for the review-text column, in priority order.
TEXT_CANDS = ["text", "reviewText", "review_body", "review_text", "summary"]

def get_text_col(df):
    """Return the name of the column holding the review text.

    The known names in ``TEXT_CANDS`` are tried first, in order. If none is
    present, the first string-typed column that is not a label column is used.

    Raises:
        KeyError: if no suitable column exists.
    """
    for c in TEXT_CANDS:
        if c in df.columns:
            return c
    # fallback: first string-like column. Label columns are skipped because
    # string labels (e.g. "neg"/"pos") would otherwise be mistaken for text.
    label_cols = {"label", "label_str", "sentiment"}
    for c in df.columns:
        if c in label_cols:
            continue
        if pd.api.types.is_string_dtype(df[c]):
            return c
    raise KeyError("Couldn't find a text column. Expected one of: " + ", ".join(TEXT_CANDS))

def get_label_series(df):
    """Return the label column of *df* as a Series.

    ``label_str`` is preferred, then ``sentiment``; both are returned as-is.
    Otherwise ``label`` is used, with ids 0/1/2 mapped to "neg"/"neu"/"pos"
    and any other value kept as its string form.

    Raises:
        KeyError: if none of the three columns is present.
    """
    for named in ("label_str", "sentiment"):
        if named in df.columns:
            return df[named]
    if "label" not in df.columns:
        raise KeyError("Couldn't find a label column (label, label_str, or sentiment).")
    raw = df["label"]
    mapped = raw.map({0: "neg", 1: "neu", 2: "pos"})
    # Ids outside the mapping come back as NaN; fall back to their string form.
    return mapped.fillna(raw.astype(str))

def read_split(name):
    """Load ``<DATA_DIR>/<name>.parquet`` into a DataFrame."""
    return pd.read_parquet(DATA_DIR / f"{name}.parquet")

# Read all three splits before showing anything.
splits = {split: read_split(split) for split in ("train", "val", "test")}

# Quick look at each split: shape plus the first three rows.
for k, df in splits.items():
    n_rows, n_cols = df.shape
    print(k.upper(), (n_rows, n_cols))
    display(df.iloc[:3])

# The text column is detected on train and reused for val/test
# (assumes all splits share the same schema).
TEXT_COL = get_text_col(splits["train"])
print("\nUsing text column:", TEXT_COL)


## Label distribution
Per split and combined. Labels expected: `neg`, `neu`, `pos`.

In [None]:
def label_counts(df):
    """Count rows per class, always in neg/neu/pos order (absent classes count as 0)."""
    labels = get_label_series(df)
    return labels.value_counts().reindex(["neg", "neu", "pos"], fill_value=0)

def _dist_row(split_name, counts):
    """Build a one-row frame with class counts, their total, and percentage shares."""
    total = int(counts.sum())
    row = {"split": [split_name], "total": [total]}
    for lbl in ("neg", "neu", "pos"):
        row[lbl] = [counts.get(lbl, 0)]
    for lbl in ("neg", "neu", "pos"):
        row[f"{lbl}_%"] = [counts.get(lbl, 0) / total * 100]
    return pd.DataFrame(row)

# One row per split ...
dist_tbl = []
for name, df in splits.items():
    dist_tbl.append(_dist_row(name, label_counts(df)))

# ... plus one row for all splits pooled together.
all_df = pd.concat([splits["train"], splits["val"], splits["test"]], ignore_index=True)
dist_tbl.append(_dist_row("all", label_counts(all_df)))

column_order = ["split", "total", "neg", "neu", "pos", "neg_%", "neu_%", "pos_%"]
dist = pd.concat(dist_tbl, ignore_index=True)[column_order]
display(dist)

# Bar plot per split: three side-by-side bars (neg, neu, pos) for each split.
fig, ax = plt.subplots(figsize=(7, 4))
X = np.arange(len(splits))
w = 0.25

# Count each split once and reuse the result for all three classes.
counts_per_split = [label_counts(splits[s]) for s in ("train", "val", "test")]
neg = [c.get("neg", 0) for c in counts_per_split]
neu = [c.get("neu", 0) for c in counts_per_split]
pos = [c.get("pos", 0) for c in counts_per_split]

# Shift each class by one bar width around the tick position.
for offset, heights, lbl in ((-w, neg, "neg"), (0.0, neu, "neu"), (w, pos, "pos")):
    ax.bar(X + offset, heights, width=w, label=lbl)

ax.set_xticks(X, ["train", "val", "test"])
ax.set_ylabel("count")
ax.set_title("Label distribution per split")
ax.legend()
plt.tight_layout()

label_fig_path = REPORT_DIR / "eda_label_distribution.png"
plt.savefig(label_fig_path, dpi=160)
plt.show()
print("Saved:", label_fig_path)


## Text length analysis
Histograms of word and character counts per split, plus word counts per class over all splits combined. Counts are clipped at 512 words / 3000 characters.

In [None]:
def word_count(s):
    """Count whitespace-separated tokens in *s*; anything that is not a str counts as 0."""
    return len(s.split()) if isinstance(s, str) else 0

def char_count(s):
    """Return the length of *s* in characters; anything that is not a str counts as 0."""
    return len(s) if isinstance(s, str) else 0

# One figure per split: word counts on the left, character counts on the right.
for name, df in splits.items():
    texts = df[TEXT_COL]
    # Lengths are capped so everything above the cap lands in the last bin.
    wc = texts.map(word_count).clip(upper=512)
    cc = texts.map(char_count).clip(upper=3000)

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.8))
    panels = (
        (axes[0], wc, f"{name} — words (clipped)", "words", "#4C78A8"),
        (axes[1], cc, f"{name} — chars (clipped)", "chars", "#F58518"),
    )
    for ax, lengths, title, unit, color in panels:
        ax.hist(lengths, bins=40, color=color)
        ax.set_title(title)
        ax.set_xlabel(unit)
        ax.grid(True, alpha=0.25)
    # Only the left panel carries the y-axis label.
    axes[0].set_ylabel("count")

    plt.tight_layout()
    out = REPORT_DIR / f"eda_len_{name}.png"
    plt.savefig(out, dpi=160)
    plt.show()
    print("Saved:", out)

# Per-class word count on ALL
# Pool the three splits, then attach the label and the capped word count.
all_df = pd.concat(
    [splits[s] for s in ("train", "val", "test")],
    ignore_index=True,
).copy()
all_df["label_plot"] = get_label_series(all_df)
all_df["wc"] = all_df[TEXT_COL].map(word_count).clip(upper=512)

# Overlay one semi-transparent histogram per class on a single axis.
fig, ax = plt.subplots(figsize=(7, 4))
class_colors = {"neg": "#4C78A8", "neu": "#54A24B", "pos": "#E45756"}
for lbl, color in class_colors.items():
    in_class = all_df["label_plot"] == lbl
    wc_lbl = all_df.loc[in_class, "wc"]
    ax.hist(wc_lbl, bins=40, alpha=0.6, label=lbl, color=color)
ax.set_title("Word count by class (clipped)")
ax.set_xlabel("words")
ax.set_ylabel("count")
ax.legend()
plt.tight_layout()

out = REPORT_DIR / "eda_len_by_class.png"
plt.savefig(out, dpi=160)
plt.show()
print("Saved:", out)


## Data quality checks
- Empty texts
- Duplicates (by text)
- Basic length stats (words/chars)

In [None]:
def is_empty(s):
    """Return True when *s* is not a str or contains only whitespace."""
    if not isinstance(s, str):
        return True
    return len(s.strip()) == 0

def quality_report(df, name, text_col=None):
    """Summarise basic text-quality statistics for one split.

    Args:
        df: DataFrame holding the split.
        name: Split name, stored under the "split" key.
        text_col: Column holding the text. Defaults to the module-level
            ``TEXT_COL`` when omitted.

    Returns:
        A dict of plain ints/floats: row count, number of empty texts,
        number of duplicated texts, and mean / 95th-percentile word and
        character counts.
    """
    if text_col is None:
        text_col = TEXT_COL
    texts = df[text_col]
    n = len(df)
    empties = texts.map(is_empty).sum()
    dups = df.duplicated(subset=[text_col]).sum()
    # Non-string values (None/NaN) have length 0, matching is_empty. Going
    # through str(x) would count them as the words "None"/"nan".
    wc = texts.map(lambda x: len(x.split()) if isinstance(x, str) else 0)
    cc = texts.map(lambda x: len(x) if isinstance(x, str) else 0)
    return {
        "split": name,
        "rows": int(n),
        "empty_text": int(empties),
        "duplicates_by_text": int(dups),
        "wc_mean": float(wc.mean()),
        "wc_p95": float(wc.quantile(0.95)),
        "cc_mean": float(cc.mean()),
        "cc_p95": float(cc.quantile(0.95)),
    }

# One report row per split, shown as a table and saved next to the figures directory.
quality_rows = [quality_report(splits[s], s) for s in ("train", "val", "test")]
qr = pd.DataFrame(quality_rows)
display(qr)

quality_path = REPORT_DIR.parent / "eda_quality.json"
with open(quality_path, "w") as f:
    json.dump(qr.to_dict(orient="records"), f, indent=2)
print("Saved:", quality_path)


## (Optional) Peek at metrics JSON already produced by the training scripts
Loads `metrics/baseline_test.json` and `metrics/bert_test.json` if present to show test scores.

In [None]:
# Directory the metrics JSON files are read from.
METR_DIR = ROOT / "metrics"
# Summary rows, one per model whose metrics file could be read.
rows = []
def pull(json_path, name):
    """Read test accuracy and macro-F1 from a metrics JSON file.

    Args:
        json_path: Path to a JSON file expected to look like
            ``{"report": {"accuracy": ..., "macro avg": {"f1-score": ...}}}``.
        name: Model name to put in the returned row.

    Returns:
        ``{"model", "Test ACC", "Test F1-macro"}`` with ``None`` for any
        missing value, or ``None`` when the file is missing, unreadable, not
        valid JSON, or not shaped like nested objects.
    """
    try:
        # Context manager so the handle is closed even if parsing fails.
        with open(json_path, encoding="utf-8") as fh:
            metrics = json.load(fh)
    except (OSError, ValueError):
        # Missing/unreadable file, or malformed JSON: JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses.
        return None

    # Best effort: unexpected shapes give "no row" instead of an exception.
    if not isinstance(metrics, dict):
        return None
    report = metrics.get("report", {})
    if not isinstance(report, dict):
        return None
    macro = report.get("macro avg", {})
    if not isinstance(macro, dict):
        return None

    return {
        "model": name,
        "Test ACC": report.get("accuracy", None),
        "Test F1-macro": macro.get("f1-score", None),
    }

# Try both models; pull() gives None for any file that cannot be used.
b = pull(METR_DIR / "baseline_test.json", "TF-IDF + Logistic")
d = pull(METR_DIR / "bert_test.json", "DistilBERT")
rows.extend(r for r in (b, d) if r)

if not rows:
    print("No metrics JSON found yet — run training scripts to generate.")
else:
    display(pd.DataFrame(rows))
