In [2]:
#!/usr/bin/env python3
"""
Stage 08 — Exploratory Data Analysis (EDA)
- Loads processed dataset (prefers Stage 06 cleaned file; else first CSV in processed dir)
- Profiles numeric & categorical columns
- Plots ≥3 distributions and ≥2 relationships
- Optional correlation heatmap
- Notes skew/outliers/seasonality/structure
- Writes Markdown report + saves figures

Outputs:
  notebooks/eda_report.md
  notebooks/figures/*.png
"""

import os
import io
import glob
import textwrap
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Silence FutureWarning messages for the remainder of the run.
warnings.filterwarnings("ignore", category=FutureWarning)

# -----------------------------
# 0) Paths & loading helpers
# -----------------------------
# Load variables from a .env file (when one is found) before reading settings.
load_dotenv()

# The processed-data directory comes from the DATA_DIR_PROCESSED environment
# variable; when it is unset, a machine-specific Windows path is used instead.
_DEFAULT_PROCESSED_DIR = r"C:\Users\sarda\Desktop\bootcamp_darshit_sarda\homework\data\processed"
DATA_DIR_PROCESSED = os.getenv("DATA_DIR_PROCESSED", _DEFAULT_PROCESSED_DIR)

# Report and figure output folders live under <current working dir>/notebooks.
REPO_NOTEBOOKS = os.path.join(os.getcwd(), "notebooks")
FIG_DIR = os.path.join(REPO_NOTEBOOKS, "figures")
for _out_dir in (REPO_NOTEBOOKS, FIG_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Dataset choice: the Stage 06 cleaned file when it exists, otherwise the
# first CSV (in sorted order) found in the processed directory.
PREFERRED = os.path.join(DATA_DIR_PROCESSED, "amazon_bestsellers_2025_cleaned.csv")
if not os.path.exists(PREFERRED):
    csvs = sorted(glob.glob(os.path.join(DATA_DIR_PROCESSED, "*.csv")))
    if not csvs:
        raise FileNotFoundError(
            f"No CSVs found in processed dir.\nChecked: {DATA_DIR_PROCESSED}\n"
            "Make sure Stage 05/06 produced a CSV in data/processed."
        )
    DATA_PATH = csvs[0]
else:
    DATA_PATH = PREFERRED

print(f"[EDA] Using dataset: {DATA_PATH}")

# -----------------------------
# 1) Load dataset
# -----------------------------
df = pd.read_csv(DATA_PATH)

# Try to parse date-like columns by name.
# DATE_CANDIDATES is retained for reference only; the code below does not read
# it. Detection uses substring matching on DATE_NAME_HINTS, which covers every
# name in this list.
DATE_CANDIDATES = ["date", "Date", "timestamp", "Timestamp", "datetime", "Datetime"]
DATE_NAME_HINTS = ("date", "time")
# A column is converted only when more than this share of its values parse.
MIN_PARSED_FRACTION = 0.8

for col in df.columns:
    if not any(key in str(col).lower() for key in DATE_NAME_HINTS):
        continue
    # Leave numeric columns alone: pd.to_datetime reads numbers as offsets from
    # the epoch (nanoseconds by default), so a count such as
    # "delivery_time_days" would "parse" successfully and be silently replaced
    # by 1970 timestamps, dropping it from the numeric analysis.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        try:
            # infer_datetime_format is intentionally not passed: it is
            # deprecated since pandas 2.0, where it has no effect.
            parsed = pd.to_datetime(df[col], errors="coerce")
        except Exception as exc:
            # Parsing is best-effort: report the failure and keep the column
            # unchanged instead of aborting the whole EDA run.
            print(f"[EDA] Skipping date parsing for {col!r}: {exc}")
            continue
    # Consider it a date column only if more than 80% of the values parsed.
    if parsed.notna().mean() > MIN_PARSED_FRACTION:
        df[col] = parsed

# Identify a primary date column if present: the first datetime-typed column.
# is_datetime64_any_dtype also handles tz-aware and other pandas extension
# dtypes, which np.issubdtype cannot interpret and rejects with a TypeError.
date_col = next(
    (c for c in df.columns if pd.api.types.is_datetime64_any_dtype(df[c])),
    None,
)

# -----------------------------
# 2) Profile numeric/categorical
# -----------------------------
# Columns with a numeric dtype drive the plots, correlations and skew checks.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Everything that is neither numeric nor a datetime is treated as categorical.
# "datetimetz" is listed explicitly because select_dtypes treats tz-aware
# datetimes as a separate selector from plain datetime64 columns; without it a
# tz-aware date column would be reported as categorical.
categorical_cols = df.select_dtypes(
    exclude=[np.number, "datetime", "datetimetz"]
).columns.tolist()

# Capture df.info() into a string (it writes to a buffer instead of returning).
buf = io.StringIO()
df.info(buf=buf)
info_text = buf.getvalue()

# Transposed summaries: one row per column.
describe_num = df[numeric_cols].describe().T if numeric_cols else pd.DataFrame()
# Summary across all column types; not referenced by the visible report code.
describe_all = df.describe(include="all").T
# Missing-value count per column, largest first.
missing_counts = df.isna().sum().sort_values(ascending=False)

# -----------------------------
# 3) Plots — distributions
# -----------------------------
def safe_filename(name: str) -> str:
    """Return *name* with every character that is not alphanumeric, '-' or '_'
    replaced by an underscore, so it can be used inside a file name."""
    allowed_punctuation = ("-", "_")
    cleaned = []
    for ch in name:
        keep = ch in allowed_punctuation or ch.isalnum()
        cleaned.append(ch if keep else "_")
    return "".join(cleaned)

# Distribution plots cover at most the first three numeric columns.
dist_cols = numeric_cols[:3]

# Paths of every figure written so far, in creation order; the report section
# on distributions lists them in this order.
saved_figs = []

# Histograms (≥3 if possible): one figure per selected column, NaNs removed.
for col in dist_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(df[col].dropna(), bins=30)
    ax.set_title(f"Distribution — {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    fig.tight_layout()
    out = os.path.join(FIG_DIR, f"hist_{safe_filename(col)}.png")
    fig.savefig(out, dpi=150)
    plt.close(fig)
    saved_figs.append(out)

# Horizontal boxplots for the same columns, written after all histograms.
for col in dist_cols:
    fig, ax = plt.subplots(figsize=(6, 2.8))
    ax.boxplot(df[col].dropna(), vert=False)
    ax.set_title(f"Boxplot — {col}")
    ax.set_xlabel(col)
    fig.tight_layout()
    out = os.path.join(FIG_DIR, f"box_{safe_filename(col)}.png")
    fig.savefig(out, dpi=150)
    plt.close(fig)
    saved_figs.append(out)

# -----------------------------
# 4) Bivariate visuals (≥2)
# -----------------------------
# Scatter of the first two numeric columns; scatter_path stays None when fewer
# than two numeric columns exist.
scatter_path = None
if len(numeric_cols) >= 2:
    xcol, ycol = numeric_cols[:2]
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(df[xcol], df[ycol], alpha=0.7)
    ax.set_title(f"Scatter — {xcol} vs {ycol}")
    ax.set_xlabel(xcol)
    ax.set_ylabel(ycol)
    fig.tight_layout()
    scatter_name = f"scatter_{safe_filename(xcol)}_vs_{safe_filename(ycol)}.png"
    scatter_path = os.path.join(FIG_DIR, scatter_name)
    fig.savefig(scatter_path, dpi=150)
    plt.close(fig)
    saved_figs.append(scatter_path)

# Line plot of the first numeric column over the detected date column.
# ts_path stays None when there is no date column, no numeric column, or no
# row where both values are present.
ts_path = None
if date_col and numeric_cols:
    y_ts = numeric_cols[0]
    # Keep complete rows only and order them by date so the line is drawn
    # left to right.
    df_ts = df[[date_col, y_ts]].dropna().sort_values(by=date_col)
    if not df_ts.empty:
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.plot(df_ts[date_col], df_ts[y_ts])
        ax.set_title(f"Time Series — {y_ts} over {date_col}")
        ax.set_xlabel(date_col)
        ax.set_ylabel(y_ts)
        # The new figure is pyplot's current one, so this rotates its x labels.
        plt.xticks(rotation=20)
        fig.tight_layout()
        ts_path = os.path.join(FIG_DIR, f"timeseries_{safe_filename(y_ts)}.png")
        fig.savefig(ts_path, dpi=150)
        plt.close(fig)
        saved_figs.append(ts_path)

# -----------------------------
# 5) Correlation heatmap (optional)
# -----------------------------
# Pairwise correlations of the numeric columns, drawn as an image grid.
# With fewer than two numeric columns no figure is produced and corr is an
# empty frame, which later steps check before using it.
heatmap_path = None
if len(numeric_cols) < 2:
    corr = pd.DataFrame()
else:
    corr = df[numeric_cols].corr()
    n_numeric = len(numeric_cols)
    # Square canvas that grows with the number of columns.
    side = 0.9 * n_numeric + 3
    fig, ax = plt.subplots(figsize=(side, side))
    image = ax.imshow(corr, interpolation="nearest")
    ax.set_title("Correlation Heatmap")
    fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    tick_positions = range(n_numeric)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(numeric_cols, rotation=45, ha="right")
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(numeric_cols)
    fig.tight_layout()
    heatmap_path = os.path.join(FIG_DIR, "correlation_heatmap.png")
    fig.savefig(heatmap_path, dpi=150)
    plt.close(fig)
    saved_figs.append(heatmap_path)

# -----------------------------
# 6) Automated insights (“so what?”)
# -----------------------------
# Each entry is one Markdown fragment: a bold heading, optionally followed by
# newline-separated "- " bullet lines.
insights = []

# 6.1 Most skewed numeric features (by absolute skew, top three)
if numeric_cols:
    skews = (
        df[numeric_cols]
        .skew(numeric_only=True)
        .abs()
        # skew() yields NaN for columns it cannot estimate (e.g. every value
        # missing); drop those so the report never shows "|skew| = nan".
        .dropna()
        .sort_values(ascending=False)
    )
    top_skew = skews.head(3)
    if not top_skew.empty:
        skew_lines = [f"- **{idx}** | |skew| = {val:.2f}" for idx, val in top_skew.items()]
        insights.append("**Skewness:**\n" + "\n".join(skew_lines))

# 6.2 Highest correlation pairs
if not corr.empty and corr.shape[0] >= 2:
    # Keep only the strictly upper triangle so each pair is counted once and
    # the diagonal (a column against itself) is excluded.
    corr_upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    corr_pairs = corr_upper.unstack().dropna().abs().sort_values(ascending=False)
    top_corrs = corr_pairs.head(3)
    if not top_corrs.empty:
        lines = [f"- **{a} ~ {b}**: |r| = {v:.2f}" for (a, b), v in top_corrs.items()]
        insights.append("**Strongest correlations (abs):**\n" + "\n".join(lines))

# 6.3 Missingness: up to five columns with the most missing values
if missing_counts.sum() > 0:
    miss_top = missing_counts[missing_counts > 0].head(5)
    lines = [f"- **{idx}**: {val} missing" for idx, val in miss_top.items()]
    insights.append("**Missing values (top):**\n" + "\n".join(lines))

# 6.4 Time series note
if date_col:
    insights.append(f"**Temporal structure:** Found a date column (**{date_col}**). Consider seasonality/trends.")

# Fallback insight if nothing above produced a finding
if not insights:
    insights.append("**Data appears relatively clean with limited skew/missingness.**")

# Top 3 insights selection: the first three in the order they were added
top3 = insights[:3]

# -----------------------------
# 7) Implications for next step
# -----------------------------
# Suggested follow-up actions; every entry is already written as a Markdown
# bullet. The calendar-feature hint is included only when a date column exists.
_calendar_hint = "- Engineer calendar/time features (month, quarter, day-of-week, lag/rolling stats)."
implications = [
    "- Apply transformations to high-skew features (e.g., log/Box-Cox) before modeling.",
    "- Address multicollinearity (remove or combine highly correlated features) to stabilize models.",
    "- Impute or drop columns with substantial missingness; document rationale.",
] + ([_calendar_hint] if date_col else [])

# -----------------------------
# 8) Assemble Markdown report
# -----------------------------
# Destination of the report, and the local generation time shown in its header.
report_path = os.path.join(REPO_NOTEBOOKS, "eda_report.md")
ts = f"{datetime.now():%Y-%m-%d %H:%M}"

def df_to_markdown_table(d: pd.DataFrame, max_rows: int = 30, max_cols: int = 12) -> str:
    """Render a DataFrame as Markdown text, truncated for readability.

    Args:
        d: Frame to render; ``None`` or an empty frame is allowed.
        max_rows: Maximum number of leading rows to include.
        max_cols: Maximum number of leading columns to include.

    Returns:
        ``"_(empty)_"`` for ``None``/empty input. Otherwise a Markdown table of
        the truncated frame, or a fenced plain-text rendering of it when the
        Markdown table cannot be produced because an import is missing.
    """
    if d is None or d.empty:
        return "_(empty)_"
    # Positional slicing truncates both axes at once and never modifies ``d``.
    clipped = d.iloc[:max_rows, :max_cols]
    try:
        return clipped.to_markdown()
    except ImportError:
        # DataFrame.to_markdown relies on the optional ``tabulate`` package.
        # Fall back to a fixed-width text block so a missing optional
        # dependency does not abort the report after the figures are written.
        return "```text\n" + clipped.to_string() + "\n```"

def _figure_md(path: str) -> str:
    """Return a Markdown image tag for a figure stored in the figures folder."""
    name = os.path.basename(path)
    return f"![{name}](figures/{name})"


# Report fragments; they are joined with blank lines when the file is written.
md = []

md.append("# Exploratory Data Analysis (Stage 08)\n")
md.append(f"_Generated: {ts}_\n")
md.append(f"**Dataset:** `{DATA_PATH}`  \n")
md.append(f"**Rows × Cols:** {df.shape[0]} × {df.shape[1]}\n")

md.append("## 1) Statistical Summaries")
md.append("### `df.info()`")
md.append("```text\n" + info_text.strip() + "\n```")

md.append("### `df.describe()` (numeric)")
md.append(df_to_markdown_table(describe_num))

md.append("### Missing value counts")
md.append(df_to_markdown_table(missing_counts.to_frame(name="missing")))

md.append("### Column types")
md.append(f"- Numeric columns ({len(numeric_cols)}): {', '.join(numeric_cols) if numeric_cols else '—'}")
md.append(f"- Categorical columns ({len(categorical_cols)}): {', '.join(categorical_cols) if categorical_cols else '—'}")
md.append(f"- Date column: {date_col if date_col else '—'}")

md.append("## 2) Distributions (Histograms & Boxplots)")
# Only histogram and boxplot files belong here; other figures have their own
# sections below.
for p in saved_figs:
    if os.path.basename(p).startswith(("hist_", "box_")):
        md.append(_figure_md(p))

md.append("## 3) Bivariate Visuals")
if scatter_path:
    md.append(f"**Scatter plot:**\n\n{_figure_md(scatter_path)}")
else:
    md.append("_Not enough numeric columns for scatter plot._")

if ts_path:
    md.append(f"**Time series plot:**\n\n{_figure_md(ts_path)}")
else:
    md.append("_No date column or suitable numeric series for time plot._")

if heatmap_path:
    md.append("## 4) Correlation Heatmap (Optional)")
    md.append(_figure_md(heatmap_path))

md.append("## 5) Findings: Skew, Outliers, Seasonality, Structure")
# An insight may span several lines (heading plus "- " detail lines). Indent
# the continuation lines so the details render as a nested list under their
# heading instead of as sibling bullets.
md.extend("- " + line.replace("\n", "\n  ") for line in insights)

md.append("## 6) Implications for Next Step")
# Entries that already start with a bullet marker are emitted unchanged;
# prefixing them again would render as "- - ...".
md.extend(line if line.startswith("- ") else f"- {line}" for line in implications)

md.append("## Top 3 Insights")
for i, line in enumerate(top3, 1):
    # Three-space indent keeps detail bullets inside the numbered item.
    md.append(f"{i}. " + line.replace("\n", "\n   "))

md.append("## Assumptions & Risks")
md.append(
    textwrap.dedent(
        """
        - Missingness is assumed to be at random; if not, imputation may bias results.
        - High correlations can inflate variance of coefficients; monitor VIF or use regularization.
        - If outliers are business-meaningful events, trimming/winsorizing could harm signal.
        - Correlation ≠ causation; domain validation is required before feature removal.
        """
    ).strip()
)

# UTF-8 is required: the report contains non-ASCII characters (×, ≠, —).
with open(report_path, "w", encoding="utf-8") as f:
    f.write("\n\n".join(md))

print("\n✅ EDA complete.")
print(f"   Report: {report_path}")
print(f"   Figures: {FIG_DIR}")


[EDA] Using dataset: C:\Users\sarda\Desktop\bootcamp_darshit_sarda\homework\data\processed\amazon_bestsellers_2025_cleaned.csv

✅ EDA complete.
   Report: c:\Users\sarda\Desktop\bootcamp_darshit_sarda\homework\notebooks\eda_report.md
   Figures: c:\Users\sarda\Desktop\bootcamp_darshit_sarda\homework\notebooks\figures
