# Data Assignment Template

_This notebook walks through the whole workflow: NumPy ops, data loading, summary stats, saving outputs, plotting, and reusable utilities._

## 0. Setup
- Creates folders `data/processed/` and `src/` (if missing)
- Imports core libraries
- Sets a flexible CSV path resolver: prefers `data/starter_data.csv`, falls back to `/mnt/data/starter_data.csv` (for this environment)

In [None]:
# Standard imports
import os, sys, time, io
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make sure the output folder and the local code package folder exist.
# NOTE(review): the first entry is an absolute, user-specific directory, while the
# section text above describes `data/processed/` — confirm which one is intended.
for folder in ("/Users/hust/bootcamp_zheyu_dong/homework/homework03", "src"):
    # parents/exist_ok make this safe to re-run on an already-prepared tree.
    Path(folder).mkdir(parents=True, exist_ok=True)

# Helper: robust CSV path resolver
def resolve_csv_path(candidates=None):
    """Return the first existing location of ``starter_data.csv`` as a string.

    Parameters
    ----------
    candidates : iterable of str or Path, optional
        Locations to probe, in priority order. When omitted, the default list
        below is used.

    Returns
    -------
    str
        The first candidate that exists as a regular file.

    Raises
    ------
    FileNotFoundError
        If none of the candidates exists; the message lists every path tried.
    """
    if candidates is None:
        candidates = [
            # Original locations, kept first so existing setups resolve exactly as before.
            Path("/Users/hust/bootcamp_zheyu_dong/homework/homework03/starter_data.csv"),
            Path("/mnt//Users/hust/bootcamp_zheyu_dong/homework/homework03/starter_data.csv"),
            # Portable locations named in the setup notes and in the old error message,
            # which the original list never actually checked.
            Path("data/starter_data.csv"),
            Path("/mnt/data/starter_data.csv"),
        ]
    tried = [Path(p) for p in candidates]
    for p in tried:
        # is_file() also rejects a directory that happens to carry the CSV's name.
        if p.is_file():
            return str(p)
    searched = ", ".join(str(p) for p in tried) or "<no candidates given>"
    raise FileNotFoundError(f"starter_data.csv not found; looked in: {searched}")

print("Setup complete.")

## 1. NumPy Operations
- Create an array and perform elementwise operations
- Compare **loop vs vectorized** performance

In [None]:
from time import perf_counter
import numpy as np
import pandas as pd

# Benchmark input: 100,000 consecutive floats.
arr = np.arange(100_000, dtype=float)

# Vectorized: a single NumPy expression evaluated over the whole array.
start = perf_counter()
vec = arr * 2 + 1
t_vec = perf_counter() - start

# Pure Python loop: the same arithmetic, one element at a time.
start = perf_counter()
out = np.empty_like(arr)
for i, x in enumerate(arr):
    out[i] = x * 2 + 1
t_loop = perf_counter() - start

# One row per method so the two timings can be read side by side.
timing_df = pd.DataFrame(
    [
        {"method": "vectorized", "seconds": t_vec},
        {"method": "loop", "seconds": t_loop},
    ]
)
print(timing_df)
timing_df

## 2. Load Dataset & Inspect
- Load `starter_data.csv`
- Inspect with `.info()` and `.head()`

In [None]:
# Locate the starter CSV and read it into a DataFrame.
csv_path = resolve_csv_path()
df = pd.read_csv(csv_path)

print(f"CSV loaded from: {csv_path}\n")

# Capture the .info() report in a text buffer, then echo it to stdout.
with io.StringIO() as info_buffer:
    df.info(buf=info_buffer)
    print(info_buffer.getvalue())

# Leave the first rows as the cell's rich output.
df.head()

## 3. Summary Statistics & GroupBy Aggregation
- `.describe()` for numeric columns
- `groupby()` by a categorical column if available; otherwise derive one from the first numeric column (low/mid/high)

In [None]:
# Numeric describe
numeric_df = df.select_dtypes(include="number")
# describe() raises ValueError on a frame with no columns, so a dataset without
# numeric columns gets an empty summary instead of stopping the notebook here
# (later cells already branch on `numeric_df.empty`).
num_summary = numeric_df.describe() if numeric_df.shape[1] > 0 else pd.DataFrame()
num_summary

In [None]:
# Smart groupby: prefer a column named 'category' (case-insensitive), then the first
# object/category-typed column, else derive low/mid/high from the first numeric column.
# str(c) keeps the name check working when column labels are not strings.
cat_col = next((c for c in df.columns if str(c).lower() == "category"), None)

if cat_col is None:
    obj_cols = [c for c in df.columns
                if df[c].dtype == "object" or str(df[c].dtype).startswith("category")]
    if obj_cols:
        cat_col = obj_cols[0]

if cat_col is None and not numeric_df.empty:
    first_num = numeric_df.columns[0]
    # Note: this adds a helper column `_derived_cat` to `df` itself.
    try:
        df["_derived_cat"] = pd.qcut(df[first_num], q=3, labels=["low","mid","high"])
    except ValueError:
        # qcut rejects non-unique quantile edges (heavily tied data); use equal-width bins instead.
        df["_derived_cat"] = pd.cut(df[first_num], bins=3, labels=["low","mid","high"])
    cat_col = "_derived_cat"

if cat_col is None:
    grouped = numeric_df.agg(["count","mean"])
else:
    # Aggregate numeric columns only: "mean" over a leftover text column raises
    # TypeError on pandas 2.x. observed=False keeps empty categories in the result.
    value_cols = [c for c in numeric_df.columns if c != cat_col]
    if value_cols:
        grouped = df.groupby(cat_col, observed=False)[value_cols].agg(["count","mean"])
    else:
        # Nothing numeric to average: report group sizes only.
        grouped = df.groupby(cat_col, observed=False).size().to_frame("count")

print("Grouping column:", cat_col)
grouped

## 4. Save Outputs + Basic Plot
- Save summary to `data/processed/summary.csv` & `.json`
- Save grouped results to `data/processed/grouped_summary.csv`
- **Bonus**: histogram of the first numeric column (saved to `data/processed/basic_plot.png`)

In [None]:
# Persist the summary tables, then draw a histogram of the first numeric column.
from pathlib import Path

# NOTE(review): these are absolute, user-specific targets, while the section text
# above describes `data/processed/` — confirm which location is intended.
summary_csv = Path("/Users/hust/bootcamp_zheyu_dong/homework/homework03/summary.csv")
summary_json = Path("/Users/hust/bootcamp_zheyu_dong/homework/homework03/summary.json")
grouped_csv = Path("/Users/hust/bootcamp_zheyu_dong/homework/homework03/grouped_summary.csv")
plot_path = Path("/Users/hust/bootcamp_zheyu_dong/homework/homework03/basic_plot.png")

# Tables: the index is written because it carries the statistic / group labels.
num_summary.to_csv(summary_csv, index=True)
num_summary.to_json(summary_json)
grouped.to_csv(grouped_csv, index=True)

# Basic plot (matplotlib, single chart, no styles/colors set)
if numeric_df.empty:
    print("No numeric columns found. Plot skipped.")
else:
    first_col = numeric_df.columns[0]
    fig, ax = plt.subplots()
    numeric_df.iloc[:, 0].plot(kind="hist", ax=ax, title=f"Histogram of {first_col}")
    ax.set_xlabel(first_col)
    fig.tight_layout()
    fig.savefig(plot_path)
    # Close the figure so it is written to disk without also rendering inline.
    plt.close(fig)
    print(f"Plot saved to: {plot_path}")

print(f"Saved: {summary_csv}\nSaved: {summary_json}\nSaved: {grouped_csv}")

## 5. Reusable Functions
- Write a utility function `get_summary_stats(df)`
- **Bonus**: move to `src/utils.py` and import back

In [None]:
# In-notebook utility
def get_summary_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Return describe() for numeric columns only.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; non-numeric columns are ignored.

    Returns
    -------
    pd.DataFrame
        ``describe()`` of the numeric columns, or an empty DataFrame when
        ``df`` has no numeric columns.
    """
    numeric_only = df.select_dtypes(include="number")
    # describe() raises ValueError on a frame with zero columns.
    if numeric_only.shape[1] == 0:
        return pd.DataFrame()
    return numeric_only.describe()

get_summary_stats(df)

In [None]:
# Bonus: write utils module and import
import importlib

# Source text of src/utils.py. It is kept as a raw string and written to disk below
# so the helpers can be imported like a regular module.
utils_code = r'''
"""Reusable summary helpers exported from the data assignment notebook."""
import pandas as pd

def get_summary_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Return describe() for numeric columns only (empty frame if there are none)."""
    numeric_df = df.select_dtypes(include="number")
    # describe() raises ValueError on a frame with zero columns.
    if numeric_df.shape[1] == 0:
        return pd.DataFrame()
    return numeric_df.describe()

def infer_groupby_and_agg(df: pd.DataFrame) -> pd.DataFrame:
    """Pick a grouping column and return count/mean of the numeric columns per group.

    Preference order: a column named "category" (case-insensitive), then the first
    object/category-typed column, then low/mid/high bins derived from the first
    numeric column. The caller's frame is never modified. With no usable grouping
    column, count/mean of the numeric columns is returned ungrouped.
    """
    cat_col = next((c for c in df.columns if str(c).lower() == "category"), None)
    if cat_col is None:
        obj_cols = [c for c in df.columns if df[c].dtype == "object" or str(df[c].dtype).startswith("category")]
        if obj_cols:
            cat_col = obj_cols[0]
    numeric_df = df.select_dtypes(include="number")
    if cat_col is None and not numeric_df.empty:
        first_num = numeric_df.columns[0]
        df = df.copy()  # keep the derived helper column off the caller's frame
        try:
            df["_derived_cat"] = pd.qcut(df[first_num], q=3, labels=["low","mid","high"])
        except ValueError:
            # qcut rejects non-unique quantile edges; fall back to equal-width bins.
            df["_derived_cat"] = pd.cut(df[first_num], bins=3, labels=["low","mid","high"])
        cat_col = "_derived_cat"
    if cat_col is None:
        return numeric_df.agg(["count","mean"])
    # Aggregate numeric columns only: "mean" over a leftover text column raises
    # TypeError on pandas 2.x. observed=False keeps empty categories in the result.
    value_cols = [c for c in numeric_df.columns if c != cat_col]
    if not value_cols:
        return df.groupby(cat_col, observed=False).size().to_frame("count")
    return df.groupby(cat_col, observed=False)[value_cols].agg(["count","mean"])
'''
with open("src/utils.py", "w", encoding="utf-8") as f:
    f.write(utils_code)

# The module file was created while the interpreter is running; refresh the import
# system's finder caches so the new file is guaranteed to be noticed.
importlib.invalidate_caches()

# Make sure we can import from project root
if "." not in sys.path:
    sys.path.append(".")

from src.utils import get_summary_stats as util_get_summary_stats, infer_groupby_and_agg

util_get_summary_stats(df), infer_groupby_and_agg(df)

## 6. Wrap-up
- Quick recap of what was saved and where

In [None]:
from pathlib import Path

# Directory whose contents are listed as the recap of what was produced.
output_dir = Path("/Users/hust/bootcamp_zheyu_dong/homework/homework03")

print("All done ✅")
print("Outputs are in /Users/hust/bootcamp_zheyu_dong/homework/homework03/:")
for entry in output_dir.glob("*"):
    print(" -", entry)