In [11]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

print("CWD:", os.getcwd())


CWD: /Users/ashmitasahu/bootcamp_Ashmita_Sahu/notebooks


In [13]:
from pathlib import Path
import os

# Show current working directory so we know where the notebook is running
cwd = Path.cwd().resolve()
print("CWD:", cwd)

# Robustly infer repo root: if we're inside .../notebooks, go up one level
repo_root = cwd
if repo_root.name == "notebooks":
    repo_root = repo_root.parent
elif "notebooks" in repo_root.parts:
    # If the CWD is nested under notebooks (rare), climb until we exit it
    while repo_root.name != "notebooks" and repo_root.parent != repo_root:
        repo_root = repo_root.parent
    repo_root = repo_root.parent

# Allow env override for data dir (DATA_DIR can be relative to repo_root)
data_dir_env = os.getenv("DATA_DIR")
if data_dir_env:
    data_dir = Path(data_dir_env)
    if not data_dir.is_absolute():
        data_dir = (repo_root / data_dir).resolve()
else:
    data_dir = (repo_root / "data").resolve()

csv_path = data_dir / "starter_data.csv"
print("Repo root:", repo_root)
print("CSV path:", csv_path)
assert csv_path.exists(), f"Missing file at {csv_path}"


CWD: /Users/ashmitasahu/bootcamp_Ashmita_Sahu/notebooks
Repo root: /Users/ashmitasahu/bootcamp_Ashmita_Sahu
CSV path: /Users/ashmitasahu/bootcamp_Ashmita_Sahu/data/starter_data.csv


In [14]:
import pandas as pd

df = pd.read_csv(csv_path)
display(df.head())
df.info()


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 368.0+ bytes


In [15]:
import time

n = 1_000_00  # adjust size if needed
x = np.random.rand(n)

t0 = time.perf_counter()
y_vec = x**2 + 2*x + 1
t1 = time.perf_counter()

t2 = time.perf_counter()
y_loop = np.empty_like(x)
for i, xi in enumerate(x):
    y_loop[i] = xi*xi + 2*xi + 1
t3 = time.perf_counter()

print(f"Vectorized: {t1 - t0:.4f}s | Loop: {t3 - t2:.4f}s")
print("Equal?", np.allclose(y_vec, y_loop))


Vectorized: 0.0162s | Loop: 0.0530s
Equal? True


In [16]:
def describe_numeric(_df: pd.DataFrame) -> pd.DataFrame:
    try:
        return _df.describe(numeric_only=True)  # pandas >= 1.5
    except TypeError:
        return _df.select_dtypes(include="number").describe()  # older pandas

desc = describe_numeric(df)
display(desc)


Unnamed: 0,value
count,10.0
mean,17.6
std,7.381659
min,10.0
25%,12.25
50%,14.5
75%,23.25
max,30.0


In [17]:
cat_cols = list(df.select_dtypes(include=["object", "category"]).columns)
by_col = cat_cols[0] if cat_cols else None
print("Grouping by:", by_col)

if by_col:
    grp = df.groupby(by_col).agg(["count", "mean", "median", "std", "min", "max"])
    display(grp)
else:
    grp = None


Grouping by: category


TypeError: agg function failed [how->mean,dtype->object]

In [18]:
import pandas as pd

# 1) Start with truly numeric dtypes
num_cols = list(df.select_dtypes(include="number").columns)

# 2) (Optional) Try to coerce object columns that look numeric
if not num_cols:
    # or even if you want to be aggressive:
    obj_cols = df.select_dtypes(include="object").columns
    for c in obj_cols:
        coerced = pd.to_numeric(df[c], errors="coerce")
        # if most entries became numbers, keep the coerced version
        if coerced.notna().mean() > 0.9:
            df[c] = coerced
    num_cols = list(df.select_dtypes(include="number").columns)

print("Numeric columns for agg:", num_cols)


Numeric columns for agg: ['value']


In [19]:
# Choose a categorical column to group by
cat_cols = list(df.select_dtypes(include=["object", "category"]).columns)
by_col = cat_cols[0] if cat_cols else None
print("Grouping by:", by_col)

if by_col and num_cols:
    grp = df.groupby(by_col)[num_cols].agg(["count", "mean", "median", "std", "min", "max"])
    display(grp)
else:
    grp = None
    print("No valid groupby possible (missing category column or numeric columns).")


Grouping by: category


Unnamed: 0_level_0,value,value,value,value,value,value
Unnamed: 0_level_1,count,mean,median,std,min,max
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,4,11.5,11.5,1.290994,10,13
B,3,15.666667,15.0,2.081666,14,18
C,3,27.666667,28.0,2.516611,25,30


In [20]:
# Save summaries
desc.to_csv("data/processed/summary_numeric.csv")
if grp is not None:
    grp.to_csv("data/processed/summary_by_category.csv")


In [21]:
out_dir = repo_root / "data" / "processed"
out_dir.mkdir(parents=True, exist_ok=True)

desc.to_csv(out_dir / "summary_numeric.csv", index=True)

if grp is not None:
    grp.to_csv(out_dir / "summary_by_category.csv", index=True)

print("Saved to:", out_dir)



Saved to: /Users/ashmitasahu/bootcamp_Ashmita_Sahu/data/processed


In [22]:
import matplotlib.pyplot as plt

num_cols = list(df.select_dtypes(include="number").columns)
if num_cols:
    col = num_cols[0]
    plt.figure()
    df[col].plot(kind="hist", bins=30, title=f"Distribution of {col}")
    plt.xlabel(col)
    plt.tight_layout()
    plt.savefig(out_dir / "histogram.png", dpi=150)
    plt.close()
    print("Plot saved to:", out_dir / "histogram.png")
else:
    print("No numeric columns found for plotting.")


Plot saved to: /Users/ashmitasahu/bootcamp_Ashmita_Sahu/data/processed/histogram.png


In [24]:
stats = get_summary_stats(df, by=by_col)
(stats["numeric"]).to_csv(out_dir / "summary_numeric.csv", index=True)
if "grouped" in stats:
    stats["grouped"].to_csv(out_dir / "summary_by_category.csv", index=True)
print("Stats saved.")


TypeError: agg function failed [how->mean,dtype->object]