# HW03 — Python Fundamentals
This notebook covers NumPy array operations, loading and inspecting data with pandas, summary statistics, groupby aggregation, saving outputs, and reusable utility functions.


In [2]:
from pathlib import Path
import sys
import time


# Third-party imports, grouped ahead of the path configuration so the only
# import that has to wait is the local one that depends on sys.path below.
import numpy as np
import pandas as pd
from IPython.display import display

# --- Project layout ----------------------------------------------------------
# Every path is derived from the current working directory.
# NOTE(review): this assumes the kernel was started from the notebook's own
# folder; launching Jupyter from elsewhere shifts all of these paths.
NB_DIR = Path.cwd().resolve()
HW3_ROOT = NB_DIR.parent
SRC_DIR = HW3_ROOT / "src"
DATA_DIR = HW3_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
CSV_PATH = DATA_DIR / "starter_data.csv"

# Make src/ the first import location so the local hw3_utils is the one found.
# Any existing copy of the entry is dropped first, which keeps this cell
# idempotent: re-running it no longer stacks duplicate sys.path entries.
# The slice assignment mutates the existing list object in place.
sys.path[:] = [entry for entry in sys.path if entry != str(SRC_DIR)]
sys.path.insert(0, str(SRC_DIR))

# Local helpers (must come after the sys.path change above).
import hw3_utils
from hw3_utils import (
    get_summary_stats,
    save_outputs,
    find_first_categorical,
    basic_plot_by_category_mean,
)

# Sanity checks: show which hw3_utils file was imported and whether the
# input CSV is where we expect it.
print("hw3_utils loaded from:", hw3_utils.__file__)
print("CSV exists:", CSV_PATH.exists())


hw3_utils loaded from: /Users/chen/bootcamp_tingchen_chen/homework/homework3/src/hw3_utils.py
CSV exists: True


In [3]:
# --- NumPy basics: elementwise arithmetic on a small float array -------------
arr = np.array([1, 2, 3, 4], dtype=float)
print("arr + 10:", arr + 10)
print("arr * 2:", arr * 2)
print("arr ** 2:", arr ** 2)

# --- Vectorized addition vs. an explicit Python loop -------------------------
# A seeded generator makes the benchmark inputs reproducible across
# Restart & Run All; only the measured timings vary between runs.
SEED = 42
N = 1_000_000
rng = np.random.default_rng(SEED)
a = rng.random(N)
b = rng.random(N)

# Vectorized: a single NumPy call adds the two arrays.
t0 = time.perf_counter()
c_vec = a + b
vec_time = time.perf_counter() - t0

# Loop: the same sums computed one element at a time in Python.
# The output buffer is allocated inside the timed region, as before.
t0 = time.perf_counter()
c_loop = np.empty_like(a)
for i in range(N):
    c_loop[i] = a[i] + b[i]
loop_time = time.perf_counter() - t0

# The speedup figure is only meaningful if both approaches agree.
np.testing.assert_allclose(c_loop, c_vec)

print(f"Vectorized time: {vec_time:.4f}s | Loop time: {loop_time:.4f}s | speedup ~{loop_time/vec_time:.1f}x")


arr + 10: [11. 12. 13. 14.]
arr * 2: [2. 4. 6. 8.]
arr ** 2: [ 1.  4.  9. 16.]
Vectorized time: 0.0017s | Loop time: 0.1084s | speedup ~63.0x


In [4]:

# Read the starter CSV into the frame that all later cells work from.
df = pd.read_csv(CSV_PATH)

# Preview the first five rows as a rich table, then print the column
# dtypes and non-null counts.
display(df.head(n=5))
df.info()


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes


In [5]:

# Overall summary table for the frame, computed by the hw3_utils helper.
numeric_summary = get_summary_stats(df)
display(numeric_summary)

# Ask the helper which column to group on; a falsy result means none was found.
category_col = find_first_categorical(df)
print("Detected category column:", category_col)

# by_cat stays None unless a grouping column is available, so later cells
# can always reference it.
by_cat = None
if not category_col:
    print("No categorical column found. If you know the column name, you can manually specify it: category_col = 'your_column_name'")
else:
    by_cat = get_summary_stats(df, by=category_col)
    display(by_cat.head())


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
value,10.0,17.6,7.381659,10.0,12.25,14.5,23.25,30.0


Detected category column: category


Unnamed: 0_level_0,value_count,value_mean,value_std,value_min,value_max
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,4,11.5,1.290994,10,13
B,3,15.666667,2.081666,14,18
C,3,27.666667,2.516611,25,30


In [6]:

# Hand both summary tables to the hw3_utils writer. by_cat is None when no
# categorical column was detected. File names and formats are decided inside
# save_outputs; only the target directory and base name are chosen here.
save_outputs(
    numeric_summary,
    by_cat,
    PROCESSED_DIR,
    base_name="summary",
)
print("Saved outputs to:", PROCESSED_DIR)


Saved outputs to: /Users/chen/bootcamp_tingchen_chen/homework/homework3/data/processed


In [7]:
# Names of every numeric column in the frame.
numeric_cols = df.select_dtypes(include="number").columns.to_list()
print("Numeric columns:", numeric_cols)

# The plot needs a grouping column and at least one numeric column to plot.
if not (category_col and numeric_cols):
    print("Unable to plot: requires at least one categorical column and one numerical column.")
else:
    # Plot the first numeric column and write the figure next to the summaries.
    value_col = numeric_cols[0]
    plot_path = PROCESSED_DIR / "basic_plot.png"
    basic_plot_by_category_mean(df, category_col, value_col, plot_path)
    print("Saved plot to:", plot_path)


Numeric columns: ['value']
Saved plot to: /Users/chen/bootcamp_tingchen_chen/homework/homework3/data/processed/basic_plot.png


In [8]:
# Recompute both summaries through the reusable helpers and save them under
# a separate base name, leaving the earlier "summary" outputs untouched.
numeric_summary2 = get_summary_stats(df)

# Only build the grouped table when a category column was detected.
if category_col:
    by_cat2 = get_summary_stats(df, by=category_col)
else:
    by_cat2 = None

save_outputs(
    numeric_summary2,
    by_cat2,
    PROCESSED_DIR,
    base_name="summary_from_utils",
)
print("Done! Check:", PROCESSED_DIR)


Done! Check: /Users/chen/bootcamp_tingchen_chen/homework/homework3/data/processed
