
# Project 1 

Dataset:`2012_SAT_Results_20251105.csv`  



In [7]:

import pandas as pd
import csv, re
from pathlib import Path

DATA_PATH = Path('2012_SAT_Results_20251105.csv')
assert DATA_PATH.exists(), f"Dataset not found at {DATA_PATH}"


##1) Load Dataset

In [8]:

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head(10)


(478, 6)


Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355,404,363
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383,423,366
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377,402,370
3,01M458,FORSYTH SATELLITE ACADEMY,7,414,401,359
4,01M509,MARTA VALLE HIGH SCHOOL,44,390,433,384
5,01M515,LOWER EAST SIDE PREPARATORY HIGH SCHOOL,112,332,557,316
6,01M539,"NEW EXPLORATIONS INTO SCIENCE, TECHNOLOGY AND ...",159,522,574,525
7,01M650,CASCADES HIGH SCHOOL,18,417,418,411
8,01M696,BARD HIGH SCHOOL EARLY COLLEGE,130,624,604,628
9,02M047,47 THE AMERICAN SIGN LANGUAGE AND ENGLISH SECO...,16,395,400,387


## 2) pandas: mean/median/mode

In [9]:

numeric_col = "SAT Math Avg. Score"
df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")
mean_p = df[numeric_col].mean()
median_p = df[numeric_col].median()
mode_p = df[numeric_col].mode().iloc[0]

print("PANDAS RESULTS")
print(f"Column: {numeric_col}")
print(f"Mean:   {mean_p:.2f}")
print(f"Median: {median_p}")
print(f"Mode:   {mode_p}")


PANDAS RESULTS
Column: SAT Math Avg. Score
Mean:   413.37
Median: 395.0
Mode:   385.0


Hard way 

In [10]:

values = []
with open(DATA_PATH, newline="") as f:
    for row in csv.DictReader(f):
        raw = str(row[numeric_col])
        cleaned = re.sub(r"[^0-9\.-]", "", raw)
        if cleaned in {"", "-", "."}:
            continue
        try:
            values.append(float(cleaned))
        except ValueError:
            pass

# Mean
mean_h = sum(values) / len(values) if values else float("nan")

# Median
vals_sorted = sorted(values)
n = len(vals_sorted)
if n == 0:
    median_h = float("nan")
elif n % 2 == 1:
    median_h = vals_sorted[n // 2]
else:
    median_h = (vals_sorted[n // 2 - 1] + vals_sorted[n // 2]) / 2

# Mode
freq = {}
for v in vals_sorted:
    freq[v] = freq.get(v, 0) + 1
mode_h = max(freq, key=freq.get) if freq else float("nan")

print("HARD-WAY RESULTS (stdlib only)")
print(f"Count:  {n}")
print(f"Mean:   {mean_h:.6f}")
print(f"Median: {median_h}")
print(f"Mode:   {mode_h}")


HARD-WAY RESULTS (stdlib only)
Count:  421
Mean:   413.368171
Median: 395.0
Mode:   385.0


## 4) Visualization 

In [11]:

bins = list(range(200, 801, 50))
labels = [f"{b:3d}-{b+49:3d}" for b in bins[:-1]] + ["750-799"]
counts = {lab: 0 for lab in labels}

def label_for(v):
    if v >= 750:
        return "750-799"
    for b in bins[:-1]:
        if b <= v <= b+49:
            return f"{b:3d}-{b+49:3d}"
    return None

for v in values:
    lab = label_for(v)
    if lab:
        counts[lab] += 1

max_count = max(counts.values()) if counts else 1
print(f"ASCII Histogram for: {numeric_col}\n")
for lab in labels:
    bar_len = int((counts[lab] / max_count) * 50) if max_count else 0
    print(f"{lab}: {'#'*bar_len} {counts[lab]}")


ASCII Histogram for: SAT Math Avg. Score

200-249:  0
250-299:  0
300-349: ###### 26
350-399: ################################################## 200
400-449: ############################ 113
450-499: ############ 48
500-549: ### 12
550-599: ### 13
600-649:  2
650-699: # 6
700-749:  1
750-799:  0
750-799:  0



## 5) Reflection
Pandas makes aggregation concise and reliable; the stdlib version makes each step explicit and reinforces how stats are computed. 
