In [1]:
import pandas as pd
from pathlib import Path

csv_path = Path("tech") / "merged3_with_puma_counties.csv"
df = pd.read_csv(csv_path)


In [2]:
df.shape               # (rows, cols)
df.columns.tolist()    # column names
df.dtypes              # data types
df.head(5)             # first rows
df.info()              # nulls + dtypes
df.sample(5, random_state=0)  # quick spot-check


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62031 entries, 0 to 62030
Columns: 303 entries, RT to Name_y
dtypes: float64(96), int64(200), object(7)
memory usage: 143.4+ MB


Unnamed: 0,RT,SERIALNO,DIVISION,SPORDER,PUMA,REGION,STATE,ADJINC,PWGTP,AGEP,...,has_laptop,has_tablet,has_tel,PUMA_clean,Name_x,income_adj,income_category,ACCESSINET,access_inet,Name_y
2084,P,2023GQ0144726,5,1,507,3,24,1019518,10,87,...,,,,507,"Baltimore County--Catonsville, Woodlawn & Arbu...",70550.6456,Middle,,,"Baltimore County--Catonsville, Woodlawn & Arbu..."
48784,P,2023HU1136915,5,1,400,3,24,1019518,89,65,...,1.0,1.0,1.0,400,Carroll County,101951.8,Upper-Middle,1.0,1.0,Carroll County
19479,P,2023HU0416340,5,2,1001,3,24,1019518,66,89,...,1.0,1.0,1.0,1001,"Montgomery County (North & West)--Olney, Damas...",32624.576,Middle,1.0,1.0,"Montgomery County (North & West)--Olney, Damas..."
61352,P,2023HU1444401,5,1,302,3,24,1019518,84,74,...,1.0,2.0,1.0,302,Frederick County (Central)--Greater Frederick ...,95834.692,Upper-Middle,1.0,1.0,Frederick County (Central)--Greater Frederick ...
39581,P,2023HU0910870,5,4,1008,3,24,1019518,67,16,...,1.0,1.0,1.0,1008,,1835.1324,Low,1.0,1.0,


In [3]:
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace(" ", "_"))


In [4]:
# columns
subset = df[["county", "puma"]].copy()          # label selection
# rows by position vs label
row_pos = df.iloc[0]                             # first row (positional)
some_rows = df.loc[df["county"] == "Prince George's County"]  # label filter


KeyError: "['county'] not in index"

In [None]:
# numeric conversion
for c in ["annual_income", "hs_graduation_rate"]:
    if c in df: df[c] = pd.to_numeric(df[c], errors="coerce")

# derived columns
if "annual_income" in df:
    df["income_k"] = (df["annual_income"] / 1000).round(1)

# boolean flags
for c in ["device_ownership", "internet_access"]:
    if c in df:
        df[c] = df[c].astype(str).str.lower().str.strip()
df["device_yes"]   = (df.get("device_ownership") == "yes").astype("Int8") if "device_ownership" in df else pd.Series(dtype="Int8")
df["internet_yes"] = (df.get("internet_access") == "yes").astype("Int8") if "internet_access" in df else pd.Series(dtype="Int8")


In [None]:
hi_income = df.loc[df.get("annual_income", 0) >= 100_000] if "annual_income" in df else df.iloc[0:0]
top_grad  = df.sort_values("hs_graduation_rate", ascending=False)[:10] if "hs_graduation_rate" in df else df


In [None]:
df.isna().sum().sort_values(ascending=False).head(15)
df_clean = df.dropna(subset=["hs_graduation_rate"]) if "hs_graduation_rate" in df else df
if "annual_income" in df:
    df["annual_income"] = df["annual_income"].fillna(df["annual_income"].median())


In [None]:
import pandas as pd
from pathlib import Path

csv_path = Path("tech") / "merged3_with_puma_counties.csv"
df = pd.read_csv(csv_path)

df.columns = (
    df.columns.str.strip()
              .str.lower()
              .str.replace(" ", "_")
)

print("\n=== SHAPE ===")
print(df.shape)

print("\n=== COLUMNS ===")
print(df.columns.tolist())

print("\n=== HEAD(5) ===")
print(df.head(5).to_string(index=False))

print("\n=== INFO ===")
df.info()

print("\n=== NULL COUNTS (TOP 20) ===")
print(df.isna().sum().sort_values(ascending=False).head(20))

expected = ["county","puma","annual_income","device_ownership","internet_access","hs_graduation_rate"]
present  = [c for c in expected if c in df.columns]
missing  = [c for c in expected if c not in df.columns]
print("\n=== PRESENT EXPECTED COLS ===")
print(present)
print("\n=== MISSING EXPECTED COLS ===")
print(missing)

for c in ["annual_income","hs_graduation_rate"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

if "county" in df.columns:
    print("\n=== COUNTY COUNTS (TOP 10) ===")
    print(df["county"].value_counts().head(10))

if "hs_graduation_rate" in df.columns:
    print("\n=== HS_GRADUATION_RATE: DESCRIBE ===")
    print(df["hs_graduation_rate"].describe())

if "annual_income" in df.columns:
    print("\n=== ANNUAL_INCOME: DESCRIBE ===")
    print(df["annual_income"].describe())

if {"county","hs_graduation_rate"} <= set(df.columns):
    by_county = (df.groupby("county", as_index=False)
                   .agg(n=("hs_graduation_rate","size"),
                        grad_mean=("hs_graduation_rate","mean"))
                   .sort_values("grad_mean", ascending=False))
    print("\n=== BY COUNTY: N AND MEAN GRAD RATE (TOP 10) ===")
    print(by_county.head(10).to_string(index=False))

if {"annual_income"} <= set(df.columns):
    bins   = [-1, 25_000, 50_000, 75_000, 100_000, 150_000, 10**12]
    labels = ["≤25k","25–50k","50–75k","75–100k","100–150k","≥150k"]
    df["income_bin"] = pd.cut(df["annual_income"], bins=bins, labels=labels)
    print("\n=== INCOME BIN COUNTS ===")
    print(df["income_bin"].value_counts(dropna=False))

if {"income_bin","device_ownership"} <= set(df.columns):
    device_ct = pd.crosstab(df["income_bin"], df["device_ownership"], normalize="index").round(3)
    print("\n=== DEVICE BY INCOME_BIN (ROW PROPORTIONS) ===")
    print(device_ct)

if {"income_bin","internet_access"} <= set(df.columns):
    net_ct = pd.crosstab(df["income_bin"], df["internet_access"], normalize="index").round(3)
    print("\n=== INTERNET BY INCOME_BIN (ROW PROPORTIONS) ===")
    print(net_ct)

if "annual_income" in df.columns:
    hi = df.loc[df["annual_income"] >= 100_000]
    print("\n=== HIGH INCOME ROWS: COUNT ===")
    print(len(hi))

if "county" in df.columns:
    pg = df.loc[df["county"] == "Prince George's County"]
    print("\n=== PRINCE GEORGE'S COUNTY ROWS: COUNT ===")
    print(len(pg))

print("\n=== DONE PRINTING ===")
