In [1]:
import numpy as np
# Installed NumPy release string. Only a cell's final expression is echoed,
# so this value (and the first attribute tuple below) is computed and dropped.
np.__version__

# Arrays built from a flat list and from a nested list.
a = np.array([1, 2, 3])  # one axis
b = np.array(
    [
        [1, 2],
        [3, 4],
    ]
)  # two axes

# Number of axes, extent along each axis, and element type.
(np.ndim(a), np.shape(a), a.dtype)
(np.ndim(b), np.shape(b), b.dtype)


(2, (2, 2), dtype('int64'))

In [2]:
# Common array constructors, spelled with keyword arguments.
z = np.zeros(shape=(2, 3))                 # 2x3 block of 0.0
o = np.ones(shape=(3,))                    # length-3 vector of 1.0
e = np.empty(shape=(2, 2))                 # 2x2, contents are whatever was in memory
r = np.arange(start=0, stop=10, step=2)    # 0, 2, 4, 6, 8 (stop is exclusive)
l = np.linspace(start=0, stop=1, num=5)    # 5 evenly spaced points, both ends included


In [3]:
# 3x3 grid used to demonstrate element access and slicing.
c = np.array(
    [
        [10, 11, 12],
        [13, 14, 15],
        [16, 17, 18],
    ]
)
c[0][1]          # 11 (row 0, column 1)
c[..., 0]        # first column
c[1:3, 1:3]      # subarray: rows 1-2, columns 1-2
np.flip(c)       # reverse along every axis; this is the value the cell echoes


array([[18, 17, 16],
       [15, 14, 13],
       [12, 11, 10]])

In [4]:
# Two float vectors; every operation below is applied element by element.
x = np.array([1, 2, 3], dtype=np.float64)
y = np.array([4, 5, 6], dtype=np.float64)
np.add(x, y)         # elementwise sum
np.multiply(x, y)    # elementwise product
np.sqrt(x)
np.exp(y)            # final expression, so this is what the cell echoes


array([ 54.59815003, 148.4131591 , 403.42879349])

In [5]:
# Broadcasting: the length-3 vector is stretched to match each row of the 3x3 matrix.
M = np.ones(shape=(3, 3))
v = np.array((1, 2, 3))
np.add(M, v)    # v is broadcast across rows


array([[2., 3., 4.],
       [2., 3., 4.],
       [2., 3., 4.]])

In [6]:
# Reductions over a 3×4 grid holding 0..11.
d = np.arange(12).reshape((3, 4))   # 3×4
np.sum(d)                           # all elements
np.sum(d, axis=0)                   # by column (collapses the row axis)
np.mean(d, axis=1)                  # by row (collapses the column axis)
d.transpose()                       # transpose; final expression, so this is echoed


array([[ 0,  4,  8],
       [ 1,  5,  9],
       [ 2,  6, 10],
       [ 3,  7, 11]])

In [7]:
# Round-trip the array through NumPy's binary format, then also export it as text.
np.save(file="arr.npy", arr=d)                             # binary
loaded = np.load(file="arr.npy")
np.savetxt(fname="arr.csv", X=d, fmt="%d", delimiter=",")  # text


In [8]:
# build, slice, compute
A = np.arange(1, 13).reshape((3, 4))       # values 1..12 laid out as 3 rows x 4 columns
row2 = A[1, :]                             # second row
col3 = A[..., 2]                           # third column
sub = A[0:2, 0:2]                          # top-left 2x2 corner
calc = np.divide(A * 2 + 5, A)             # elementwise (2A + 5) / A, true division
res = np.sum(A, axis=0)                    # column totals


In [9]:
import pandas as pd
import numpy as np
from pathlib import Path

# Widen pandas' console rendering so wide frames are not truncated as early.
pd.options.display.max_columns = 100
pd.options.display.width = 120


In [10]:
# Load the merged CSV from the relative "tech" directory.
csv_path = Path("tech", "merged3_with_puma_counties.csv")
df = pd.read_csv(csv_path)

# normalize headers: trim surrounding whitespace, lower-case, spaces -> underscores
trimmed_headers = df.columns.str.strip()
df.columns = trimmed_headers.str.lower().str.replace(" ", "_")

df.head()   # not the final expression, so the preview is not rendered
df.info()   # prints the frame summary to stdout


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62031 entries, 0 to 62030
Columns: 303 entries, rt to name_y
dtypes: float64(96), int64(200), object(7)
memory usage: 143.4+ MB


In [12]:
import pandas as pd

# NOTE(review): `df_small` is not created by any cell shown in this notebook
# (execution count 11 is absent) -- confirm it is defined before this cell so
# the notebook survives Restart & Run All.

# Show which columns are actually available before relying on any of them.
print(df_small.columns.tolist())
print([name for name in df_small.columns if "income" in name])
print([name for name in df_small.columns if "grad" in name])

# Column names this analysis expects.
INCOME_COL = "annual_income"
GRAD_COL   = "hs_graduation_rate"
COUNTY_COL = "county"

# Coerce the numeric columns that exist; unparseable entries become NaN.
# The loop variable is `col` (not `c`) so it does not rebind the array `c`
# created in the indexing cell.
for col in (INCOME_COL, GRAD_COL):
    if col in df_small.columns:
        df_small[col] = pd.to_numeric(df_small[col], errors="coerce")

# Every subset below is always bound: when its source column is missing it
# falls back to an empty frame with the same columns as `df_small`, so later
# cells never hit a NameError.

# Rows with income of at least 100,000.
if INCOME_COL in df_small.columns:
    high_income = df_small.loc[df_small[INCOME_COL] >= 100_000].copy()
else:
    high_income = df_small.iloc[0:0].copy()  # empty fallback

# Rows for Prince George's County.
if COUNTY_COL in df_small.columns:
    prince_georges = df_small.loc[df_small[COUNTY_COL] == "Prince George's County"].copy()
else:
    prince_georges = df_small.iloc[0:0].copy()  # empty fallback

# Ten rows with the highest graduation rate (sort_values places NaN last).
if GRAD_COL in df_small.columns:
    top_grad = df_small.sort_values(GRAD_COL, ascending=False).head(10)
else:
    top_grad = df_small.iloc[0:0].copy()  # empty fallback


['puma']
[]
[]


In [14]:
# Convert the numeric columns that are present; values that cannot be parsed become NaN.
for col in ("annual_income", "hs_graduation_rate"):
    if col not in df_small.columns:
        continue
    df_small[col] = pd.to_numeric(df_small[col], errors="coerce")


In [15]:
# Bucket income into labelled brackets (pd.cut intervals are right-closed by default).
if "annual_income" in df_small.columns:
    bins = [-1, 25000, 50000, 75000, 100000, 150000, 10**12]
    labels = ["≤25k","25–50k","50–75k","75–100k","100–150k","≥150k"]
    df_small["income_bin"] = pd.cut(
        df_small["annual_income"],
        bins=bins,
        labels=labels,
    )

# Derive 0/1 indicator columns: 1 when the text value is "yes" (case-insensitive), else 0.
for source_col, flag_col in (
    ("device_ownership", "device_yes"),
    ("internet_access", "internet_yes"),
):
    if source_col in df_small.columns:
        answered_yes = df_small[source_col].astype(str).str.lower().eq("yes")
        df_small[flag_col] = answered_yes.astype(int)


In [16]:
def _grouped_summary(frame, key, count_col, metrics, **groupby_kwargs):
    """Aggregate ``frame`` per ``key``, using only metrics whose source column exists.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to summarise.
    key : str
        Name of the grouping column.
    count_col : str
        Any existing non-key column; used only to compute the group size ``n``.
    metrics : dict
        Maps output column name -> ``(source column, aggregation)``.
    **groupby_kwargs
        Forwarded to ``DataFrame.groupby``.

    Returns
    -------
    pandas.DataFrame
        Columns ``key``, ``n``, then one column per entry of ``metrics`` in the
        given order. A metric whose source column is missing from ``frame`` is
        filled with NaN instead of being replaced by an unrelated statistic.
    """
    available = {
        out_name: spec
        for out_name, spec in metrics.items()
        if spec[0] in frame.columns
    }
    summary = (
        frame.groupby(key, as_index=False, **groupby_kwargs)
        .agg(n=(count_col, "size"), **available)
    )
    # Keep the output schema stable: a metric that could not be computed is
    # reported as missing rather than omitted or mislabelled.
    for out_name in metrics:
        if out_name not in available:
            summary[out_name] = np.nan
    return summary[[key, "n", *metrics]]


# by county
if {"county","hs_graduation_rate","annual_income"} <= set(df_small.columns):
    by_county = _grouped_summary(
        df_small,
        "county",
        "hs_graduation_rate",
        {
            "grad_mean": ("hs_graduation_rate", "mean"),
            "income_median": ("annual_income", "median"),
            "device_rate": ("device_yes", "mean"),
            "internet_rate": ("internet_yes", "mean"),
        },
    ).sort_values("grad_mean", ascending=False)

# by income level
if "income_bin" in df_small.columns:
    # Group size does not depend on which column is counted, so either works.
    size_source = (
        "hs_graduation_rate" if "hs_graduation_rate" in df_small.columns else "annual_income"
    )
    by_income = _grouped_summary(
        df_small,
        "income_bin",
        size_source,
        {
            "grad_mean": ("hs_graduation_rate", "mean"),
            "device_rate": ("device_yes", "mean"),
            "internet_rate": ("internet_yes", "mean"),
            "income_median": ("annual_income", "median"),
        },
        # income_bin is categorical (produced by pd.cut): state observed=False
        # explicitly so empty brackets stay in the table regardless of the
        # pandas default.
        observed=False,
    ).sort_values("income_median")


In [17]:
# Row-normalised share of each answer within every income bracket, rounded to 3 decimals.
if "income_bin" in df_small.columns:
    if "device_ownership" in df_small.columns:
        device_ct = pd.crosstab(
            index=df_small["income_bin"],
            columns=df_small["device_ownership"],
            normalize="index",
        ).round(decimals=3)
    if "internet_access" in df_small.columns:
        net_ct = pd.crosstab(
            index=df_small["income_bin"],
            columns=df_small["internet_access"],
            normalize="index",
        ).round(decimals=3)


In [18]:
# Write each summary table that was actually built to its own CSV (row index omitted).
exports = (
    ("by_county", "county_summary.csv"),
    ("by_income", "income_summary.csv"),
)
for summary_name, csv_name in exports:
    if summary_name in locals():
        locals()[summary_name].to_csv(csv_name, index=False)
