# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the survey extract. The absolute path looks like a Colab session
# directory -- adjust it when running elsewhere.
df = pd.read_csv("/content/yrbs_2023_sadc_vars.csv")


# Survey items that get a companion 0/1 indicator column named "<item>_yes".
yes_vars = [
    "sleep_8plus",
    "sad_hopeless",
    "miss_school_unsafe",
    "bullied_school",
    "electronic_bullied",
    "pe_daily",
    "soda_2plus",
    "breakfast_7day",
]

# A response equal to 1 is recoded to 1, any other non-missing response to 0,
# and missing responses stay NaN so they can be dropped per analysis later.
for item in yes_vars:
    answers = df[item]
    is_yes = (answers == 1).astype(int)
    df[item + "_yes"] = np.where(answers.isna(), np.nan, is_yes)


# Alias of the sleep indicator; used as the grouping variable below.
df["sleep_sufficient"] = df["sleep_8plus_yes"]

def diff_prop_ci(p1, n1, p0, n0, z=1.96):
    """Return the difference of two proportions with a normal-approximation CI.

    The standard error is the unpooled one: the square root of the sum of
    each group's ``p * (1 - p) / n``.

    Args:
        p1: Proportion observed in the first group.
        n1: Number of observations behind ``p1``.
        p0: Proportion observed in the second group.
        n0: Number of observations behind ``p0``.
        z: Multiplier applied to the standard error; the default 1.96 gives
            a two-sided 95% interval.

    Returns:
        A tuple ``(difference, lower, upper)`` where ``difference`` is
        ``p1 - p0`` and the bounds are ``difference -/+ z * se``.
    """
    var_first = p1 * (1 - p1) / n1
    var_second = p0 * (1 - p0) / n0
    std_err = np.sqrt(var_first + var_second)

    estimate = p1 - p0
    margin = z * std_err
    return (estimate, estimate - margin, estimate + margin)

def rate(series):
    """Return the mean and the count of the non-missing values in ``series``.

    Args:
        series: A pandas Series. For a 0/1 indicator the mean is the
            proportion of ones.

    Returns:
        A tuple ``(mean, n)``, both computed after missing values are dropped.
    """
    observed = series.dropna()
    return observed.mean(), len(observed)


# --- Sad/hopeless by sleep sufficiency ---
# Keep only rows where both the sleep flag and the outcome are recorded.
sub = df.dropna(subset=["sleep_sufficient", "sad_hopeless_yes"]).copy()

short_sleep_rows = sub["sleep_sufficient"] == 0
enough_sleep_rows = sub["sleep_sufficient"] == 1
p_insuf, n_insuf = rate(sub.loc[short_sleep_rows, "sad_hopeless_yes"])
p_suf, n_suf = rate(sub.loc[enough_sleep_rows, "sad_hopeless_yes"])

# The difference is oriented as insufficient sleep minus sufficient sleep.
diff, lo, hi = diff_prop_ci(p_insuf, n_insuf, p_suf, n_suf)

print("Sad/Hopeless | sleep insufficient:", p_insuf, "n=", n_insuf)
print("Sad/Hopeless | sleep sufficient  :", p_suf, "n=", n_suf)
print("Difference (insuf - suf):", diff, "95% CI:", (lo, hi))

# Bar chart of the two group proportions.
bar_labels = ["<8 hours", "≥8 hours"]
bar_heights = [p_insuf, p_suf]
plt.bar(bar_labels, bar_heights)
plt.title("Sad/Hopeless by Sleep (YRBS 2023 SADC)")
plt.ylabel("Proportion reporting sad/hopeless")
plt.show()


# --- Missed school (felt unsafe) by sleep sufficiency ---
# Keep only rows where both the sleep flag and the outcome are recorded.
sub = df.dropna(subset=["sleep_sufficient", "miss_school_unsafe_yes"]).copy()

short_sleep_rows = sub["sleep_sufficient"] == 0
enough_sleep_rows = sub["sleep_sufficient"] == 1
p_insuf, n_insuf = rate(sub.loc[short_sleep_rows, "miss_school_unsafe_yes"])
p_suf, n_suf = rate(sub.loc[enough_sleep_rows, "miss_school_unsafe_yes"])

# The difference is oriented as insufficient sleep minus sufficient sleep.
diff, lo, hi = diff_prop_ci(p_insuf, n_insuf, p_suf, n_suf)

print("Missed school unsafe | sleep insufficient:", p_insuf, "n=", n_insuf)
print("Missed school unsafe | sleep sufficient  :", p_suf, "n=", n_suf)
print("Difference (insuf - suf):", diff, "95% CI:", (lo, hi))

# Bar chart of the two group proportions.
bar_labels = ["<8 hours", "≥8 hours"]
bar_heights = [p_insuf, p_suf]
plt.bar(bar_labels, bar_heights)
plt.title("Attendance Disruption by Sleep (YRBS 2023 SADC)")
plt.ylabel("Proportion missing school (felt unsafe)")
plt.show()

# --- Missed school by sleep, stratified by sad/hopeless ---
strata_cols = ["sleep_sufficient", "sad_hopeless_yes"]
outcome_col = "miss_school_unsafe_yes"

# Complete cases across both stratifiers and the outcome.
sub = df.dropna(subset=strata_cols + [outcome_col]).copy()

# One row per (sleep, sad) cell with the outcome mean and the cell size.
cell_stats = sub.groupby(strata_cols)[outcome_col].agg(["mean", "count"])
grp = cell_stats.reset_index()

print(grp)

# Plot: one line per sad/hopeless stratum, running across the sleep groups.
for sad in (0, 1):
    stratum = grp.loc[grp["sad_hopeless_yes"] == sad]
    stratum = stratum.sort_values("sleep_sufficient")
    plt.plot(
        stratum["sleep_sufficient"],
        stratum["mean"],
        marker="o",
        label=f"sad={sad}",
    )

plt.xticks([0, 1], ["<8 hours", "≥8 hours"])
plt.title("Sleep vs Attendance Disruption, split by Sad/Hopeless")
plt.ylabel("P(missed school due to unsafe)")
plt.legend()
plt.show()

def compare_by_sleep(outcome_yes_col, label):
    """Print and return an outcome's rate in each sleep group.

    Reads the module-level ``df``. Rows missing either ``sleep_sufficient``
    or the outcome are excluded before the rates are computed.

    Args:
        outcome_yes_col: Name of the indicator column in ``df`` to summarise.
        label: Text printed in front of the summary line.

    Returns:
        A tuple ``(p_insuf, p_suf)``: the outcome mean where
        ``sleep_sufficient == 0`` and where ``sleep_sufficient == 1``.
    """
    complete = df.dropna(subset=["sleep_sufficient", outcome_yes_col]).copy()

    short_sleep_rows = complete["sleep_sufficient"] == 0
    enough_sleep_rows = complete["sleep_sufficient"] == 1
    p_insuf, n_insuf = rate(complete.loc[short_sleep_rows, outcome_yes_col])
    p_suf, n_suf = rate(complete.loc[enough_sleep_rows, outcome_yes_col])

    # The difference is oriented as insufficient sleep minus sufficient sleep.
    diff, lo, hi = diff_prop_ci(p_insuf, n_insuf, p_suf, n_suf)
    interval = (lo, hi)
    print(label, "insuf:", p_insuf, "suf:", p_suf, "diff:", diff, "CI:", interval)
    return p_insuf, p_suf

# --- Bullying outcomes by sleep sufficiency ---
# Each entry: (indicator column, printed label, plot title, y-axis label).
bullying_panels = [
    (
        "bullied_school_yes",
        "Bullied at school",
        "Bullied at School by Sleep",
        "Proportion bullied",
    ),
    (
        "electronic_bullied_yes",
        "Electronically bullied",
        "Electronic Bullying by Sleep",
        "Proportion electronically bullied",
    ),
]

for outcome_col, print_label, plot_title, y_label in bullying_panels:
    p1, p2 = compare_by_sleep(outcome_col, print_label)
    plt.bar(["<8 hours", "≥8 hours"], [p1, p2])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.show()


# --- Sleep sufficiency and daily PE by grade ---
# Both trends use the same complete-case sample (grade, sleep and PE present).
sub = df.dropna(subset=["grade", "sleep_sufficient", "pe_daily_yes"]).copy()

by_grade = sub.groupby("grade")
sleep_by_grade = by_grade["sleep_sufficient"].mean()
pe_by_grade = by_grade["pe_daily_yes"].mean()

print("Sleep sufficient by grade:\n", sleep_by_grade)
print("\nDaily PE by grade:\n", pe_by_grade)

# NOTE(review): the tick labels assume grade is coded 1-4 for 9th-12th --
# confirm against the codebook.
grade_ticks = [1, 2, 3, 4]
grade_names = ["9th", "10th", "11th", "12th"]

# Each entry: (per-grade means, y-axis label, plot title).
grade_panels = (
    (sleep_by_grade, "P(sleep ≥8 hours)", "Sleep Sufficiency Declines Across High School"),
    (pe_by_grade, "P(daily PE)", "Daily PE Declines Across High School"),
)

for trend, y_label, plot_title in grade_panels:
    plt.plot(trend.index, trend.values, marker="o")
    plt.xticks(grade_ticks, grade_names)
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.show()

# --- Sleep sufficiency by daily PE ---
# Keep only rows where both the PE flag and the sleep flag are recorded.
sub = df.dropna(subset=["pe_daily_yes", "sleep_sufficient"]).copy()

daily_pe_rows = sub["pe_daily_yes"] == 1
no_daily_pe_rows = sub["pe_daily_yes"] == 0
p_pe, n_pe = rate(sub.loc[daily_pe_rows, "sleep_sufficient"])
p_no, n_no = rate(sub.loc[no_daily_pe_rows, "sleep_sufficient"])

# The difference is oriented as daily PE minus no daily PE.
diff, lo, hi = diff_prop_ci(p_pe, n_pe, p_no, n_no)
print("Sleep sufficient | PE daily vs not:", p_pe, p_no, "diff:", diff, "CI:", (lo, hi))

# Bars are drawn with the no-PE group first.
bar_labels = ["No daily PE", "Daily PE"]
bar_heights = [p_no, p_pe]
plt.bar(bar_labels, bar_heights)
plt.title("Daily PE and Sleep Sufficiency")
plt.ylabel("P(sleep ≥8 hours)")
plt.show()

# --- Sad/hopeless by daily PE ---
# Keep only rows where both the PE flag and the outcome are recorded.
sub = df.dropna(subset=["pe_daily_yes", "sad_hopeless_yes"]).copy()

daily_pe_rows = sub["pe_daily_yes"] == 1
no_daily_pe_rows = sub["pe_daily_yes"] == 0
p_pe, n_pe = rate(sub.loc[daily_pe_rows, "sad_hopeless_yes"])
p_no, n_no = rate(sub.loc[no_daily_pe_rows, "sad_hopeless_yes"])

# The difference is oriented as daily PE minus no daily PE.
diff, lo, hi = diff_prop_ci(p_pe, n_pe, p_no, n_no)
print("Sad/hopeless | PE daily vs not:", p_pe, p_no, "diff:", diff, "CI:", (lo, hi))

# Bars are drawn with the no-PE group first.
bar_labels = ["No daily PE", "Daily PE"]
bar_heights = [p_no, p_pe]
plt.bar(bar_labels, bar_heights)
plt.title("Daily PE and Sad/Hopeless")
plt.ylabel("P(sad/hopeless)")
plt.show()

