# 03_regression_analysis.ipynb

전처리 결과(Parquet)를 로드해 앱 사용량(카테고리)과 수면 지표의 주효과 및 조절효과(상호작용)를 단계적 회귀로 검증합니다. 반복측정(week within uid)을 고려해 cluster-robust 표준오차(그룹=uid)를 사용합니다. 결과표/VIF/조절효과 그래프를 results/에 저장합니다. (데이터 비공개)

셀 1 — 로드/컬럼 정리

In [None]:
# TL;DR: Parquet 로드 → 중심화/상호작용(여기서만) → OLS(클러스터 SE) → VIF/플롯 저장
import pyarrow.parquet as pq
import pandas as pd, numpy as np
from pathlib import Path

# Both output folders must exist before any table or figure is written.
for _dir in ("results/figures", "results/tables"):
    Path(_dir).mkdir(parents=True, exist_ok=True)

# Weekly table written by the preprocessing step.
PARQUET_IN = "results/tables/processed_weekly.parquet"
df = pq.read_table(PARQUET_IN).to_pandas()

# Swap every '.' in a column name for '_' (presumably so the names can be
# used directly inside the formula strings of the regression cells).
df.columns = df.columns.map(lambda name: name.replace('.', '_'))


셀 2 — 중심화/상호작용

In [None]:
# Base names of the app-usage predictors, one per app category.
apps = ['AppCategory_GAME','AppCategory_MAPS','AppCategory_NEWS',
        'AppCategory_PRODUCTIVITY','AppCategory_SOCIAL','AppCategory_VIDEO']

# Use the "<name>_hours" column whenever it exists; otherwise keep the bare name.
apps = [f"{c}_hours" if f"{c}_hours" in df.columns else c for c in apps]

# Stop right away and name every absent predictor, instead of failing on the
# first one halfway through the centering loop below.
missing_apps = [c for c in apps if c not in df.columns]
if missing_apps:
    raise KeyError(f"app usage columns not found in df: {missing_apps}")

y_vars = ['PHQ9_score','GAD7_score','Stress_score']

# Grand-mean centering of every app predictor and of the sleep moderator.
for c in apps:
    df[f"{c}_c"] = df[c] - df[c].mean()
df["sleep_c"] = df["mean_confidence_sleep"] - df["mean_confidence_sleep"].mean()

# Interaction terms: centered app usage times centered sleep.
for c in apps:
    short = c.replace("AppCategory_","").replace("_hours","")
    df[f"{short}_x_sleep"] = df[f"{c}_c"] * df["sleep_c"]

# Participant id as a categorical; it is the grouping variable for the
# cluster-robust standard errors used in the regression cell.
df["uid"] = df["uid"].astype("category")


셀 3 — 단계적 회귀(클러스터-강건 SE)

In [None]:
import statsmodels.formula.api as smf

def fit_steps(dep, app):
    """Fit the three nested OLS models for one outcome and one app category.

    Step 1 regresses ``dep`` on the centered app usage, step 2 adds the
    centered sleep measure, and step 3 adds their product term. Every model
    uses cluster-robust standard errors grouped by ``uid``.

    Args:
        dep: Name of the dependent-variable column in the global ``df``.
        app: Name of the app-usage column; ``{app}_c`` and the matching
            ``{short}_x_sleep`` column must already exist in ``df``.

    Returns:
        Tuple ``(step1, step2, step3)`` of fitted statsmodels results.
    """
    short = app.replace("AppCategory_", "").replace("_hours", "")
    app_c = f"{app}_c"
    inter = f"{short}_x_sleep"

    # The formula interface drops rows with missing values before fitting,
    # while a group vector taken from the full frame keeps its full length.
    # Restricting to complete cases first keeps groups and rows aligned, and
    # also estimates the three nested models on the same sample.
    data = df.dropna(subset=[dep, app_c, "sleep_c", inter, "uid"])

    # Integer labels, one per participant, for the cluster covariance.
    groups = pd.factorize(data["uid"])[0]
    kw = dict(cov_type="cluster", cov_kwds={"groups": groups})

    formulas = (
        f"{dep} ~ {app_c}",
        f"{dep} ~ {app_c} + sleep_c",
        f"{dep} ~ {app_c} + sleep_c + {inter}",
    )
    return tuple(smf.ols(formula, data=data).fit(**kw) for formula in formulas)

# results[app][dep] holds the (step1, step2, step3) tuple from fit_steps.
results = {
    app: {dep: fit_steps(dep, app) for dep in y_vars}
    for app in apps
}

# One summary row per (category, outcome), taken from the step-3 model.
rows = []
for app in apps:
    short = app.replace("AppCategory_","").replace("_hours","")
    app_term = f"{app}_c"
    inter_term = f"{short}_x_sleep"
    for dep in y_vars:
        step3 = results[app][dep][2]
        t_stats, p_vals = step3.tvalues, step3.pvalues
        # Terms absent from the model are reported as NaN rather than raising.
        rows.append({
            "Category": short,
            "Dependent": dep,
            "R2": step3.rsquared,
            "App_t": t_stats.get(app_term, np.nan),
            "App_p": p_vals.get(app_term, np.nan),
            "Sleep_t": t_stats.get("sleep_c", np.nan),
            "Sleep_p": p_vals.get("sleep_c", np.nan),
            "Inter_t": t_stats.get(inter_term, np.nan),
            "Inter_p": p_vals.get(inter_term, np.nan),
        })

summary_df = pd.DataFrame(rows)
summary_df.to_csv("results/tables/moderation_step3_APA.csv", index=False)
summary_df.head()


셀 4 — VIF(상호작용 포함)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from statsmodels.tools.tools import add_constant

# Variance inflation factors of the three step-3 predictors, per app category.
vif_rows = []
for app in apps:
    short = app.replace("AppCategory_","").replace("_hours","")
    vars_v = [f"{app}_c", "sleep_c", f"{short}_x_sleep"]
    # The auxiliary regressions behind the VIF use the matrix exactly as
    # given, so an intercept has to be supplied explicitly. Without it the
    # product term, whose mean is generally not zero even though both factors
    # are centered, gets a distorted VIF. Rows with missing values are removed
    # so they cannot contaminate those regressions.
    X = add_constant(df[vars_v].dropna(), has_constant="add").values
    # Column 0 is the constant, so predictor number k sits at column k.
    for k, var in enumerate(vars_v, start=1):
        vif_rows.append({"Category": short, "Variable": var, "VIF": vif(X, k)})

vif_df = pd.DataFrame(vif_rows)
vif_df.to_csv("results/tables/vif_by_category.csv", index=False)
vif_df.head()


셀 5 — 조절효과 플롯(유의한 것만 저장)

In [None]:
import matplotlib.pyplot as plt

def plot_moderation(dep, model, app_short, app_c_col):
    """Draw and save the predicted outcome at low, mean and high sleep.

    Predictions come from the model coefficients over the observed range of
    the centered app-usage column, at ``sleep_c`` equal to its mean minus one
    SD, its mean, and its mean plus one SD. The figure is written to
    ``results/figures/moderation_{app_short}_{dep}.png``.

    Args:
        dep: Name of the dependent variable (y-axis label and file name).
        model: Fitted results object exposing ``params``.
        app_short: Short category name used in labels, title and file name.
        app_c_col: Name of the centered app-usage column in the global ``df``.
    """
    x = np.linspace(df[app_c_col].min(), df[app_c_col].max(), 200)
    sleep_mean, sleep_sd = df["sleep_c"].mean(), df["sleep_c"].std()
    inter = f"{app_short}_x_sleep"

    # A coefficient missing from the model counts as zero.
    coefs = model.params
    b0 = coefs.get("Intercept", 0)
    b1 = coefs.get(app_c_col, 0)
    b2 = coefs.get("sleep_c", 0)
    b3 = coefs.get(inter, 0)

    def predicted(sleep_level):
        # Model prediction along x with sleep held fixed at sleep_level.
        return b0 + b1*x + b2*sleep_level + b3*x*sleep_level

    curves = (
        ('Low Sleep (-1 SD)', sleep_mean - sleep_sd),
        ('Mean Sleep', sleep_mean),
        ('High Sleep (+1 SD)', sleep_mean + sleep_sd),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    for label, level in curves:
        ax.plot(x, predicted(level), label=label)
    ax.set_xlabel(f'{app_short} Usage (centered)')
    ax.set_ylabel(dep)
    ax.set_title(f'Moderation: {app_short} × Sleep on {dep}')
    ax.legend()
    ax.grid(True)

    out = f"results/figures/moderation_{app_short}_{dep}.png"
    fig.tight_layout()
    fig.savefig(out, dpi=180)
    plt.show()
    # Release the figure so repeated calls in a loop do not pile up open figures.
    plt.close(fig)
    print("→ 저장:", out)

# Example: figures for PHQ9 only (repeat for GAD7/Stress if wanted).
for app in apps:
    short = app.replace("AppCategory_","").replace("_hours","")
    step3 = results[app]["PHQ9_score"][2]
    # A missing interaction term is treated as p = 1.0, i.e. not significant.
    p_inter = step3.pvalues.get(f"{short}_x_sleep", 1.0)
    if not (p_inter < 0.05):
        continue
    plot_moderation("PHQ9_score", step3, short, f"{app}_c")


셀 6 — 앱 사용 범주 집계 → 피벗

In [None]:
# Sum the durations per (uid, week, category) key on the RDD.
keyed_durations = app_usage_rdd.map(
    lambda rec: ((rec['uid'], rec['week'], rec['category']), rec['duration'])
)
summed_durations = keyed_durations.reduceByKey(lambda left, right: left + right)

# Flatten ((uid, week, category), total) into a 4-tuple for the DataFrame.
app_usage_agg = summed_durations.map(lambda kv: (*kv[0], kv[1]))
app_usage_df = spark.createDataFrame(
    app_usage_agg, schema=["uid","week","category","duration"]
)

# One row per (uid, week) and one column per category; nulls become 0.
per_user_week = app_usage_df.groupBy("uid","week")
app_usage_pivot = per_user_week.pivot("category").sum("duration").fillna(0)


셀 7 — 수면 지표 집계 + 주의(시간 문자열)

In [None]:
# 7 — 수면 다이어리: "HH:MM:SS" → 분(min) 환산 후 주 평균
from pyspark.sql.functions import split, col, avg

# Keep only diary records that carry a week value, then switch to a DataFrame.
sleep_diary_df = sleep_diary_rdd.filter(lambda rec: rec['week'] is not None).toDF()

# Duration in minutes from the pieces of the ":"-separated string
# (the cell header gives the expected format as "HH:MM:SS").
pieces = col("parts")
duration_minutes = (
    pieces.getItem(0).cast("double") * 60          # hours -> minutes
    + pieces.getItem(1).cast("double")             # minutes
    + pieces.getItem(2).cast("double") / 60.0      # seconds -> minutes
)

sleep_diary_df = sleep_diary_df.withColumn("parts", split(col("midawake_duration"), ":"))
sleep_diary_df = sleep_diary_df.withColumn("midawake_duration_min", duration_minutes)

# Mean of the minute values within each (uid, week).
weekly_mean = avg("midawake_duration_min").alias("midawake_duration_sleep")
sleep_diary_agg = sleep_diary_df.groupBy("uid", "week").agg(weekly_mean)


셀 8 — 병합 & 결측 대치

In [None]:
# App usage and response scores must both be present (inner join); the two
# sleep tables are optional per (uid, week) and are attached with left joins.
df_merged = (app_usage_pivot
             .join(response_scores_df, on=["uid","week"], how="inner")
             .join(sleep_agg,          on=["uid","week"], how="left")
             .join(sleep_diary_agg,    on=["uid","week"], how="left"))

# Both column means in a single pass over the merged table.
avg_conf, avg_awake = df_merged.select(
    _mean("mean_confidence_sleep"),
    _mean("midawake_duration_sleep"),
).first()

# A mean comes back as None when its column has no non-null value, and fillna
# does not accept None as a replacement, so such a column is left as it is.
fill_values = {
    name: value
    for name, value in (("mean_confidence_sleep", avg_conf),
                        ("midawake_duration_sleep", avg_awake))
    if value is not None
}
if fill_values:
    df_merged = df_merged.fillna(fill_values)

df_merged.printSchema()
df_merged.show(5, truncate=False)

셀 9 — 단계적 회귀(Step1–3) 및 요약 표 저장 (선행 조건: Pandas로 변환, 컬럼명 정리(‘.’→‘_’), 중심화/상호작용 생성이 df_pd에 이미 적용되어 있어야 함)

In [None]:
def _fit_step(formula, columns):
    """Fit one OLS model on df_pd, clustering standard errors by uid if present.

    Args:
        formula: Model formula string.
        columns: Names of all columns the formula uses (outcome included).

    Returns:
        Fitted statsmodels results object.
    """
    if "uid" not in df_pd.columns:
        # No participant id to cluster on: fall back to the plain OLS fit.
        return smf.ols(formula, data=df_pd).fit()
    # Remove incomplete rows explicitly so the group labels line up one-to-one
    # with the rows the model is estimated on.
    data = df_pd.dropna(subset=[*columns, "uid"])
    groups = pd.factorize(data["uid"])[0]
    return smf.ols(formula, data=data).fit(
        cov_type="cluster", cov_kwds={"groups": groups}
    )


# results[step][short category][outcome] -> fitted model
results = {'Step1': {}, 'Step2': {}, 'Step3': {}}
sleep_centered = "mean_confidence_sleep_centered"

for category in app_categories:
    short = category.replace("AppCategory_", "")
    cat_centered = f"{category}_centered"
    interaction = f"{short}_sleep_interaction"

    # Predictors entered at each step: app usage, then sleep, then their product.
    step_predictors = {
        'Step1': [cat_centered],
        'Step2': [cat_centered, sleep_centered],
        'Step3': [cat_centered, sleep_centered, interaction],
    }

    for step in step_predictors:
        results[step][short] = {}

    for dep in dependent_vars:
        for step, predictors in step_predictors.items():
            formula = f"{dep} ~ " + " + ".join(predictors)
            results[step][short][dep] = _fit_step(formula, [dep, *predictors])

# Console summary (same layout as the original output).
print("\n=== 회귀분석 결과 요약 ===")
for step in ['Step1','Step2','Step3']:
    print(f"\n--- {step} 결과 ---")
    for category in results[step]:
        print(f"\n카테고리: {category}")
        for dep in dependent_vars:
            m = results[step][category][dep]
            print(f"  {dep}:")
            print(f"    R-squared: {m.rsquared:.3f}")
            for var in m.params.index:
                print(f"    {var}: t={m.tvalues[var]:.3f}, p={m.pvalues[var]:.3f}")

# Step-3 statistics as an APA-style table, one row per (category, outcome).
rows = []
for category in results['Step3']:
    interaction_term = f"{category}_sleep_interaction"
    category_centered = f"AppCategory_{category}_centered"
    for dep in dependent_vars:
        m = results['Step3'][category][dep]
        # Terms absent from the model are reported as NaN rather than raising.
        rows.append({
            'Category': category,
            'Dependent': dep,
            'R2': m.rsquared,
            'App_t': m.tvalues.get(category_centered, np.nan),
            'App_p': m.pvalues.get(category_centered, np.nan),
            'Sleep_t': m.tvalues.get('mean_confidence_sleep_centered', np.nan),
            'Sleep_p': m.pvalues.get('mean_confidence_sleep_centered', np.nan),
            'Interaction_t': m.tvalues.get(interaction_term, np.nan),
            'Interaction_p': m.pvalues.get(interaction_term, np.nan)
        })
summary_df = pd.DataFrame(rows)
summary_path = OUT_TBL / "moderation_step3_APA.csv"
summary_df.to_csv(summary_path, index=False)
print("\n→ 저장:", summary_path)


셀 11 — VIF + 저장

In [None]:
# Imported here because this cell calls the function by its full name.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Variance inflation factors of the three step-3 predictors, per app category.
vif_rows = []
for category in app_categories:
    cat_short = category.replace('AppCategory_', '')
    vars_vif = [f"{category}_centered", "mean_confidence_sleep_centered", f"{cat_short}_sleep_interaction"]
    # The auxiliary regressions behind the VIF use the matrix exactly as
    # given, so an intercept has to be supplied explicitly. Without it the
    # product term, whose mean is generally not zero, gets a distorted VIF.
    # Rows with missing values are removed beforehand.
    X = add_constant(df_pd[vars_vif].dropna(), has_constant="add").values
    # Column 0 is the constant, so predictor number k sits at column k.
    for k, var in enumerate(vars_vif, start=1):
        vif_rows.append({
            "Category": cat_short,
            "Variable": var,
            "VIF": variance_inflation_factor(X, k),
        })

vif_df = pd.DataFrame(vif_rows)
vif_path = OUT_TBL / "vif_by_category.csv"
vif_df.to_csv(vif_path, index=False)
print("→ 저장:", vif_path)

셀 12 — 조절효과 플롯(원본 로직 유지) + 저장

In [None]:
def plot_moderation(dep_var, model, category_centered, category_short, out_dir=OUT_FIG):
    """Draw and save the predicted outcome at low, mean and high sleep quality.

    Predictions come from the model coefficients over the observed range of
    ``category_centered`` in the global ``df_pd``, with the centered sleep
    variable fixed at its mean minus one SD, its mean, and its mean plus one
    SD. The figure is saved as ``moderation_{category_short}_{dep_var}.png``
    under ``out_dir``.

    Args:
        dep_var: Name of the dependent variable (y-axis label and file name).
        model: Fitted results object exposing ``params``; it must contain an
            ``Intercept`` entry.
        category_centered: Name of the centered app-usage column in ``df_pd``.
        category_short: Short category name used in labels and file name.
        out_dir: Output directory; must support the ``/`` path operator.

    Raises:
        KeyError: If ``model.params`` has no ``Intercept`` entry.
    """
    x = np.linspace(df_pd[category_centered].min(), df_pd[category_centered].max(), 200)
    mean_conf = df_pd['mean_confidence_sleep_centered'].mean()
    std_conf  = df_pd['mean_confidence_sleep_centered'].std()
    inter = f"{category_short}_sleep_interaction"

    # Slopes missing from the model count as zero; the intercept is required.
    coefs = model.params
    b0 = coefs['Intercept']
    b_app = coefs.get(category_centered, 0)
    b_sleep = coefs.get('mean_confidence_sleep_centered', 0)
    b_inter = coefs.get(inter, 0)

    def predicted(sleep_level):
        # Model prediction along x with sleep held fixed at sleep_level.
        return b0 + b_app*x + b_sleep*sleep_level + b_inter*x*sleep_level

    curves = (
        ('Low Sleep Quality (-1 SD)',  mean_conf - std_conf, 'red'),
        ('Mean Sleep Quality',         mean_conf,            'blue'),
        ('High Sleep Quality (+1 SD)', mean_conf + std_conf, 'green'),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    for label, level, color in curves:
        ax.plot(x, predicted(level), label=label, color=color)
    ax.set_xlabel(f'{category_short} Usage (centered)')
    ax.set_ylabel(dep_var)
    ax.set_title(f'Moderation Effect: {category_short} × Sleep on {dep_var}')
    ax.legend()
    ax.grid(True)

    out_path = out_dir / f"moderation_{category_short}_{dep_var}.png"
    fig.tight_layout()
    fig.savefig(out_path, dpi=180)
    plt.show()
    # Release the figure so repeated calls in a loop do not pile up open figures.
    plt.close(fig)
    print("→ 저장:", out_path)

print("\n=== 유의미한 조절효과 시각화 ===")
# Plot only the step-3 models whose interaction term has p < 0.05.
for category, models_by_dep in results['Step3'].items():
    cat_centered = f"AppCategory_{category}_centered"
    inter = f"{category}_sleep_interaction"
    for dep in dependent_vars:
        step3 = models_by_dep[dep]
        # A missing interaction term is treated as p = 1.0, i.e. not significant.
        p_inter = step3.pvalues.get(inter, 1.0)
        if not (p_inter < 0.05):
            continue
        print(f"Plotting moderation effect for {category} on {dep}")
        plot_moderation(dep, step3, cat_centered, category)

셀 13 — 종료

In [None]:
# Final cell: call stop() on the `spark` object (presumably the SparkSession
# used by the Spark cells above; it is created outside this section).
spark.stop()