In [57]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np
import os

# Project root directory that contains the "Data" folder used by this notebook.
# Set the PROJECT_ROOT environment variable to run on another machine; when it
# is not set, the original local path is used so existing behaviour is unchanged.
ROOT = os.environ.get("PROJECT_ROOT", "c:\\Users\\1004c\\Desktop\\통계청 논문 공모전")

In [102]:
# Load df_cf_T4.csv from <ROOT>/Data/T4_Y1 and print its column summary.
data_path = os.path.join(ROOT, "Data", "T4_Y1", "df_cf_T4.csv")
df = pd.read_csv(data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6309 entries, 0 to 6308
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   X1_성별                  6309 non-null   int64  
 1   X1_연령대                 6309 non-null   object 
 2   X1_졸업평점점수              6309 non-null   float64
 3   X1_졸업평점결측              6309 non-null   int64  
 4   X1_혼인여부                6309 non-null   int64  
 5   X1_부모학력                6309 non-null   float64
 6   X1_부모소득구간              6309 non-null   object 
 7   X2_설립유형_국공립            6309 non-null   int64  
 8   X2_소재지_지방              6309 non-null   int64  
 9   X2_전공그룹                6309 non-null   object 
 10  X2_학교유형_4년제이상          6309 non-null   int64  
 11  X3_진로교육_참여개수           6309 non-null   int64  
 12  X3_취업준비활동_참여개수         6309 non-null   int64  
 13  X3_대학_재학_중_일자리_경험의_유무  6309 non-null   int64  
 14  X3_어학연수_경험_횟수          6309 non-null   int64  
 15  X3_자

- 부모 학력 재정의

In [103]:
# Bucket parental education into five categories.
def map_parent_edu(x):
    """Map a parental-education value to one of five category labels.

    Parameters
    ----------
    x : int or float
        A single value of the ``X1_부모학력`` column.
        NOTE(review): the cut-offs (9 / 12 / 13 / 16) look like years of
        schooling — confirm against the survey codebook.

    Returns
    -------
    str or float
        "중졸이하" for x <= 9, "고졸" for x <= 12, "전문대졸" for x <= 13,
        "대졸" for x <= 16, otherwise "대학원졸". A missing input (NaN/None)
        yields NaN.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # value would fall through to the final branch and be labelled "대학원졸".
    if pd.isna(x):
        return np.nan
    if x <= 9:
        return "중졸이하"
    elif x <= 12:
        return "고졸"
    elif x <= 13:
        return "전문대졸"
    elif x <= 16:
        return "대졸"
    else:
        return "대학원졸"

# Derive the categorical parental-education column, one label per row.
df["X1_부모학력_범주"] = df["X1_부모학력"].map(map_parent_edu)

### Categorical

In [104]:
from scipy.stats import ttest_ind

treat_col = "T4_내일배움카드_참여경험"

# Encode the treatment labels as 1/0 only while the column still holds the raw
# labels. An unconditional .map() would turn already-encoded 1/0 values into
# NaN when this cell is re-run, silently emptying both comparison groups.
if not pd.api.types.is_numeric_dtype(df[treat_col]):
    df[treat_col] = df[treat_col].map({"Treated": 1, "Control": 0})

# Re-derive the parental-education category (idempotent) before the group
# snapshots are taken, so this cell does not rely on the column already existing.
df["X1_부모학력_범주"] = df["X1_부모학력"].apply(map_parent_edu)

# Row subsets for participants (1) and non-participants (0).
group1 = df[df[treat_col] == 1]
group0 = df[df[treat_col] == 0]


def mark(p):
    """Return significance stars for a p-value.

    '***' for p < 0.001, '**' for p < 0.01, '*' for p < 0.05, '' otherwise
    (a NaN p-value also yields '').

    Defined at cell level rather than inside the loop below, so it exists
    even when the loop body never runs.
    """
    if p < 0.001:
        return '***'
    elif p < 0.01:
        return '**'
    elif p < 0.05:
        return '*'
    else:
        return ''


cat_vars = [
    "X1_성별", "X1_연령대", "X1_부모소득구간", "X2_학교유형_4년제이상",
    "X2_설립유형_국공립", "X2_소재지_지방", "X2_전공그룹", "X1_부모학력_범주"
]

# One dict per table row; this cell resets the list.
results = []

for var in cat_vars:
    # One indicator column per level of the categorical variable.
    dummies = pd.get_dummies(df[var], prefix=var)
    df_dummies = pd.concat([df[[treat_col]], dummies], axis=1)

    # The group split does not depend on the dummy column, so do it once per variable.
    g1 = df_dummies[df_dummies[treat_col] == 1]
    g0 = df_dummies[df_dummies[treat_col] == 0]

    for col in dummies.columns:
        # Share of each group falling in this level, in percent.
        mean1 = g1[col].mean() * 100
        mean0 = g0[col].mean() * 100
        # Welch's t-test (unequal variances) on the 0/1 indicator.
        t_stat, p_val = ttest_ind(g1[col].astype(int), g0[col].astype(int), equal_var=False)

        results.append({
            "변수": col,
            "참여자 (%)": round(mean1, 2),
            "비참여자 (%)": round(mean0, 2),
            "t": f"{round(t_stat, 2)}{mark(p_val)}"
        })


### Binary

In [105]:
# Columns compared as percentages (group mean x 100) between the two groups.
binary_cols = ["X3_대학_재학_중_일자리_경험의_유무", "X1_혼인여부"]

for col in binary_cols:
    treated_vals, control_vals = group1[col], group0[col]
    # Welch's t-test (unequal variances) between participants and non-participants.
    welch = ttest_ind(treated_vals, control_vals, equal_var=False)

    row = {
        # Underscores become spaces for display; a prettier name mapping could be used instead.
        "변수": col.replace("_", " "),
        "참여자 (%)": round(treated_vals.mean() * 100, 2),
        "비참여자 (%)": round(control_vals.mean() * 100, 2),
        "t": f"{round(welch.statistic, 2)}{mark(welch.pvalue)}",
    }
    results.append(row)

### Numeric / Continuous

- Y 변수 분포 비교 추가

In [106]:
# Outcome variable: compare the raw group means and append one row to the table.
# NOTE(review): the means are stored under the "(%)" keys without the x100
# scaling used for the binary columns — confirm this is intended.
y_col = "Y_임금근로자_첫_일자리"

y_treated, y_control = group1[y_col], group0[y_col]
# Welch's t-test (unequal variances) on the outcome.
y_test = ttest_ind(y_treated, y_control, equal_var=False)

y_row = {
    "변수": y_col,
    "참여자 (%)": round(y_treated.mean(), 2),
    "비참여자 (%)": round(y_control.mean(), 2),
    "t": f"{round(y_test.statistic, 2)}{mark(y_test.pvalue)}",
}
results.append(y_row)

In [107]:
# Continuous / count covariates: compare raw group means.
# NOTE(review): the original trailing comment mentioned the raw parental-education
# value and the outcome variable, but neither is in this list.
cont_vars = [
    "X1_졸업평점점수",
    "X3_진로교육_참여개수",
    "X3_취업준비활동_참여개수",
    "X3_어학연수_경험_횟수",
    "X3_자격증_소지_개수",
]

for col in cont_vars:
    treated_vals, control_vals = group1[col], group0[col]
    # Welch's t-test (unequal variances) between the two groups.
    welch = ttest_ind(treated_vals, control_vals, equal_var=False)
    row = {
        "변수": col,
        "참여자 (%)": round(treated_vals.mean(), 2),
        "비참여자 (%)": round(control_vals.mean(), 2),
        "t": f"{round(welch.statistic, 2)}{mark(welch.pvalue)}",
    }
    results.append(row)

In [108]:
# Assemble the collected row dicts into one table; the bare name on the last
# line makes the notebook render it.
summary_df = pd.DataFrame(data=results)
summary_df

Unnamed: 0,변수,참여자 (%),비참여자 (%),t
0,X1_성별_0,39.36,34.93,2.43*
1,X1_성별_1,60.64,65.07,-2.43*
2,X1_연령대_20_24,16.63,17.26,-0.46
3,X1_연령대_25_29,60.39,59.35,0.57
4,X1_연령대_30_34,2.93,2.46,0.76
5,X1_연령대_Missing,20.05,20.93,-0.58
6,X1_부모소득구간_Missing,32.89,31.25,0.93
7,X1_부모소득구간_고소득,20.42,22.67,-1.49
8,X1_부모소득구간_저소득,20.05,18.27,1.19
9,X1_부모소득구간_중소득,26.65,27.81,-0.7


In [109]:
# Write the summary table next to the input data. The "utf-8-sig" encoding
# prepends a BOM to the file.
out_dir = os.path.join(ROOT, "Data", "T4_Y1")
file_path = os.path.join(out_dir, "summary_T4.csv")
summary_df.to_csv(file_path, encoding="utf-8-sig", index=False)

In [111]:
# Sanity check: number of rows per value of the treatment column.
df["T4_내일배움카드_참여경험"].value_counts()

T4_내일배움카드_참여경험
0    5491
1     818
Name: count, dtype: int64