# 전체 vs 신속통합기획 집단 기초통계 및 분포 비교

본 노트북은 전체 정비사업 데이터에서 `신속통합기획` 컬럼 값이 존재(값이 비어있지 않고 NaN 아님)하는 사업을 "신속통합기획 그룹"(is_fast=1), 값이 비어있거나 NaN 인 사업을 "일반 사업 그룹"(is_fast=0)으로 구분하여 면적, 토지등소유자수, 세대수 관련 기초통계 및 분포/검정 차이를 분석합니다.

구성:
1. 환경설정 및 라이브러리
2. 데이터 불러오기/구조 확인/표준화
3. 결측 및 정제
4. 신속통합기획 그룹 정의(값 존재 여부 기반)
5. 표본 규모 비교
6. 기초 통계량 / 비교표
7. 분포(Hist/KDE, Box/Violin, CDF)
8. 로그 변환 및 왜도 개선
9. 이상치(IQR, Z-score)
10. 상관분석
11. 정규성/등분산성 검정
12. 그룹 간 통계 검정 (t-test / Mann-Whitney / KS)
13. 효과크기(Cohen's d, Cliff's delta)
14. 요약결과 테이블
15. 결과 저장


In [89]:
# 1. 환경설정 및 라이브러리 임포트
import os, sys, math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats

# (선택) pingouin 사용 가능 시 효과크기 / 추가 통계 확장
try:
    import pingouin as pg
    HAS_PINGOUIN = True
except ImportError:
    HAS_PINGOUIN = False

np.random.seed(42)

# -------------------- 한글 폰트 설정 (NanumGothic 우선) --------------------
import matplotlib.font_manager as fm
from pathlib import Path as _Path
import tempfile
import urllib.request

def _download_nanum(dest_dir: str):
    url = 'https://github.com/google/fonts/raw/main/ofl/nanumgothic/NanumGothic-Regular.ttf'
    local_path = _Path(dest_dir) / 'NanumGothic-Regular.ttf'
    try:
        urllib.request.urlretrieve(url, local_path)
        print(f"[FONT][DL] NanumGothic 다운로드 완료: {local_path}")
        fm.fontManager.addfont(str(local_path))
        fm._rebuild()
        return str(local_path)
    except Exception as e:
        print('[FONT][DL][WARN] NanumGothic 다운로드 실패:', e)
        return None

def select_korean_font(preferred=("NanumGothic", "Nanum Gothic", "나눔고딕"), allow_download=True):
    # available: {폰트이름: 경로}
    available = {f.name: f.fname for f in fm.fontManager.ttflist}
    # 1) 나눔 계열 탐색 (이름 매칭)
    for target in preferred:
        for name, path in available.items():
            if target.lower().replace(' ','') in name.lower().replace(' ',''):
                print(f"[FONT] Using '{name}' at {path}")
                return name, path
    # 1-추가) 다운로드 시도
    if allow_download:
        tmpdir = tempfile.gettempdir()
        dl_path = _download_nanum(tmpdir)
        if dl_path and _Path(dl_path).exists():
            # 재스캔
            available = {f.name: f.fname for f in fm.fontManager.ttflist}
            for target in preferred:
                for name, path in available.items():
                    if target.lower().replace(' ','') in name.lower().replace(' ',''):
                        print(f"[FONT] Using downloaded '{name}' at {path}")
                        return name, path
    # 2) Windows 기본 'Malgun Gothic'
    for fallback in ("Malgun Gothic", "맑은 고딕"):
        for name, path in available.items():
            if fallback.lower().replace(' ','') in name.lower().replace(' ',''):
                print(f"[FONT] Using fallback '{name}'")
                return name, path
    # 3) macOS 기본
    for fallback in ("AppleGothic",):
        for name, path in available.items():
            if fallback.lower().replace(' ','') in name.lower().replace(' ',''):
                print(f"[FONT] Using fallback '{name}'")
                return name, path
    print('[FONT][WARN] 한글 가능한 폰트를 찾지 못했습니다. 기본 Sans-Serif 사용')
    return 'Sans-Serif', None

KFONT, KFONT_PATH = select_korean_font()
plt.rcParams['font.family'] = KFONT
plt.rcParams['axes.unicode_minus'] = False

# FontProperties 객체 (직접 주입용)
from matplotlib.font_manager import FontProperties
FONT_PROP = FontProperties(fname=KFONT_PATH) if KFONT_PATH else None

def apply_korean_font(fig):
    """Figure 내 모든 Text 객체에 명시적으로 한국어 폰트 적용."""
    if FONT_PROP is None:
        return
    for txt in fig.findobj(match=mpl.text.Text):
        try:
            txt.set_fontproperties(FONT_PROP)
        except Exception:
            pass

# ---------------------------------------------------------------------------

sns.set_context('notebook')
sns.set_style('whitegrid')
print('[INFO] HAS_PINGOUIN =', HAS_PINGOUIN)
print('[INFO] Active Korean font =', KFONT, '| path =', KFONT_PATH)


[FONT] Using 'NanumGothic' at C:\Users\Woojin Cho\AppData\Local\Microsoft\Windows\Fonts\NanumGothicBold.ttf
[INFO] HAS_PINGOUIN = True
[INFO] Active Korean font = NanumGothic | path = C:\Users\Woojin Cho\AppData\Local\Microsoft\Windows\Fonts\NanumGothicBold.ttf


In [90]:
# 2. 원시 데이터 불러오기 (경로 자동 탐색 포함)
import glob
from pathlib import Path

# 노트북 실행 위치(보통 notebooks/)와 상위 디렉터리를 순회하며 outputs 위치 탐색
NOTEBOOK_CWD = Path().resolve()
SEARCH_ROOTS = [NOTEBOOK_CWD] + list(NOTEBOOK_CWD.parents)  # 현재 → 루트까지
TARGET_FILENAME = '주택재개발_DATA_full.csv'
DATA_PATH = None

# 1) 정형 경로 후보 생성 (notebooks/ 하위가 아니라 상위 outputs 폴더에 있으므로 ../outputs 우선)
relative_candidates = [
    Path('outputs')/TARGET_FILENAME,          # (노트북과 동일 폴더일 경우)
    Path('..')/'outputs'/TARGET_FILENAME,     # 일반적인 repo 구조 (../outputs/...)
    Path('../..')/'outputs'/TARGET_FILENAME,  # 혹시 더 상위
]
for cand in relative_candidates:
    if cand.exists():
        DATA_PATH = cand.resolve()
        break

# 2) 상위 디렉터리 루프에서 outputs/ 파일 직접 확인
if DATA_PATH is None:
    for root in SEARCH_ROOTS:
        cand = root / 'outputs' / TARGET_FILENAME
        if cand.exists():
            DATA_PATH = cand.resolve()
            break

# 3) 그래도 못 찾으면 glob 검색 (pattern: **/주택재개발_DATA_full*.csv)
if DATA_PATH is None:
    patterns = ['**/주택재개발_DATA_full*.csv']
    matches = []
    for pat in patterns:
        matches.extend(glob.glob(pat, recursive=True))
    # 필터: .ipynb_checkpoints 등 제외
    matches = [m for m in matches if '.ipynb_checkpoints' not in m]
    if matches:
        matches = sorted(matches, key=len)
        DATA_PATH = Path(matches[0]).resolve()

if DATA_PATH is None or (not DATA_PATH.exists()):
    raise FileNotFoundError('주택재개발_DATA_full.csv 파일을 찾을 수 없습니다. notebooks/ 상위 outputs 폴더 존재 여부를 확인하세요.')

print('[INFO] 선택된 DATA_PATH =', DATA_PATH)
raw = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
print('[LOAD] shape =', raw.shape)
print('[COLS]', list(raw.columns)[:15], '...')
raw.head(3)

[INFO] 선택된 DATA_PATH = C:\Users\Woojin Cho\Documents\GitHub\Urban-Refurbishing-Project_DATA-SET\outputs\주택재개발_DATA_full.csv
[LOAD] shape = (145, 55)
[COLS] ['사업번호', '신속통합기획', '자치구', '법정동', '운영구분', '진행단계', '상태', '토지등 소유자 수', '정비구역명칭', '정비구역위치', '정비구역면적(㎡)', '건축연면적(㎡)', '용도지역', '용도지구', '택지면적(㎡)'] ...


Unnamed: 0,사업번호,신속통합기획,자치구,법정동,운영구분,진행단계,상태,토지등 소유자 수,정비구역명칭,정비구역위치,...,조합설립인가(사업시행자 지정일),건축심의,사업시행인가_최초,사업시행인가_변경(최종),관리처분계획인가_최초,관리처분계획인가_변경(최종),이주시작일,이주종료일,착공,세대총합계
0,11200-100002000,,성동구,용답동,조합,일반분양승인,운영,627,용답동 주택재개발정비사업조합,성동구 용답동 108-1번지,...,2012-07-27,2015-12-24,2016-12-29,2023-10-12,2019-08-30,2022-09-08,,2019-11-01,2021-09-06,958
1,11740-900001027,1차선정구역,강동구,천호동,공공지원자,정비구역지정,운영,251,천호 A1-2구역,강동구 천호동 461-31,...,,,,,,,,,,781
2,11740-900001152,기존구역(신통추진),강동구,천호동,조합,조합설립인가,운영,189,천호동532-2번지 주택정비형 재개발사업,강동구 천호동 532-2,...,2024-11-08,,,,,,,,,728


In [91]:
# 3. 데이터 구조 확인 및 컬럼 표준화
# 한글 → 영문 스네이크케이스 매핑 정의 (요청 변수 포함 확장)
col_map = {
    '사업번호':'project_id',
    '신속통합기획':'fast_flag',
    '자치구':'district',
    '정비구역면적(㎡)':'area_sqm',          # 정비구역면적
    '정비구역면적':'area_sqm',               # 변형 대비
    '토지등 소유자 수':'owners',
    '토지등소유자수':'owners',
    '건축연면적':'building_gfa',             # 건축연면적
    '기존가구수':'existing_households',       # 기존가구수
    '건폐율':'coverage_ratio',               # % 또는 실수
    '용적률':'floor_area_ratio',             # % 또는 실수
    '지상층수':'floors',
    '구역지정최초':'first_designation_year', # 최초 지정 연도
    '임대세대총수':'rental_households',
    '세대총합계':'total_households',
    '세대수합계':'total_households',         # 변형 대비
}
# 존재하는 컬럼만 적용
apply_map = {k:v for k,v in col_map.items() if k in raw.columns}

df = raw.rename(columns=apply_map).copy()

# 공백/콤마/따옴표/% 제거 후 숫자 변환 대상
num_cols_raw = [
    'area_sqm','owners','building_gfa','existing_households',
    'coverage_ratio','floor_area_ratio','floors',
    'first_designation_year','rental_households','total_households'
]
for c in num_cols_raw:
    if c in df.columns:
        df[c] = (df[c].astype(str)
                   .str.replace(',','', regex=False)
                   .str.replace('%','', regex=False)
                   .str.replace('"','', regex=False)
                   .str.strip())
        # 빈 문자열을 NaN으로
        df[c] = df[c].replace({'':None, 'nan':None, 'NaN':None})
        df[c] = pd.to_numeric(df[c], errors='coerce')

# 파생: 공공임대비율 = 임대세대총수 / 세대총합계
if 'rental_households' in df.columns and 'total_households' in df.columns:
    denom = df['total_households'].replace({0:np.nan})
    df['rental_ratio'] = df['rental_households'] / denom

print('[INFO] 변환 후 dtypes:')
print(df[[c for c in apply_map.values() if c in df.columns]].dtypes.head(20))
if 'rental_ratio' in df.columns:
    print('[INFO] rental_ratio 생성 OK (count=', df['rental_ratio'].notna().sum(), ')')

print('[INFO] fast_flag 값 분포:')
if 'fast_flag' in df.columns:
    print(df['fast_flag'].value_counts(dropna=False).head())
else:
    print('fast_flag 없음')

df.head(5)

[INFO] 변환 후 dtypes:
project_id            object
fast_flag             object
district              object
area_sqm             float64
owners                 int64
coverage_ratio       float64
floor_area_ratio     float64
floors               float64
rental_households    float64
total_households     float64
dtype: object
[INFO] rental_ratio 생성 OK (count= 123 )
[INFO] fast_flag 값 분포:
fast_flag
NaN           120
1차선정구역         11
기존구역(신통추진)     10
2차선정구역          4
Name: count, dtype: int64


Unnamed: 0,project_id,fast_flag,district,법정동,운영구분,진행단계,상태,owners,정비구역명칭,정비구역위치,...,건축심의,사업시행인가_최초,사업시행인가_변경(최종),관리처분계획인가_최초,관리처분계획인가_변경(최종),이주시작일,이주종료일,착공,total_households,rental_ratio
0,11200-100002000,,성동구,용답동,조합,일반분양승인,운영,627,용답동 주택재개발정비사업조합,성동구 용답동 108-1번지,...,2015-12-24,2016-12-29,2023-10-12,2019-08-30,2022-09-08,,2019-11-01,2021-09-06,958.0,0.227557
1,11740-900001027,1차선정구역,강동구,천호동,공공지원자,정비구역지정,운영,251,천호 A1-2구역,강동구 천호동 461-31,...,,,,,,,,,781.0,0.174136
2,11740-900001152,기존구역(신통추진),강동구,천호동,조합,조합설립인가,운영,189,천호동532-2번지 주택정비형 재개발사업,강동구 천호동 532-2,...,,,,,,,,,728.0,0.105769
3,11290-100016013,,성북구,장위동,추진위원회,조합설립추진위원회승인,운영,392,장위3구역주택재개발정비사업조합,성북구 장위동 305,...,,,,,,,,,1078.0,0.170686
4,11305-900000028,,강북구,미아동,조합,조합설립인가,운영,1582,미아2재정비촉진구역,강북구 미아동 403번지,...,,,,,,,,,3519.0,0.17164


In [92]:
# 4. 결측치 및 기본 정제
# 새 코어 변수 리스트 구성 (요청 변수 포함)
core_vars = [c for c in [
    'owners',            # 토지등소유자수
    'area_sqm',          # 정비구역면적
    'building_gfa',      # 건축연면적
    'existing_households', # 기존가구수
    'coverage_ratio',    # 건폐율
    'floor_area_ratio',  # 용적률
    'floors',            # 지상층수
    'first_designation_year', # 구역지정최초 (연도)
    'rental_ratio'       # 공공임대비율
] if c in df.columns]

missing_report = df[core_vars].isna().sum()
print('[MISSING]\n', missing_report)
print('[MISSING] 비율:\n', (missing_report/len(df)).round(3))

# 음수/0 면적 제거 (면적이 있으면)
if 'area_sqm' in df.columns:
    before = len(df)
    df = df[(df['area_sqm'].isna()) | (df['area_sqm'] > 0)]
    print(f'[FILTER] area_sqm <=0 제거 {before} -> {len(df)}')

# 5. 신속통합기획 그룹 정의 (값 존재 여부 기반)
if 'fast_flag' in df.columns:
    df['fast_flag_str'] = df['fast_flag'].astype(str).str.strip()
    df['is_fast'] = (~df['fast_flag'].isna()) & (df['fast_flag_str']!='') & (df['fast_flag_str'].str.lower()!='nan')
else:
    df['is_fast'] = False

print('[FAST] is_fast TRUE count:', df['is_fast'].sum())

# 6. 표본 규모 비교
n_total = len(df)
n_fast = df['is_fast'].sum()
print(f'[SIZE] 전체 {n_total}, 신속통합기획 {n_fast} ({(n_fast/n_total if n_total else 0):.2%})')

# 7. 기초 통계량 계산 함수
from collections import OrderedDict

def basic_stats(series: pd.Series):
    s = series.dropna().astype(float)
    if len(s)==0:
        return {k:np.nan for k in ['count','mean','median','std','min','max','p25','p75','skew','kurt']}
    return OrderedDict([
        ('count', len(s)),
        ('mean', s.mean()),
        ('median', s.median()),
        ('std', s.std(ddof=1)),
        ('min', s.min()),
        ('max', s.max()),
        ('p25', s.quantile(0.25)),
        ('p75', s.quantile(0.75)),
        ('skew', s.skew()),
        ('kurt', s.kurt()),
    ])

stats_overall = {v: basic_stats(df[v]) for v in core_vars}
stats_fast = {v: basic_stats(df.loc[df['is_fast'], v]) for v in core_vars}
stats_nonfast = {v: basic_stats(df.loc[~df['is_fast'], v]) for v in core_vars}

print('[STATS][전체]')
for k,v in stats_overall.items():
    print(k, v)
print('\n[STATS][신속통합기획]')
for k,v in stats_fast.items():
    print(k, v)
print('\n[STATS][일반]')
for k,v in stats_nonfast.items():
    print(k, v)

# 8. 비교표 생성
rows = []
for v in core_vars:
    o = stats_overall[v]; f = stats_fast[v]; nf = stats_nonfast[v]
    rows.append({
        '변수': v,
        '전체_mean(포함)': o['mean'],
        '신속_mean': f['mean'],
        '일반_mean': nf['mean'],
        'mean_ratio_fast/overall': f['mean']/o['mean'] if o['mean'] else np.nan,
        'mean_ratio_fast/nonfast': f['mean']/nf['mean'] if nf['mean'] else np.nan,
        '전체_median(포함)': o['median'],
        '신속_median': f['median'],
        '일반_median': nf['median'],
        'median_diff_fast-nonfast': f['median']-nf['median'],
        '전체_std(포함)': o['std'],
        '신속_std': f['std'],
        '일반_std': nf['std']
    })
compare_df = pd.DataFrame(rows)
compare_df

[MISSING]
 owners               0
area_sqm             0
coverage_ratio       3
floor_area_ratio     1
floors              18
rental_ratio        22
dtype: int64
[MISSING] 비율:
 owners              0.000
area_sqm            0.000
coverage_ratio      0.021
floor_area_ratio    0.007
floors              0.124
rental_ratio        0.152
dtype: float64
[FILTER] area_sqm <=0 제거 145 -> 145
[FAST] is_fast TRUE count: 25
[SIZE] 전체 145, 신속통합기획 25 (17.24%)
[STATS][전체]
owners OrderedDict({'count': 145, 'mean': np.float64(699.8206896551724), 'median': np.float64(544.0), 'std': np.float64(592.6721230176926), 'min': np.float64(0.0), 'max': np.float64(3887.0), 'p25': np.float64(279.0), 'p75': np.float64(889.0), 'skew': np.float64(1.9314821434386014), 'kurt': np.float64(5.846938746918189)})
area_sqm OrderedDict({'count': 145, 'mean': np.float64(73060.64475862069), 'median': np.float64(60080.8), 'std': np.float64(59231.50646152301), 'min': np.float64(1929.2), 'max': np.float64(393729.0), 'p25': np.float64

Unnamed: 0,변수,전체_mean(포함),신속_mean,일반_mean,mean_ratio_fast/overall,mean_ratio_fast/nonfast,전체_median(포함),신속_median,일반_median,median_diff_fast-nonfast,전체_std(포함),신속_std,일반_std
0,owners,699.82069,805.16,677.875,1.150523,1.187771,544.0,695.0,524.0,171.0,592.672123,557.006409,599.720699
1,area_sqm,73060.644759,83930.4596,70796.1,1.148778,1.185524,60080.8,78371.0,56693.15,21677.85,59231.506462,51492.167352,60668.65754
2,coverage_ratio,28.895423,35.2156,27.544957,1.218726,1.278477,24.855,28.27,24.82,3.45,11.948193,16.405632,10.363061
3,floor_area_ratio,255.060625,262.8312,253.428151,1.030466,1.037103,247.23,253.33,246.57,6.76,52.510251,39.743014,54.81578
4,floors,25.685039,27.913043,25.192308,1.086743,1.107999,25.0,28.0,25.0,3.0,9.946445,12.670341,9.240137
5,rental_ratio,0.171901,0.174896,0.171136,1.017426,1.021969,0.170659,0.170361,0.170673,-0.000312,0.064574,0.035872,0.070166


In [93]:
# 9~15. 분포 / 변환 / 이상치 / 상관 / 검정 / 효과크기 / 요약 저장

import itertools
from pathlib import Path
from datetime import datetime

PLOT_DIR = Path('outputs/notebook_stats')
PLOT_DIR.mkdir(parents=True, exist_ok=True)

# 실행(세션) 단위 타임스탬프: 모든 이미지/시각화 파일명에 공통 부여
TS = datetime.now().strftime('%Y%m%d_%H%M%S')
print(f'[INFO] Visualization timestamp TS={TS}')

# 9. 히스토그램 & KDE (소표본/단일값 안전 처리)
for v in core_vars:
    if v not in df.columns:
        continue
    sub = df[['is_fast', v]].dropna()
    if len(sub) < 2:
        print(f'[HIST][SKIP] {v} 유효 값 < 2 (len={len(sub)}).')
        continue
    # 그룹별 카운트 / 고유값 체크
    grp_counts = sub.groupby('is_fast')[v].count()
    unique_total = sub[v].nunique()
    # KDE 가능 조건: (전체 유효값 >= 2) AND (전체 고유값 >= 2) AND (fast/일반 각자 2개 이상일 때만 안전)
    kde_ok = (unique_total >= 2) and all(grp_counts >= 2)
    if not kde_ok:
        reason = []
        if unique_total < 2: reason.append('unique_total<2')
        if any(grp_counts < 2): reason.append('group_size<2')
        print(f'[HIST] {v} -> KDE 비활성 ({";".join(reason)})')
    try:
        fig = plt.figure(figsize=(6,4))
        ax = plt.gca()
        sns.histplot(data=sub, x=v, hue='is_fast', kde=kde_ok, stat='density',
                     common_norm=False, bins=30, alpha=0.4)
        ax.set_title(f'분포(히스토그램+KDE): {v}')
        ax.set_xlabel(v)
        # 범례 라벨 명시적 조정
        handles, labels = ax.get_legend_handles_labels()
        if labels:
            # seaborn이 is_fast를 True/False로 표시할 수 있으므로 정렬 매핑
            lab_map = {'True': '신속', 'False': '일반', '1': '신속', '0': '일반'}
            labels = [lab_map.get(l, l) for l in labels]
            ax.legend(handles, labels)
        apply_korean_font(fig)
        plt.tight_layout()
        fig.savefig(PLOT_DIR / f'hist_{v}_{TS}.png', dpi=150)
        plt.close(fig)
    except Exception as e:
        print(f'[HIST][ERROR] {v}: {e}')
        plt.close('all')

# 10. Box & Violin
for v in core_vars:
    if v not in df.columns: continue
    sub = df[['is_fast', v]].dropna()
    if len(sub) == 0:
        print(f'[BOX][SKIP] {v} 값 없음')
        continue
    try:
        fig, axes = plt.subplots(1,2, figsize=(8,4))
        sns.boxplot(data=sub, x='is_fast', y=v, ax=axes[0])
        sns.violinplot(data=sub, x='is_fast', y=v, ax=axes[1], cut=0)
        axes[0].set_title(f'Box: {v}')
        axes[1].set_title(f'Violin: {v}')
        for ax in axes:
            ax.set_xlabel('신속통합기획 여부 (0=일반,1=신속)')
            ax.set_xticklabels(['0','1'])
        fig.suptitle(f'분포 비교 - {v}')
        apply_korean_font(fig)
        fig.tight_layout()
        fig.savefig(PLOT_DIR / f'box_violin_{v}_{TS}.png', dpi=150)
        plt.close(fig)
    except Exception as e:
        print(f'[BOX][ERROR] {v}: {e}')
        plt.close('all')

# 11. CDF 비교 + KS
cdf_rows = []
for v in core_vars:
    if v not in df.columns: continue
    data_fast = df.loc[df['is_fast'], v].dropna().sort_values()
    data_non = df.loc[~df['is_fast'], v].dropna().sort_values()
    if len(data_fast) == 0 or len(data_non) == 0:
        print(f'[CDF][SKIP] {v} fast({len(data_fast)}) / nonfast({len(data_non)}) 값 부족')
        continue
    cdf_fast = np.arange(1, len(data_fast)+1)/len(data_fast)
    cdf_non = np.arange(1, len(data_non)+1)/len(data_non)
    stat_ks, p_ks = stats.ks_2samp(data_fast, data_non)
    fig = plt.figure(figsize=(6,4))
    plt.plot(data_non, cdf_non, label='일반 CDF')
    plt.plot(data_fast, cdf_fast, label='신속 CDF')
    plt.title(f'CDF 비교 (KS p={p_ks:.3g}): {v}')
    plt.xlabel(v)
    plt.ylabel('누적비율')
    plt.legend()
    apply_korean_font(fig)
    plt.tight_layout()
    fig.savefig(PLOT_DIR / f'cdf_{v}_{TS}.png', dpi=150)
    plt.close(fig)
    cdf_rows.append({'var':v,'ks_stat':stat_ks,'ks_p':p_ks})

# 12. 로그 변환 후 왜도
for v in core_vars:
    if v in df.columns and (df[v] > 0).sum() > 0:
        df[f'log1p_{v}'] = np.log1p(df[v].clip(lower=0))

skew_compare = []
for v in core_vars:
    if v in df.columns and f'log1p_{v}' in df.columns:
        skew_before = df[v].dropna().skew()
        skew_after = df[f'log1p_{v}'].dropna().skew()
        skew_compare.append({'var':v,'skew_before':skew_before,'skew_after_log1p':skew_after})
skew_compare_df = pd.DataFrame(skew_compare)

# 13. 이상치 탐지(IQR, Z-score)
def outlier_iqr(s: pd.Series):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
    mask = (s < lower) | (s > upper)
    return mask, (lower, upper)

def outlier_zscore(s: pd.Series, thresh=3.0):
    mu, sd = s.mean(), s.std(ddof=1)
    if sd == 0:
        return pd.Series(False, index=s.index), (mu, sd)
    z = (s - mu)/sd
    mask = z.abs() > thresh
    return mask, (mu, sd)

outlier_summary = []
for v in core_vars:
    if v not in df.columns: continue
    s = df[v].dropna()
    if len(s)==0: continue
    mask_iqr, fence = outlier_iqr(s)
    mask_z, params = outlier_zscore(s)
    outlier_summary.append({
        'var': v,
        'N': len(s),
        'iqr_outliers': mask_iqr.sum(),
        'fence_low': fence[0],
        'fence_high': fence[1],
        'z_outliers': mask_z.sum(),
        'mean': params[0],
        'std': params[1]
    })
outlier_df = pd.DataFrame(outlier_summary)

# 14. 상관분석 (전체 / 신속 / 일반)
corr_targets = [c for c in core_vars if c in df.columns]
if len(corr_targets) > 1:
    corr_all = df[corr_targets].corr(method='pearson')
    fig = plt.figure(figsize=(0.6*len(corr_targets)+2,0.6*len(corr_targets)+2))
    sns.heatmap(corr_all, annot=True, fmt='.2f', cmap='Blues')
    plt.title('상관행렬(전체)')
    apply_korean_font(fig)
    plt.tight_layout()
    fig.savefig(PLOT_DIR / f'corr_overall_{TS}.png', dpi=150)
    plt.close(fig)

    corr_fast = df.loc[df['is_fast'], corr_targets].corr(method='pearson')
    fig = plt.figure(figsize=(0.6*len(corr_targets)+2,0.6*len(corr_targets)+2))
    sns.heatmap(corr_fast, annot=True, fmt='.2f', cmap='Reds', vmin=-1, vmax=1)
    plt.title('상관행렬(신속)')
    apply_korean_font(fig)
    plt.tight_layout()
    fig.savefig(PLOT_DIR / f'corr_fast_{TS}.png', dpi=150)
    plt.close(fig)

    corr_non = df.loc[~df['is_fast'], corr_targets].corr(method='pearson')
    fig = plt.figure(figsize=(0.6*len(corr_targets)+2,0.6*len(corr_targets)+2))
    sns.heatmap(corr_non, annot=True, fmt='.2f', cmap='Greens', vmin=-1, vmax=1)
    plt.title('상관행렬(일반)')
    apply_korean_font(fig)
    plt.tight_layout()
    fig.savefig(PLOT_DIR / f'corr_nonfast_{TS}.png', dpi=150)
    plt.close(fig)

# 15~17. 검정 & 효과크기 (신속 vs 일반)

def decide_tests(x, y):
    def norm_ok(a):
        if len(a) < 3: return False
        if len(a) > 5000: return True
        stat, p = stats.shapiro(a)
        return p > 0.05
    nx, ny = len(x), len(y)
    norm_x, norm_y = norm_ok(x), norm_ok(y)
    lev_p = stats.levene(x, y, center='median').pvalue if nx>2 and ny>2 else 1.0
    equal_var = lev_p > 0.05
    if norm_x and norm_y:
        test_name = 't-test'
        t_res = stats.ttest_ind(x, y, equal_var=equal_var)
        stat_v, p_v = t_res.statistic, t_res.pvalue
    else:
        test_name = 'mannwhitney'
        mw = stats.mannwhitneyu(x, y, alternative='two-sided')
        stat_v, p_v = mw.statistic, mw.pvalue
    ks_stat, ks_p = stats.ks_2samp(x, y)
    return {
        'test': test_name,
        'stat': stat_v,
        'p_value': p_v,
        'equal_var': equal_var,
        'ks_stat': ks_stat,
        'ks_p': ks_p
    }

def cohens_d(x, y):
    x, y = np.asarray(x), np.asarray(y)
    nx, ny = len(x), len(y)
    if nx < 2 or ny < 2:
        return np.nan
    sx, sy = x.std(ddof=1), y.std(ddof=1)
    sp = math.sqrt(((nx-1)*sx**2 + (ny-1)*sy**2)/(nx+ny-2)) if (nx+ny-2) > 0 else np.nan
    if sp == 0: return 0.0
    return (x.mean() - y.mean())/sp

def cliffs_delta(x, y):
    x_sorted = np.sort(x); y_sorted = np.sort(y)
    nx, ny = len(x_sorted), len(y_sorted)
    i = j = more = less = 0
    while i < nx and j < ny:
        if x_sorted[i] > y_sorted[j]:
            more += nx - i; j += 1
        elif x_sorted[i] < y_sorted[j]:
            less += ny - j; i += 1
        else:
            i += 1; j += 1
    return (more - less)/(nx*ny)

results_rows = []
for v in core_vars:
    if v not in df.columns: continue
    x = df.loc[~df['is_fast'], v].dropna().values
    y = df.loc[df['is_fast'], v].dropna().values
    if len(x) < 3 or len(y) < 3:
        print(f'[STATS][SKIP] {v} 표본 부족 (nonfast={len(x)}, fast={len(y)})')
        continue
    tst = decide_tests(x, y)
    d = cohens_d(y, x)  # 신속 - 일반
    cd = cliffs_delta(y, x)
    results_rows.append({
        'var': v,
        'test': tst['test'],
        'p_value': tst['p_value'],
        'equal_var': tst['equal_var'],
        'ks_p': tst['ks_p'],
        'cohens_d': d,
        'cliffs_delta': cd,
        'mean_fast': y.mean(),
        'mean_nonfast': x.mean(),
        'diff_mean_fast-nonfast': y.mean()-x.mean()
    })

results_df = pd.DataFrame(results_rows)

interpret = []
for _, r in results_df.iterrows():
    d = abs(r['cohens_d'])
    if d < 0.2: lvl = 'small-or-less'
    elif d < 0.5: lvl = 'small'
    elif d < 0.8: lvl = 'medium'
    else: lvl = 'large'
    interpret.append(lvl)
results_df['effect_size_label'] = interpret

# 18. 요약 테이블 & 저장
summary_out = compare_df.merge(results_df[['var','p_value','cohens_d','cliffs_delta','effect_size_label']], left_on='변수', right_on='var', how='left').drop(columns=['var'])
summary_out

# 19. 산출물 저장
summary_out_path = PLOT_DIR / 'summary_stats.csv'
summary_out.to_csv(summary_out_path, index=False, encoding='utf-8-sig')
results_df.to_csv(PLOT_DIR / 'stat_tests.csv', index=False, encoding='utf-8-sig')
outlier_df.to_csv(PLOT_DIR / 'outliers.csv', index=False, encoding='utf-8-sig')
skew_compare_df.to_csv(PLOT_DIR / 'skew_compare.csv', index=False, encoding='utf-8-sig')
pd.DataFrame(cdf_rows).to_csv(PLOT_DIR / 'cdf_ks.csv', index=False, encoding='utf-8-sig')

print('[SAVE] summary_stats.csv, stat_tests.csv, outliers.csv, skew_compare.csv, cdf_ks.csv')
print('[DONE] 분석 완료 - 확장 변수 반영 (소표본 안전 처리 적용)')

[INFO] Visualization timestamp TS=20251008_225309


  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])
  ax.set_xticklabels(['0','1'])


[SAVE] summary_stats.csv, stat_tests.csv, outliers.csv, skew_compare.csv, cdf_ks.csv
[DONE] 분석 완료 - 확장 변수 반영 (소표본 안전 처리 적용)


In [94]:
# 추가: 변수별 통합 패널 시각화 (Hist+KDE, Box, CDF, LogHist)
from pathlib import Path
from datetime import datetime
PANEL_DIR = Path('outputs/notebook_stats')
PANEL_DIR.mkdir(parents=True, exist_ok=True)

# 동일 실행 세션 타임스탬프(앞선 셀에서 TS 정의되어 있으면 재사용, 없으면 새로 생성)
try:
    TS  # noqa: F821
except NameError:
    from datetime import datetime as _dt
    TS = _dt.now().strftime('%Y%m%d_%H%M%S')
print(f'[INFO] Panel timestamp TS={TS}')

panel_vars = [v for v in ['area_sqm','owners','total_households'] if v in df.columns]

def plot_variable_panel(var: str):
    import numpy as _np
    data_all = df[var].dropna()
    if len(data_all) == 0:
        print(f'[SKIP] {var} 값 없음')
        return
    data_fast = df.loc[df['is_fast'], var].dropna()
    # 준비
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    ax_hist, ax_box, ax_cdf, ax_log = axes.ravel()

    # 1) Hist + KDE
    sns.histplot(data=df, x=var, hue='is_fast', kde=True, stat='density', common_norm=False,
                 bins=30, alpha=0.35, ax=ax_hist)
    ax_hist.set_title(f'[Hist+KDE] {var}')
    ax_hist.set_xlabel(var)
    handles, labels = ax_hist.get_legend_handles_labels()
    if labels:
        ax_hist.legend(handles, ['False','True'], title='is_fast', loc='best')

    # 2) Box
    sns.boxplot(data=df, x='is_fast', y=var, ax=ax_box)
    ax_box.set_title(f'[Box] {var}')
    ax_box.set_xlabel('신속통합 여부')
    ax_box.set_xticklabels(['False','True'])

    # 3) CDF
    data_all_sorted = data_all.sort_values()
    cdf_all = _np.arange(1, len(data_all_sorted)+1)/len(data_all_sorted)
    ax_cdf.plot(data_all_sorted, cdf_all, label='전체', color='C0')
    if len(data_fast):
        data_fast_sorted = data_fast.sort_values()
        cdf_fast = _np.arange(1, len(data_fast_sorted)+1)/len(data_fast_sorted)
        ax_cdf.plot(data_fast_sorted, cdf_fast, label='신속', color='C1')
    from scipy import stats as _stats
    if len(data_fast) > 2:
        ks_stat, ks_p = _stats.ks_2samp(data_all, data_fast)
        ax_cdf.set_title(f'[CDF] {var} (KS p={ks_p:.3g})')
    else:
        ax_cdf.set_title(f'[CDF] {var}')
    ax_cdf.set_xlabel(var)
    ax_cdf.set_ylabel('누적비율')
    ax_cdf.legend(loc='best')

    # 4) Log Hist (if positive)
    if (data_all > 0).sum() > 0:
        log_col = _np.log1p(data_all.clip(lower=0))
        bins = 30
        ax_log.hist(log_col, bins=bins, alpha=0.6, label='전체', color='C0')
        if len(data_fast):
            log_fast = _np.log1p(data_fast.clip(lower=0))
            ax_log.hist(log_fast, bins=bins, alpha=0.6, label='신속', color='C1')
        ax_log.set_title(f'[Log1p Hist] {var}')
        ax_log.set_xlabel(f'log1p({var})')
        ax_log.legend()
    else:
        ax_log.text(0.5, 0.5, '양수 값 없음\n(Log 변환 생략)', ha='center', va='center')
        ax_log.set_axis_off()

    # 폰트 강제 적용
    apply_korean_font(fig)

    fig.suptitle(f'변수 패널: {var}', fontsize=14)
    fig.tight_layout(rect=[0,0,1,0.97])
    out_path = PANEL_DIR / f'panel_{var}_{TS}.png'
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f'[PANEL] 저장: {out_path}')

for v in panel_vars:
    plot_variable_panel(v)

print('[DONE] 변수별 패널 시각화 생성 완료 (타임스탬프 + 한글 폰트 강제 적용)')

[INFO] Panel timestamp TS=20251008_225309


  ax_box.set_xticklabels(['False','True'])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)


[PANEL] 저장: outputs\notebook_stats\panel_area_sqm_20251008_225309.png


  ax_box.set_xticklabels(['False','True'])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)


[PANEL] 저장: outputs\notebook_stats\panel_owners_20251008_225309.png


  ax_box.set_xticklabels(['False','True'])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])
  fig.tight_layout(rect=[0,0,1,0.97])


[PANEL] 저장: outputs\notebook_stats\panel_total_households_20251008_225309.png
[DONE] 변수별 패널 시각화 생성 완료 (타임스탬프 + 한글 폰트 강제 적용)


  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)
  fig.savefig(out_path, dpi=150)


In [95]:
# 추가: 선택 변수들에 대해 일괄 t-검정 수행 유틸리티
import math
from scipy import stats

# 기존 cohens_d 정의가 위에 있지만, 독립적으로 실행 가능하도록 재정의(이름 충돌 방지)
def cohens_d_independent(a, b):
    a = np.asarray(a); b = np.asarray(b)
    if len(a) < 2 or len(b) < 2:
        return np.nan
    sa, sb = a.std(ddof=1), b.std(ddof=1)
    sp = math.sqrt(((len(a)-1)*sa**2 + (len(b)-1)*sb**2)/(len(a)+len(b)-2)) if (len(a)+len(b)-2)>0 else np.nan
    if sp == 0:
        return 0.0
    return (a.mean() - b.mean())/sp

def hedges_g(d, n1, n2):
    # 작은 표본 보정
    df_ = n1 + n2 - 2
    if df_ <= 0 or np.isnan(d):
        return np.nan
    J = 1 - 3/(4*df_ - 1)
    return d * J

def fdr_bh(pvals):
    p = np.array(pvals, dtype=float)
    n = len(p)
    order = np.argsort(p)
    ranked = p[order]
    adj = ranked * n / (np.arange(n) + 1)
    adj = np.minimum.accumulate(adj[::-1])[::-1]
    # 다시 원래 순서로
    out = np.empty_like(adj)
    out[order] = np.clip(adj, 0, 1)
    return out

def run_ttests(df, variables, group_col='is_fast', alternative='two-sided', force_welch=False,
               min_per_group=3, log1p_if_skew=False, skew_threshold=1.0, apply_fdr=True):
    """
    선택 변수들에 대해 (신속 vs 일반) 독립표본 t-검정(or Welch) 수행.

    Parameters
    ----------
    df : DataFrame (group_col 포함)
    variables : list[str]
        검정할 수치형 변수 목록.
    group_col : str
        True/False (또는 0/1) 구분 컬럼.
    alternative : {'two-sided','greater','less'}
        scipy ttest_ind alternative 인자.
    force_welch : bool
        True면 등분산 가정 무시하고 Welch( equal_var=False ) 강제.
    min_per_group : int
        그룹별 최소 표본수 미만이면 스킵.
    log1p_if_skew : bool
        True면 |skew|>skew_threshold 이고 값이 모두 >=0인 경우 log1p 변환 후 검정.
    skew_threshold : float
        로그 변환 판단 임계치.
    apply_fdr : bool
        다중비교 FDR(BH) 보정 적용.
    """
    records = []
    for v in variables:
        if v not in df.columns:
            print(f'[TT][SKIP] {v} 컬럼 없음')
            continue
        g0 = df.loc[~df[group_col], v].dropna()
        g1 = df.loc[df[group_col], v].dropna()
        n0, n1 = len(g0), len(g1)
        if n0 < min_per_group or n1 < min_per_group:
            print(f'[TT][SKIP] {v} 표본 부족 (nonfast={n0}, fast={n1})')
            continue

        transformed = False
        series_all = pd.concat([g0, g1])
        if log1p_if_skew and series_all.min() >= 0:
            skew_val = series_all.skew()
            if abs(skew_val) > skew_threshold:
                g0 = np.log1p(g0)
                g1 = np.log1p(g1)
                transformed = True
        # 정규성/등분산성 체크 (정보 제공 목적)
        norm0 = stats.shapiro(g0)[1] if len(g0) <= 5000 else 1.0
        norm1 = stats.shapiro(g1)[1] if len(g1) <= 5000 else 1.0
        lev_p = stats.levene(g0, g1, center='median').pvalue
        equal_var = (lev_p > 0.05) and (not force_welch)
        t_res = stats.ttest_ind(g1, g0, equal_var=equal_var, alternative=alternative)
        d = cohens_d_independent(g1, g0)
        g = hedges_g(d, len(g1), len(g0))
        records.append({
            'var': v,
            'n_fast': n1,
            'n_nonfast': n0,
            'mean_fast': g1.mean(),
            'mean_nonfast': g0.mean(),
            'diff_fast-nonfast': g1.mean() - g0.mean(),
            't_stat': t_res.statistic,
            'p_value': t_res.pvalue,
            'levene_p': lev_p,
            'shapiro_p_fast': norm1,
            'shapiro_p_nonfast': norm0,
            'used_welch': not equal_var,
            'log1p_transformed': transformed,
            'cohens_d': d,
            'hedges_g': g
        })
    res = pd.DataFrame(records)
    if apply_fdr and not res.empty:
        res['p_fdr'] = fdr_bh(res['p_value'].values)
    return res

# 사용 예시:
# ttest_vars = ['area_sqm','owners','existing_households']
# ttest_res = run_ttests(df, ttest_vars, log1p_if_skew=True, skew_threshold=1.0, apply_fdr=True)
# ttest_res.sort_values('p_value')

print('[INFO] run_ttests 함수 준비 완료. 예시 실행은 주석 참고 후 셀 아래에서 별도로 호출하세요.')

[INFO] run_ttests 함수 준비 완료. 예시 실행은 주석 참고 후 셀 아래에서 별도로 호출하세요.


In [96]:
ttest_vars = ['area_sqm','owners','existing_households','coverage_ratio','floor_area_ratio','rental_ratio']
ttest_res = run_ttests(
    df,
    ttest_vars,
    log1p_if_skew=True,   # 왜도 크면 log1p 자동 변환
    skew_threshold=1.0,   # |skew| > 1 이면 변환
    apply_fdr=True,       # FDR(BH) 보정
    force_welch=False     # True면 항상 Welch
)
ttest_res.sort_values('p_value')

[TT][SKIP] existing_households 컬럼 없음


Unnamed: 0,var,n_fast,n_nonfast,mean_fast,mean_nonfast,diff_fast-nonfast,t_stat,p_value,levene_p,shapiro_p_fast,shapiro_p_nonfast,used_welch,log1p_transformed,cohens_d,hedges_g,p_fdr
2,coverage_ratio,25,117,3.495971,3.299314,0.196657,2.148282,0.040063,0.006091,0.002819227,2.575825e-06,True,True,0.589017,0.585856,0.200314
0,area_sqm,25,120,11.144415,10.806485,0.33793,1.750606,0.082158,0.054009,0.7850086,0.02387278,False,True,0.384868,0.382846,0.205396
3,floor_area_ratio,25,119,262.8312,253.428151,9.403049,0.812964,0.4176,0.952103,0.0260062,2.700319e-08,False,False,0.178858,0.177912,0.642261
1,owners,25,120,6.275157,6.100458,0.174698,0.65455,0.513809,0.93871,9.066217e-07,8.84963e-11,False,True,0.143902,0.143146,0.642261
4,rental_ratio,25,98,0.174896,0.171136,0.00376,0.258851,0.79619,0.122348,0.6459992,1.406745e-06,False,False,0.057999,0.057639,0.79619


In [97]:
ttest_res.to_csv('outputs/notebook_stats/ttest_results.csv', index=False, encoding='utf-8-sig')