In [2]:
import pandas as pd
import glob
import os

# ---------------------------------------------------------
# 0. Basic settings
# ---------------------------------------------------------

# Directory that is searched for input CSV files.
DATA_DIR = "../food"
# Number of rows read from a CSV per chunk.
CHUNKSIZE = 200_000

# Columns loaded from every CSV; all other columns are skipped.
USECOLS = [
    # columns used as grouping keys
    "sex", "age", "hour", "day", "card_tpbuz_nm_2",
    # columns that get summed
    "amt", "cnt",
]

# Collect every CSV directly under DATA_DIR, ordered by path.
csv_pattern = os.path.join(DATA_DIR, "*.csv")
files = glob.glob(csv_pattern)
files.sort()
print(f"총 파일 수: {len(files)}")

# ---------------------------------------------------------
# 1. Initialise the accumulator dictionaries
# ---------------------------------------------------------

# One independent dictionary per grouping column; each will map a group key
# to its running {"amt": ..., "cnt": ...} totals.
sex_stats, age_stats, category_stats, hour_stats, day_stats = (
    {} for _ in range(5)
)

# Running count of all rows read across every file.
total_rows = 0

# =========================================================
# 2. Iterate over files and chunks
# =========================================================

def accumulate_chunk(frame, group_col, target_dict):
    """Add one chunk's per-group ``amt``/``cnt`` sums into ``target_dict``.

    Args:
        frame: DataFrame chunk containing ``group_col``, ``amt`` and ``cnt``.
        group_col: Name of the column to group by.
        target_dict: Running totals mapping each group key to
            ``{"amt": ..., "cnt": ...}``; updated in place.
    """
    # NOTE: groupby's default dropna=True leaves out rows whose key is missing,
    # so such rows are counted in total_rows but not in the per-group totals.
    grouped = frame.groupby(group_col)[["amt", "cnt"]].sum()
    # Walk both columns side by side instead of using iterrows(): iterrows
    # builds one Series per row and coerces amt and cnt to a common dtype.
    for key, amt, cnt in zip(grouped.index, grouped["amt"], grouped["cnt"]):
        totals = target_dict.setdefault(key, {"amt": 0, "cnt": 0})
        totals["amt"] += amt
        totals["cnt"] += cnt


# Each grouping column paired with the dictionary that collects its totals.
group_targets = (
    ("sex", sex_stats),
    ("age", age_stats),
    ("card_tpbuz_nm_2", category_stats),
    ("hour", hour_stats),
    ("day", day_stats),
)

for file in files:
    print(f"Processing: {os.path.basename(file)}")

    # Stream the file in CHUNKSIZE-row pieces so it is never loaded whole.
    for chunk in pd.read_csv(file, usecols=USECOLS, chunksize=CHUNKSIZE):
        total_rows += len(chunk)

        for group_col, target_dict in group_targets:
            accumulate_chunk(chunk, group_col, target_dict)

# =========================================================
# 3. Convert to DataFrames + compute ratios
# =========================================================

def to_df(stats_dict):
    """Turn accumulated totals into a DataFrame with share-of-total columns.

    Args:
        stats_dict: Mapping of group key -> ``{"amt": ..., "cnt": ...}``.

    Returns:
        DataFrame indexed by group key with columns ``amt``, ``cnt``,
        ``amt_ratio`` (``amt`` divided by the sum of ``amt``) and
        ``cnt_ratio`` (``cnt`` divided by the sum of ``cnt``). An empty
        ``stats_dict`` yields an empty frame with those four columns.
    """
    if not stats_dict:
        # from_dict({}) produces a frame without columns, so the "amt" lookup
        # below would raise KeyError; return a well-formed empty result.
        return pd.DataFrame(
            columns=["amt", "cnt", "amt_ratio", "cnt_ratio"], dtype="float64"
        )

    df = pd.DataFrame.from_dict(stats_dict, orient="index")
    df["amt_ratio"] = df["amt"] / df["amt"].sum()
    df["cnt_ratio"] = df["cnt"] / df["cnt"].sum()
    return df

# Build one summary frame per grouping column, in a fixed order.
df_sex, df_age, df_category, df_hour, df_day = map(
    to_df,
    (sex_stats, age_stats, category_stats, hour_stats, day_stats),
)

print("편향성 집계 완료")


총 파일 수: 47
Processing: 202201.csv
Processing: 202202.csv
Processing: 202203.csv
Processing: 202204.csv
Processing: 202205.csv
Processing: 202206.csv
Processing: 202207.csv
Processing: 202208.csv
Processing: 202209.csv
Processing: 202210.csv
Processing: 202211.csv
Processing: 202212.csv
Processing: 202301.csv
Processing: 202302.csv
Processing: 202303.csv
Processing: 202304.csv
Processing: 202305.csv
Processing: 202306.csv
Processing: 202307.csv
Processing: 202308.csv
Processing: 202309.csv
Processing: 202310.csv
Processing: 202311.csv
Processing: 202312.csv
Processing: 202401.csv
Processing: 202402.csv
Processing: 202403.csv
Processing: 202404.csv
Processing: 202405.csv
Processing: 202406.csv
Processing: 202407.csv
Processing: 202408.csv
Processing: 202409.csv
Processing: 202410.csv
Processing: 202411.csv
Processing: 202412.csv
Processing: 202501.csv
Processing: 202502.csv
Processing: 202503.csv
Processing: 202504.csv
Processing: 202505.csv
Processing: 202506.csv
Processing: 202507.csv


In [3]:
# Output directory for the summary CSVs; created if it does not exist yet.
SAVE_DIR = "../eda_bias_result"
os.makedirs(SAVE_DIR, exist_ok=True)

# Destination path -> summary frame written to it.
outputs = {
    f"{SAVE_DIR}/bias_sex_amt_cnt.csv": df_sex,
    f"{SAVE_DIR}/bias_age_amt_cnt.csv": df_age,
    f"{SAVE_DIR}/bias_category_amt_cnt.csv": df_category,
    f"{SAVE_DIR}/bias_hour_amt_cnt.csv": df_hour,
    f"{SAVE_DIR}/bias_day_amt_cnt.csv": df_day,
}
for path, frame in outputs.items():
    # utf-8-sig prepends a BOM to the file.
    frame.to_csv(path, encoding="utf-8-sig")

print("CSV 저장 완료")

CSV 저장 완료
