In [5]:
import os, io, base64, warnings
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

# Suppress every warning category process-wide so library warnings do not
# clutter the notebook/script output.
warnings.filterwarnings("ignore")

# Default directory for generated HTML reports; created at import time if it
# does not exist yet.
OUTPUT_DIR = "./eda_reports"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------------------
# Common utilities
# ------------------------------
def fig_to_base64():
    """Serialize the current pyplot figure to a base64-encoded PNG string.

    The active figure is laid out with ``tight_layout``, written as PNG into
    an in-memory buffer, and then closed. The return value is the PNG payload
    encoded as ASCII base64 text.
    """
    import matplotlib.pyplot as plt

    plt.tight_layout()
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format="png", bbox_inches="tight")
    plt.close()
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode("ascii")

# Column names removed from a table before analysis; passed to
# drop_excluded() by render_report_for_dataframe().
EXCLUDE_COLS = {"base_date", "create_at"}

# ------------------------------
# EDA functions
# ------------------------------
def drop_excluded(df: pd.DataFrame, exclude: set) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns named in ``exclude``.

    The remaining columns keep their original order. The input frame is not
    modified, and the result is an independent copy.
    """
    kept = list(filter(lambda name: name not in exclude, df.columns))
    trimmed = df[kept]
    return trimmed.copy()

def missingness_overview(df: pd.DataFrame):
    """Return the per-column fraction of missing values, highest first.

    The result is a Series indexed by column name whose values are the mean
    of ``isna()`` for that column, sorted in descending order.
    """
    missing_share = df.isna().mean()
    return missing_share.sort_values(ascending=False)

def make_missingness_chart(df: pd.DataFrame, top_k=30):
    """Draw a horizontal bar chart of the ``top_k`` most-missing columns.

    Returns the chart as a base64-encoded PNG string (via ``fig_to_base64``),
    or ``None`` when there are no columns or none of the selected columns
    has any missing value.
    """
    import matplotlib.pyplot as plt

    missing_share = df.isna().mean()
    worst = missing_share.sort_values(ascending=False).head(top_k)
    nothing_to_plot = worst.empty or (worst == 0).all()
    if nothing_to_plot:
        return None

    # Grow the figure with the number of bars, but never below 3 inches tall.
    height = max(3, 0.25 * len(worst))
    plt.figure(figsize=(8, height))
    # Plot in reversed order (lowest of the selection first).
    reversed_order = worst.iloc[::-1]
    reversed_order.plot(kind="barh")
    plt.title("Top Missingness by Column")
    return fig_to_base64()

def numeric_summary(df: pd.DataFrame):
    """Summarize the numeric columns of ``df``, one row per column.

    Each row carries the ``describe()`` statistics at the 1/5/25/50/75/95/99
    percentiles plus a ``missing_rate`` column. The column name is exposed as
    a regular ``column`` field. An empty DataFrame is returned when ``df``
    has no numeric columns.
    """
    numeric_names = [
        name for name in df.columns
        if pd.api.types.is_numeric_dtype(df[name])
    ]
    if len(numeric_names) == 0:
        return pd.DataFrame()

    numeric_part = df[numeric_names]
    quantiles = [.01, .05, .25, .5, .75, .95, .99]
    stats = numeric_part.describe(percentiles=quantiles).T
    stats["missing_rate"] = numeric_part.isna().mean()
    return stats.reset_index(names="column")

def cat_summary(df: pd.DataFrame, top_k=15, rare_threshold=0.001):
    """Summarize every non-numeric column of ``df``, one row per column.

    Args:
        df: Table to profile. Columns for which
            ``pd.api.types.is_numeric_dtype`` is false are treated as
            categorical.
        top_k: Number of most frequent values kept in ``top_values``.
        rare_threshold: Values whose share of rows is strictly below this
            fraction are listed in ``rare_counts``.

    Returns:
        A DataFrame with the fixed columns ``column``, ``cardinality``,
        ``dominant``, ``dominant_rate``, ``missing_rate``, ``top_values`` and
        ``rare_counts``. The schema is the same whether ``df`` has rows, has
        zero rows, or has no categorical columns at all (in which case the
        frame is empty but still carries these columns).
    """
    schema = [
        "column", "cardinality", "dominant", "dominant_rate",
        "missing_rate", "top_values", "rare_counts",
    ]
    cat_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
    rows = []
    for c in cat_cols:
        col = df[c]
        # dropna=False so missing values count toward the total and can be
        # reported as the dominant value.
        vc = col.value_counts(dropna=False)
        total = int(vc.sum())
        if total == 0:
            # Zero-row column: emit the full schema with neutral values so the
            # output has the same columns as the populated case.
            rows.append({
                "column": c,
                "cardinality": 0,
                "dominant": None,
                "dominant_rate": 0.0,
                "missing_rate": 0.0,
                "top_values": {},
                "rare_counts": {},
            })
            continue
        dom_val, dom_cnt = vc.index[0], int(vc.iloc[0])
        rare = vc[vc / total < rare_threshold]
        rows.append({
            "column": c,
            "cardinality": int(col.nunique(dropna=True)),
            "dominant": str(dom_val),
            "dominant_rate": round(dom_cnt / total, 4),
            "missing_rate": float(col.isna().mean()),
            "top_values": vc.head(top_k).to_dict(),
            "rare_counts": rare.to_dict(),
        })
    return pd.DataFrame(rows, columns=schema)

def make_histograms(df: pd.DataFrame, max_plots=6):
    """Build 30-bin histograms for the first ``max_plots`` numeric columns.

    Returns a list of ``{"title", "img"}`` dicts where ``img`` is the
    base64-encoded PNG produced by ``fig_to_base64``. Columns with no
    non-missing numeric values are skipped.
    """
    import matplotlib.pyplot as plt

    numeric_names = [
        name for name in df.columns
        if pd.api.types.is_numeric_dtype(df[name])
    ]
    charts = []
    for name in numeric_names[:max_plots]:
        values = pd.to_numeric(df[name], errors="coerce").dropna()
        if values.empty:
            continue
        title = f"Distribution • {name}"
        plt.figure()
        plt.hist(values, bins=30)
        plt.title(title)
        charts.append({"title": title, "img": fig_to_base64()})
    return charts

def make_categorical_bars(df: pd.DataFrame, max_plots=6):
    """Build bar charts of the 15 most frequent values per categorical column.

    Only the first ``max_plots`` non-numeric columns are charted. Values are
    cast to ``str`` before counting. Returns a list of ``{"title", "img"}``
    dicts where ``img`` is the base64-encoded PNG from ``fig_to_base64``.
    """
    import matplotlib.pyplot as plt

    categorical_names = [
        name for name in df.columns
        if not pd.api.types.is_numeric_dtype(df[name])
    ]
    charts = []
    for name in categorical_names[:max_plots]:
        as_text = df[name].astype(str)
        top_counts = as_text.value_counts().head(15)
        if top_counts.empty:
            continue
        title = f"Top categories • {name}"
        plt.figure(figsize=(6, 3))
        top_counts.plot(kind="bar")
        plt.title(title)
        charts.append({"title": title, "img": fig_to_base64()})
    return charts

def correlation_tables(df: pd.DataFrame, max_cols=50):
    """Return ``(pearson, spearman)`` correlation matrices of numeric columns.

    Only the first ``max_cols`` numeric columns are used. When fewer than two
    numeric columns are available, a pair of empty DataFrames is returned.
    """
    numeric_names = [
        name for name in df.columns
        if pd.api.types.is_numeric_dtype(df[name])
    ]
    selected = numeric_names[:max_cols]
    if len(selected) < 2:
        return pd.DataFrame(), pd.DataFrame()

    numeric_part = df[selected]
    pearson = numeric_part.corr("pearson")
    spearman = numeric_part.corr("spearman")
    return pearson, spearman

def detect_negative_suspects(df: pd.DataFrame):
    """Flag numeric columns that are mostly non-negative yet contain negatives.

    A column is reported when at least 90% of its non-missing values are
    ``>= 0`` and at least one value is ``< 0``. The result has one row per
    flagged column with ``column``, ``negative_count`` and ``ge0_ratio``
    (rounded to 3 decimals); it is an empty DataFrame when nothing is flagged.
    """
    numeric_names = [
        name for name in df.columns
        if pd.api.types.is_numeric_dtype(df[name])
    ]
    findings = []
    for name in numeric_names:
        values = pd.to_numeric(df[name], errors="coerce").dropna()
        if values.empty:
            continue
        non_negative_share = (values >= 0).mean()
        negative_total = int((values < 0).sum())
        if not (non_negative_share >= 0.9 and negative_total > 0):
            continue
        findings.append({
            "column": name,
            "negative_count": negative_total,
            "ge0_ratio": round(non_negative_share, 3),
        })
    return pd.DataFrame(findings)

def detect_outliers_iqr(df: pd.DataFrame, max_cols=30):
    """Count 1.5*IQR outliers in the first ``max_cols`` numeric columns.

    Columns with fewer than 30 non-missing values or a non-positive IQR are
    skipped. Each reported row holds ``column``, ``outliers`` (count outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``) and ``rate`` (count over non-missing
    values, rounded to 4 decimals). Columns without outliers are omitted.
    """
    numeric_names = [
        name for name in df.columns
        if pd.api.types.is_numeric_dtype(df[name])
    ]
    report = []
    for name in numeric_names[:max_cols]:
        values = pd.to_numeric(df[name], errors="coerce").dropna()
        if len(values) < 30:
            continue
        first_q, third_q = np.percentile(values, [25, 75])
        spread = third_q - first_q
        if spread <= 0:
            continue
        low_fence = first_q - 1.5 * spread
        high_fence = third_q + 1.5 * spread
        outside = (values < low_fence) | (values > high_fence)
        n_outside = int(outside.sum())
        if n_outside > 0:
            report.append({
                "column": name,
                "outliers": n_outside,
                "rate": round(n_outside / len(values), 4),
            })
    return pd.DataFrame(report)

# ------------------------------
# HTML Report Generator
# ------------------------------
# Inline <style> element inserted into the <head> of every generated report
# (body font/margins and bordered, full-width tables).
BASE_CSS = "<style>body{font-family:sans-serif;margin:20px;}h1{margin:0;}table{border-collapse:collapse;width:100%;margin:10px 0;}th,td{border:1px solid #ccc;padding:6px;}</style>"

def df_to_html(df: pd.DataFrame, escape: bool = True):
    """Render ``df`` as an HTML table without its index.

    Args:
        df: Frame to render; ``None`` or an empty frame yields a placeholder.
        escape: When true (default), ``<``, ``>`` and ``&`` in cell values and
            headers are HTML-escaped. Cell values come from the profiled data,
            so unescaped output lets values such as ``<NA>`` or arbitrary
            markup vanish from, or corrupt, the report. Pass ``False`` only
            for frames that deliberately contain trusted HTML.

    Returns:
        The HTML table markup, or ``"<em>No data</em>"`` when there is
        nothing to show.
    """
    if df is None or df.empty:
        return "<em>No data</em>"
    return df.to_html(index=False, escape=escape)

def render_report_for_dataframe(df_raw: pd.DataFrame, table_name: str, output_dir: str=OUTPUT_DIR):
    """Profile ``df_raw`` and write a self-contained HTML EDA report.

    The report contains missingness, numeric and categorical summaries,
    histograms and bar charts (embedded as base64 PNGs), the first rows of
    the Pearson and Spearman correlation matrices, the negative-value
    suspects, and the IQR outlier scan. Columns listed in ``EXCLUDE_COLS``
    are dropped before any analysis.

    Args:
        df_raw: Table to profile; it is not modified.
        table_name: Name shown in the report and used for the file name
            (``.`` is replaced by ``_``).
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        Path of the written ``<table_name>_eda.html`` file.
    """
    # Local import: the original code used ``html`` as a variable name, so the
    # module is not imported at file level.
    from html import escape

    df = drop_excluded(df_raw, EXCLUDE_COLS)
    miss_tbl = missingness_overview(df)
    miss_chart = make_missingness_chart(df)
    num_sum = numeric_summary(df)
    cat_sum = cat_summary(df)
    num_hists = make_histograms(df)
    cat_bars = make_categorical_bars(df)
    pearson, spearman = correlation_tables(df)
    neg_df = detect_negative_suspects(df)
    out_df = detect_outliers_iqr(df)

    # The table name and the column-derived chart titles are data, not markup.
    safe_name = escape(str(table_name))

    def chart_block(chart):
        # One titled, embedded PNG chart.
        return (
            f"<div><b>{escape(chart['title'])}</b><br>"
            f"<img src='data:image/png;base64,{chart['img']}' style='width:60%'></div>"
        )

    def labelled_head(corr):
        # df_to_html drops the index, so copy the row labels into a leading
        # column; otherwise the correlation rows are unidentifiable.
        head = corr.head().copy()
        head.insert(0, "column", head.index, allow_duplicates=True)
        return head

    parts = [
        f"<!DOCTYPE html><html><head><meta charset='utf-8'><title>{safe_name} EDA</title>{BASE_CSS}</head><body>",
        f"<h1>Expert EDA Report</h1><h2>{safe_name}</h2><small>Generated {datetime.now()}</small><hr>",
    ]

    # Name the index and the values explicitly rather than relying on the
    # default "index"/0 labels, which differ when df.columns has a name.
    miss_frame = miss_tbl.rename_axis("column").reset_index(name="missing_rate")
    parts.append("<h3>Missingness</h3>" + df_to_html(miss_frame))
    if miss_chart:
        parts.append(f"<img src='data:image/png;base64,{miss_chart}' style='width:60%'>")

    parts.append("<h3>Numeric Summary</h3>" + df_to_html(num_sum))
    parts.extend(chart_block(h) for h in num_hists)

    parts.append("<h3>Categorical Summary</h3>" + df_to_html(cat_sum))
    parts.extend(chart_block(b) for b in cat_bars)

    if not pearson.empty:
        parts.append("<h3>Pearson Correlation (head)</h3>" + df_to_html(labelled_head(pearson)))
    if not spearman.empty:
        parts.append("<h3>Spearman Correlation (head)</h3>" + df_to_html(labelled_head(spearman)))

    parts.append("<h3>Negative-suspect Columns</h3>" + df_to_html(neg_df))
    parts.append("<h3>Outlier (IQR) Scan</h3>" + df_to_html(out_df))

    # Sorted so the footer is identical across runs (set order is not stable).
    excluded = ", ".join(escape(str(name)) for name in sorted(EXCLUDE_COLS))
    parts.append(f"<hr><small>Excluded columns: {excluded}</small></body></html>")

    # Only the default OUTPUT_DIR is created at import time; a caller-supplied
    # directory may not exist yet.
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, f"{table_name.replace('.','_')}_eda.html")
    with open(path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    return path

# ------------------------------
# Sample data generation
# ------------------------------
def make_sample_table(seed: int, n_rows=10000) -> pd.DataFrame:
    """Generate a reproducible synthetic table for exercising the EDA report.

    The frame has ``n_rows`` rows with an id column, two date columns, three
    numeric columns and two categorical columns. After generation, 50 random
    ``amt`` values are negated and 80 random ``gender`` values are set to
    NaN, so ``n_rows`` must be at least 80.
    """
    rng = np.random.default_rng(seed)
    start = pd.to_datetime("2024-01-01")

    # The draws below are made in a fixed order so that a given seed always
    # produces the same table.
    account_ids = rng.integers(10000, 20000, n_rows).astype(str)
    base_offsets = rng.integers(0, 180, n_rows)
    create_offsets = rng.integers(0, 200, n_rows)
    ages = rng.normal(40, 10, n_rows)
    login_gaps = rng.integers(0, 365, n_rows)
    amounts = rng.gamma(2, 30, n_rows)
    genders = rng.choice(["M", "F", "U"], n_rows, p=[.48, .48, .04])
    platforms = rng.choice(["Android", "iOS", "ETC"], n_rows, p=[.7, .25, .05])

    sample = pd.DataFrame({
        "account_id": account_ids,
        "base_date": start + pd.to_timedelta(base_offsets, unit="D"),
        "create_at": start + pd.to_timedelta(create_offsets, unit="D"),
        "age": ages,
        "days_since_login": login_gaps,
        "amt": amounts,
        "gender": genders,
        "os": platforms,
    })

    # Inject negative values.
    negated_rows = rng.choice(n_rows, 50, replace=False)
    sample.loc[negated_rows, "amt"] *= -1
    # Inject missing values.
    blanked_rows = rng.choice(n_rows, 80, replace=False)
    sample.loc[blanked_rows, "gender"] = np.nan
    return sample

# ------------------------------
# Run
# ------------------------------
if __name__=="__main__":
    # Tables to profile; one HTML report is written per entry.
    TABLES=["feature_store_data_d"]
    for t in TABLES:
        # NOTE(review): get_data_impala is not defined in the visible part of
        # this file; presumably it is provided by an earlier notebook cell and
        # returns a pandas DataFrame for the given table name. Confirm.
        df=get_data_impala(t)
        out=render_report_for_dataframe(df,t,OUTPUT_DIR)
        print(f"[OK] Report saved → {out}")


[OK] Report saved → ./eda_reports/feature_store_data_d_eda.html
