In [None]:
import pandas as pd
import numpy as np

# Load the raw offline test split, then print its shape together with the
# mean and the sum of the "fraud" column.
df = pd.read_parquet("data/offline/test_raw.parquet")
fraud_col = df["fraud"]
print(df.shape, fraud_col.mean(), fraud_col.sum())

In [None]:
# Keep only rows whose "fraud" value equals 1; .copy() makes df_f an
# independent frame rather than a slice of df.
is_fraud = df["fraud"] == 1
df_f = df.loc[is_fraud].copy()
print("fraud-only:", df_f.shape)

In [None]:
from OFFLINE.features.client_state import add_client_state_feature

class Cfg:
    """Minimal config holder passed (as a class) to add_client_state_feature."""

    # Presumably the name of the client identifier column -- verify against
    # add_client_state_feature.
    client_col: str = "client_id"

# Run the client-state feature step on the fraud-only frame; Cfg supplies
# the client column name.
df_f_feat = add_client_state_feature(df_f, Cfg)

# Shape after the feature step, for a quick before/after comparison.
print("fraud-only + client_state:", df_f_feat.shape)

In [None]:
from OFFLINE.models.type_package import TypeDiscoveryConfig, make_type_discovery_matrix

# Type-discovery configuration shared by the GMM and HDBSCAN cells.
# Leaving feature_cols=None means "use every column except label/id columns".
# NOTE(review): no random seed is passed here -- confirm TypeDiscoveryConfig
# provides one by default, otherwise clustering results may vary between runs.
cfg = TypeDiscoveryConfig(
    # --- column roles ---
    label_col="fraud",
    id_cols=("client_id", "card_id", "merchant_id"),
    feature_cols=None,  # use all features (run once first, then narrow down)

    # --- preprocessing ---
    use_scaler=True,
    fillna_value=0.0,

    # --- GMM ---
    gmm_k_min=2,
    gmm_k_max=12,
    gmm_covariance_type="full",
    compute_silhouette=True,
    silhouette_sample_size=30000,

    # --- HDBSCAN ---
    hdb_min_cluster_size=50,
    hdb_min_samples=None,
)

# Build the model input matrix; also returns the feature column list and
# the scaler that are stored with the models later on.
X, feat_cols, scaler = make_type_discovery_matrix(df_f_feat, cfg)
print(X.shape, len(feat_cols))


In [None]:
from OFFLINE.models.type_package import fit_gmm_best, profile_clusters, save_type_package

# Fit the GMM via fit_gmm_best (presumably sweeping gmm_k_min..gmm_k_max from
# cfg -- verify in type_package) and keep the chosen model, its k, and the
# score table.
gmm, best_k, gmm_scores = fit_gmm_best(X, cfg)
print("best_k:", best_k)

# Bare last expression so the notebook renders the score table.
gmm_scores


In [None]:
# Hard cluster label and soft membership for every row of X.
labels_gmm = gmm.predict(X)
proba_gmm = gmm.predict_proba(X)  # soft membership (very useful for this project)

# Per-cluster profile computed from the hard labels.
profile_gmm = profile_clusters(df_f_feat, labels_gmm, feat_cols, topn=12)

# Metadata stored alongside the model.
report_gmm = {
    "best_k": int(best_k),
    "gmm_scores": gmm_scores.to_dict(orient="records"),
    "profile": profile_gmm,
}

# Save: hand model, scaler, feature list, config and report to save_type_package.
out_dir = "artifacts/type_package_v1/gmm"
save_type_package(
    out_dir=out_dir,
    algo="gmm",
    model=gmm,
    scaler=scaler,
    feature_cols=feat_cols,
    cfg=cfg,
    extra=report_gmm,
)
print("saved:", out_dir)


In [None]:
from OFFLINE.models.type_package import fit_hdbscan

# Fit HDBSCAN with the shared config; label -1 is treated as noise below.
hdb, labels_hdb = fit_hdbscan(X, cfg)

# Compute the noise mask once and reuse it for the count and the ratio.
noise_mask = labels_hdb == -1
print("unique labels:", np.unique(labels_hdb)[:20], " ...")
print("n_noise:", noise_mask.sum(), "noise_ratio:", noise_mask.mean())


In [None]:
# Snapshot of the HDBSCAN settings read from cfg, stored with the model.
hdb_params = {
    "min_cluster_size": cfg.hdb_min_cluster_size,
    "min_samples": cfg.hdb_min_samples,
    "metric": cfg.hdb_metric,
    "cluster_selection_method": cfg.hdb_cluster_selection_method,
}

# Per-cluster profile computed from the HDBSCAN labels.
profile_hdb = profile_clusters(df_f_feat, labels_hdb, feat_cols, topn=12)

report_hdb = {
    "hdbscan_params": hdb_params,
    "profile": profile_hdb,
}

# Hand model, scaler, feature list, config and report to save_type_package.
out_dir = "artifacts/type_package_v1/hdbscan"
save_type_package(
    out_dir=out_dir,
    algo="hdbscan",
    model=hdb,
    scaler=scaler,
    feature_cols=feat_cols,
    cfg=cfg,
    extra=report_hdb,
)
print("saved:", out_dir)


In [None]:
import numpy as np
import pandas as pd

from OFFLINE.models.type_package import fit_hdbscan, profile_clusters

import copy

# Sweep HDBSCAN's min_cluster_size and tabulate, for each candidate, the
# cluster count (excluding noise), the noise ratio and the largest cluster.
cands = [20, 30, 50, 80, 120, 200]
rows = []

for mcs in cands:
    # Work on a copy: `cfg2 = cfg` would only alias the shared config, so the
    # sweep would leave cfg.hdb_min_cluster_size at the last candidate and
    # make re-runs of other cells depend on whether this cell ran.
    cfg2 = copy.copy(cfg)
    cfg2.hdb_min_cluster_size = mcs

    # Sweep-local names so the notebook-level `hdb` model is not overwritten
    # by a throw-away candidate fit.
    hdb_sweep, labels_sweep = fit_hdbscan(X, cfg2)
    prof = profile_clusters(df_f_feat, labels_sweep, feat_cols, topn=8)

    # NOTE(review): cluster_counts may include the noise label (-1), in which
    # case "largest_cluster" can be the noise bucket -- confirm in
    # profile_clusters. default=0 avoids a ValueError on an empty mapping.
    rows.append({
        "min_cluster_size": mcs,
        "n_clusters_ex_noise": prof["n_clusters_ex_noise"],
        "noise_ratio": prof["noise_ratio"],
        "largest_cluster": max(prof["cluster_counts"].values(), default=0),
    })

# Least noisy settings first; ties broken by fewer clusters.
pd.DataFrame(rows).sort_values(["noise_ratio", "n_clusters_ex_noise"])
