## code lama

In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Optional, Tuple
from kmodes.kprototypes import KPrototypes

In [23]:
def prepare_for_kprototypes(
    df: pd.DataFrame,
    categorical_cols: Optional[List[str]] = None,
    numeric_cols: Optional[List[str]] = None,
    force_cat_as_str: bool = True,
) -> Tuple[pd.DataFrame, List[int], List[str], List[str]]:
    """Copy ``df`` and work out which column positions are categorical.

    Args:
        df: Input frame; it is copied and never modified.
        categorical_cols: Names of categorical columns. When ``None`` they are
            detected from the object/string/category dtypes of ``df``.
        numeric_cols: Names of numeric columns. When ``None`` they are
            detected from the numeric dtypes of ``df``.
        force_cat_as_str: When true, every categorical column that exists in
            the frame is cast to string and then stored as ``object``.

    Returns:
        ``(frame, cat_idx, categorical_cols, numeric_cols)`` where ``cat_idx``
        holds the positional index of each categorical column present in the
        frame. The two name lists are returned as given (or as detected).
    """
    frame = df.copy()

    # Fall back to dtype-based detection when the caller names no columns.
    if categorical_cols is None:
        detected_cat = frame.select_dtypes(include=["object", "string", "category"])
        categorical_cols = detected_cat.columns.tolist()
    if numeric_cols is None:
        detected_num = frame.select_dtypes(include=[np.number])
        numeric_cols = detected_num.columns.tolist()

    # Only names that really exist in the frame can be cast or positioned.
    present_cats = [name for name in categorical_cols if name in frame.columns]

    if force_cat_as_str:
        for name in present_cats:
            as_text = frame[name].astype("string")
            frame[name] = as_text.astype(object)

    cat_positions = [frame.columns.get_loc(name) for name in present_cats]
    return frame, cat_positions, categorical_cols, numeric_cols

In [24]:
def auto_gamma(df: pd.DataFrame, numeric_cols: List[str]) -> float:
    """Simple heuristic for gamma (the numeric-vs-categorical balancing weight).

    Gamma is the mean population standard deviation (``ddof=0``) of the listed
    numeric columns that exist in ``df``. The function returns ``1.0`` when no
    columns are given, none exist, or the mean is NaN or zero.
    """
    if not numeric_cols:
        return 1.0

    spreads = []
    for name in numeric_cols:
        if name in df.columns:
            spreads.append(df[name].astype(float).std(ddof=0))

    if not spreads:
        return 1.0

    mean_spread = float(np.nanmean(spreads))
    if np.isnan(mean_spread) or mean_spread == 0:
        return 1.0
    return mean_spread

In [None]:

def choose_k_by_elbow(k_vals: np.ndarray, costs: np.ndarray) -> int:
    """Pick the number of clusters at the elbow of a cost-vs-k curve.

    With three or more points, both axes are min-max normalised to [0, 1] and
    the elbow is the interior point farthest from the straight line joining
    the first and last points. With exactly two points the second k is
    returned. With a single point that k is returned.

    Args:
        k_vals: Candidate cluster counts, in the order they should be read
            along the curve (first and last entries are the chord endpoints).
        costs: Clustering cost for each entry of ``k_vals``.

    Returns:
        The selected k as a Python ``int``.
    """
    k_vals = np.asarray(k_vals, dtype=float)
    costs = np.asarray(costs, dtype=float)

    # With fewer than 2 points, default to the only (minimum) k.
    if k_vals.size < 2:
        return int(k_vals[0])

    if k_vals.size >= 3:
        # Normalise to [0, 1]; the epsilon guards against a zero range.
        k_norm = (k_vals - k_vals.min()) / (k_vals.max() - k_vals.min() + 1e-12)
        c_norm = (costs - costs.min()) / (costs.max() - costs.min() + 1e-12)
        p1 = np.array([k_norm[0], c_norm[0]])
        p2 = np.array([k_norm[-1], c_norm[-1]])
        v = p2 - p1
        v_len = np.linalg.norm(v) + 1e-12

        pts = np.stack([k_norm, c_norm], axis=1)
        rel = pts - p1
        # Scalar 2-D cross product written out explicitly: np.cross on
        # 2-element vectors is deprecated since NumPy 2.0 and emits a
        # DeprecationWarning.
        dists = np.abs(v[0] * rel[:, 1] - v[1] * rel[:, 0]) / v_len
        # The chord endpoints can never be the elbow.
        dists[0] = -np.inf
        dists[-1] = -np.inf
        idx = int(np.nanargmax(dists))
        return int(k_vals[idx])

    # Exactly two points: take the k reached by the (single) cost drop.
    deltas = -np.diff(costs)
    idx = int(np.nanargmax(deltas))
    return int(k_vals[idx + 1])

In [None]:
# Configuration and data loading for the single-sheet pipeline.
# Input workbook (a bare file name, so it is resolved against the working
# directory) and the sheet to read from it.
FILE_PATH  = r"kontrak-sewa-for-clustering.xlsx"  
SHEET_NAME = "Sheet1"  

# Columns handed to prepare_for_kprototypes as categorical features.
CATEGORICAL_COLS = [
    "BusinessType","LeaseYearStart","LeaseMonthStart","LeaseDayStart",
    "LeaseYearEnd","LeaseMonthEnd","LeaseDayEnd","TranCode",
    "ContractPeriod","ContractType","Building","UnitArea","UnitFloor"
]
# Columns coerced to float below and used by auto_gamma.
NUMERIC_COLS     = ["BuildingArea","LeaseDurationDays","LeaseDurationMonths","n_subunit"]


# Candidate cluster counts for the elbow search (2 through 6).
K_RANGE = range(2, 7)
# Columns removed right after loading, when present.
DROP_COLS = ["UnitNum"]


# Load & prepare the data
df_raw = pd.read_excel(FILE_PATH, sheet_name=SHEET_NAME)
df_raw = df_raw.drop(columns=[c for c in DROP_COLS if c in df_raw.columns])

# Force the numeric features to float; values that cannot be parsed become NaN.
for col in NUMERIC_COLS:
    if col in df_raw.columns:
        df_raw[col] = pd.to_numeric(df_raw[col], errors="coerce").astype(float)

# NOTE(review): the original comment here said that columns not listed above
# are treated as categorical, but `listed` is not referenced again in the
# visible notebook and nothing below adds unlisted columns to the categorical
# list. Columns outside both lists stay in df_raw and therefore end up in X
# without being part of cat_idx -- confirm the intended handling.
listed = set(CATEGORICAL_COLS) | set(NUMERIC_COLS)
# Keep only the configured names that actually exist in the loaded sheet.
CATEGORICAL_COLS = [c for c in CATEGORICAL_COLS if c in df_raw.columns]
NUMERIC_COLS     = [c for c in NUMERIC_COLS if c in df_raw.columns]


df_prep, cat_idx, cat_cols, num_cols = prepare_for_kprototypes(
    df_raw,
    categorical_cols=CATEGORICAL_COLS,
    numeric_cols=NUMERIC_COLS,
    force_cat_as_str=True,
)

# gamma is derived from the numeric columns only; X holds every column of df_prep.
gamma = auto_gamma(df_prep, num_cols)
X = df_prep.to_numpy()

In [27]:
# Elbow search: fit one K-Prototypes model per candidate k and record its cost.
elbow_rows = []
for k in K_RANGE:
    mk = KPrototypes(
        n_clusters=int(k), init="Huang", n_init=5, max_iter=100,
        random_state=42, gamma=gamma, verbose=0
    )
    # Only cost_ and n_iter_ are kept; the labels returned here are discarded.
    mk.fit_predict(X, categorical=cat_idx)
    elbow_rows.append({"k": int(k), "cost": float(mk.cost_), "n_iter": int(mk.n_iter_)})

# One row per k, ordered by k so choose_k_by_elbow reads the curve left to right.
elbow_tbl = pd.DataFrame(elbow_rows).sort_values("k").reset_index(drop=True)
print("\n=== Elbow (cost vs k) ===")
print(elbow_tbl)

# FINAL_K is the cluster count used by the final model in the next cell.
FINAL_K = choose_k_by_elbow(elbow_tbl["k"].to_numpy(), elbow_tbl["cost"].to_numpy())
print(f"\nFINAL_K yang dipilih= {FINAL_K}")


=== Elbow (cost vs k) ===
   k          cost  n_iter
0  2  2.098539e+17       5
1  3  1.022850e+17      17
2  4  6.317248e+16      11
3  5  4.417157e+16       6
4  6  3.802799e+16       8

FINAL_K yang dipilih= 3


  dists = np.abs(np.cross(v, pts - p1)) / v_len


In [28]:
# Final model: refit K-Prototypes at FINAL_K with the same settings used in the
# elbow search, then export the labelled rows to a new workbook.
model_final = KPrototypes(
    n_clusters=int(FINAL_K),
    init="Huang",
    n_init=5,
    max_iter=100,
    random_state=42,
    gamma=gamma,
    verbose=0,
)

labels = model_final.fit_predict(X, categorical=cat_idx)

# Attach the cluster label to a copy of the raw (pre-string-cast) data.
df_out = df_raw.copy()
df_out["cluster"] = labels  

# The price column is left out of the main output sheet...
if "CuryUnitPrice" in df_out.columns:
    df_out = df_out.drop(columns=["CuryUnitPrice"])

# ...and written to its own sheet instead, keyed by RowID (the row index of
# df_raw). When the column is absent an empty two-column frame is written.
price_sheet_df = (
    df_raw.reset_index()[["index","CuryUnitPrice"]]
    .rename(columns={"index": "RowID"})
    if "CuryUnitPrice" in df_raw.columns else
    pd.DataFrame(columns=["RowID","CuryUnitPrice"])
)

# Output file sits next to the input: "<stem>_with_cluster<suffix>".
in_path  = Path(FILE_PATH)
out_path = in_path.with_name(f"{in_path.stem}_with_cluster{in_path.suffix}")

with pd.ExcelWriter(out_path, engine="openpyxl") as ew:
    # Reuse the input sheet name for the main sheet; fall back to "Sheet1" if it is None.
    sheet_main = SHEET_NAME if SHEET_NAME is not None else "Sheet1"
    df_out.to_excel(ew, index=False, sheet_name=sheet_main)
    price_sheet_df.to_excel(ew, index=False, sheet_name="CuryUnitPrice")

print(f"Selesai. Hasil tersimpan di: {out_path}")
print("Fitur kategorikal:", cat_cols)
print("Fitur numerik    :", num_cols)
print(f"Jumlah cluster   : {FINAL_K}")

Selesai. Hasil tersimpan di: kontrak-sewa-for-clustering_with_cluster.xlsx
Fitur kategorikal: ['BusinessType', 'LeaseYearStart', 'LeaseMonthStart', 'LeaseDayStart', 'LeaseYearEnd', 'LeaseMonthEnd', 'LeaseDayEnd', 'TranCode', 'ContractPeriod', 'ContractType', 'Building', 'UnitArea', 'UnitFloor']
Fitur numerik    : ['BuildingArea', 'LeaseDurationDays', 'LeaseDurationMonths', 'n_subunit']
Jumlah cluster   : 3


## code lengkap 

In [3]:
from kmodes.kprototypes import KPrototypes
from pathlib import Path
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
from typing import List, Optional, Tuple, Dict, Any
from collections import Counter
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import silhouette_score

In [None]:
def _impute_numeric(
    s: pd.Series,
    strategy: str = "median"
) -> pd.Series:
    if strategy == "median":
        return s.fillna(s.median())
    elif strategy == "mean":
        return s.fillna(s.mean())
    elif strategy == "zero":
        return s.fillna(0)
    else:
        return s

def _clip_outliers_quantile(
    s: pd.Series,
    q_low: float = 0.01,
    q_high: float = 0.99
) -> pd.Series:
    lo, hi = s.quantile(q_low), s.quantile(q_high)
    if pd.isna(lo) or pd.isna(hi):
        return s
    return s.clip(lo, hi)

def _consolidate_rare_categories(
    s: pd.Series,
    min_ratio: float = 0.01,
    other_label: str = "OTHER"
) -> pd.Series:
    # bekerja pada string/object
    counts = s.value_counts(dropna=False)
    total = counts.sum()
    rare = set(counts[counts / max(total, 1) <= min_ratio].index)
    return s.apply(lambda v: other_label if v in rare else v)

def _scale_numeric(
    df_num: pd.DataFrame,
    method: str = "minmax"
):
    if df_num.empty:
        return df_num, None
    if method == "standard":
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(df_num.values),
        columns=df_num.columns,
        index=df_num.index
    )
    return scaled, scaler

def preprocess_for_kprototypes(
    df: pd.DataFrame,
    categorical_cols: List[str],
    numeric_cols: List[str],
    *,
    fill_num: str = "median",
    fill_cat_label: str = "MISSING",
    rare_thresh: float = 0.01,
    other_label: str = "OTHER",
    clip_outliers: bool = True,
    q_low: float = 0.01,
    q_high: float = 0.99,
    scale_method: str = "minmax",
    force_upper: bool = False,
    strip_space: bool = True
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Return a frame that is ready for K-Prototypes plus its artifacts.

    Numeric columns are coerced to numbers, optionally imputed (``fill_num``)
    and quantile-clipped (``clip_outliers``, ``q_low``, ``q_high``), then
    scaled together with ``scale_method``. Categorical columns are cast to
    string, optionally stripped / upper-cased, have missing and empty values
    replaced by ``fill_cat_label``, optionally have rare values merged into
    ``other_label`` (``rare_thresh``), and are finally stored as ``object``.

    Column names that do not exist in ``df`` are ignored. ``df`` itself is not
    modified.

    Returns:
        ``(frame, artifacts)`` where ``artifacts`` holds the fitted scaler
        (or ``None``), the column lists actually used, and the options given.
    """
    work = df.copy()

    # Keep only the columns that are really there.
    categorical_cols = [name for name in categorical_cols if name in work.columns]
    numeric_cols = [name for name in numeric_cols if name in work.columns]

    # Numeric: coerce, impute, clip, scale.
    scaler = None
    if numeric_cols:
        for name in numeric_cols:
            column = pd.to_numeric(work[name], errors="coerce")
            if fill_num != "none":
                column = _impute_numeric(column, strategy=fill_num)
            if clip_outliers:
                column = _clip_outliers_quantile(column, q_low=q_low, q_high=q_high)
            work[name] = column

        scaled_block, scaler = _scale_numeric(work[numeric_cols], method=scale_method)
        work[numeric_cols] = scaled_block.values

    # Categorical: cast to string, normalise text, fill missing, merge rare values.
    merge_rare = rare_thresh is not None and rare_thresh > 0
    for name in categorical_cols:
        text = work[name].astype("string")
        if strip_space:
            text = text.str.strip()
        if force_upper:
            text = text.str.upper()
        text = text.fillna(fill_cat_label)
        # Strings left empty after stripping count as missing too.
        text = text.replace({"": fill_cat_label})
        if merge_rare:
            text = _consolidate_rare_categories(
                text, min_ratio=rare_thresh, other_label=other_label
            )
        work[name] = text.astype(object)

    artifacts = {
        "scaler": scaler,
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
        "scale_method": scale_method,
        "fill_num": fill_num,
        "fill_cat_label": fill_cat_label,
        "rare_thresh": rare_thresh,
        "other_label": other_label,
        "clip_outliers": clip_outliers,
        "q_low": q_low,
        "q_high": q_high,
        "force_upper": force_upper,
        "strip_space": strip_space,
    }
    return work, artifacts


def prepare_for_kprototypes(
    df: pd.DataFrame,
    categorical_cols: Optional[List[str]] = None,
    numeric_cols: Optional[List[str]] = None,
    force_cat_as_str: bool = True,
    *,
    preprocess: bool = True,
    fill_num: str = "median",
    fill_cat_label: str = "MISSING",
    rare_thresh: float = 0.01,
    other_label: str = "OTHER",
    clip_outliers: bool = True,
    q_low: float = 0.01,
    q_high: float = 0.99,
    scale_method: str = "minmax",
    force_upper: bool = False,
    strip_space: bool = True,
) -> Tuple[pd.DataFrame, List[int], List[str], List[str]]:
    """Build the frame and column bookkeeping needed to fit K-Prototypes.

    Steps:
      1. Resolve the categorical / numeric column lists (detected from dtypes
         when ``None``).
      2. Move every listed integer-dtype column with at most 20 distinct
         values from the numeric list to the categorical list.
      3. Optionally cast categorical columns to string stored as ``object``.
      4. Optionally run ``preprocess_for_kprototypes`` with the remaining
         keyword options.

    Neither ``df`` nor the lists passed by the caller are modified; the
    function works on copies and returns the adjusted lists.

    Args:
        df: Input frame.
        categorical_cols: Categorical column names, or ``None`` to detect.
        numeric_cols: Numeric column names, or ``None`` to detect.
        force_cat_as_str: Cast categorical columns to string/object.
        preprocess: Run ``preprocess_for_kprototypes`` on the result.
        fill_num, fill_cat_label, rare_thresh, other_label, clip_outliers,
        q_low, q_high, scale_method, force_upper, strip_space: Forwarded
            unchanged to ``preprocess_for_kprototypes``.

    Returns:
        ``(frame, cat_idx, categorical_cols, numeric_cols)`` where ``cat_idx``
        holds the positional index of each categorical column present in the
        returned frame.
    """
    df_local = df.copy()

    # Work on private copies so the caller's lists are never mutated by the
    # integer re-classification below.
    if categorical_cols is None:
        categorical_cols = df_local.select_dtypes(
            include=["object", "string", "category"]
        ).columns.tolist()
    else:
        categorical_cols = list(categorical_cols)
    if numeric_cols is None:
        numeric_cols = df_local.select_dtypes(include=[np.number]).columns.tolist()
    else:
        numeric_cols = list(numeric_cols)

    # Low-cardinality integer columns behave like codes, not magnitudes:
    # treat them as categorical.
    for c in df_local.columns:
        if c not in numeric_cols and c not in categorical_cols:
            continue
        if pd.api.types.is_integer_dtype(df_local[c]) and df_local[c].nunique(dropna=True) <= 20:
            if c in numeric_cols:
                numeric_cols.remove(c)
            if c not in categorical_cols:
                categorical_cols.append(c)

    if force_cat_as_str:
        for c in categorical_cols:
            if c in df_local.columns:
                df_local[c] = df_local[c].astype("string").astype(object)

    if preprocess:
        df_local, _art = preprocess_for_kprototypes(
            df_local,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols,
            fill_num=fill_num,
            fill_cat_label=fill_cat_label,
            rare_thresh=rare_thresh,
            other_label=other_label,
            clip_outliers=clip_outliers,
            q_low=q_low,
            q_high=q_high,
            scale_method=scale_method,
            force_upper=force_upper,
            strip_space=strip_space,
        )

    cat_idx = [df_local.columns.get_loc(c) for c in categorical_cols if c in df_local.columns]
    return df_local, cat_idx, categorical_cols, numeric_cols


def auto_gamma(df: pd.DataFrame, numeric_cols: List[str]) -> float:
    """Heuristic gamma: mean population std of the listed numeric columns.

    Each column that exists in ``df`` is coerced to numbers (unparseable
    values become NaN) before its ``ddof=0`` standard deviation is taken.
    Returns ``1.0`` when no columns are given, none exist, or the mean is NaN
    or zero.
    """
    if not numeric_cols:
        return 1.0

    spreads = []
    for name in numeric_cols:
        if name not in df.columns:
            continue
        as_number = pd.to_numeric(df[name], errors="coerce").astype(float)
        spreads.append(as_number.std(ddof=0))

    if not spreads:
        return 1.0

    mean_spread = float(np.nanmean(spreads))
    if np.isnan(mean_spread) or mean_spread == 0:
        return 1.0
    return mean_spread


def choose_k_by_elbow(k_vals: np.ndarray, costs: np.ndarray) -> int:
    """Pick k at the elbow of the cost curve, printing the curve on the way.

    * One point: that k is returned.
    * Two points: the second k is returned.
    * Three or more: both axes are min-max normalised and the interior point
      farthest from the line through the first and last points is chosen.

    Every (k, cost) pair and the selected elbow are printed.
    """
    k_vals = np.asarray(k_vals, dtype=float)
    costs = np.asarray(costs, dtype=float)

    # Print every (k, cost) pair.
    print("Iterasi K dan Cost:")
    for k_value, cost_value in zip(k_vals, costs):
        print(f"  k = {k_value:.0f}, cost = {cost_value}")
    print("-----------------------------------")

    n_points = k_vals.size
    if n_points < 2:
        return int(k_vals[0])

    if n_points == 2:
        drops = -np.diff(costs)
        best = int(np.nanargmax(drops))
        print(f"Elbow (2 titik) pada k = {int(k_vals[best + 1])}")
        return int(k_vals[best + 1])

    # Three or more points: distance of each point to the end-to-end chord.
    k_norm = (k_vals - k_vals.min()) / (k_vals.max() - k_vals.min() + 1e-12)
    c_norm = (costs - costs.min()) / (costs.max() - costs.min() + 1e-12)

    start = np.array([k_norm[0], c_norm[0]])
    end = np.array([k_norm[-1], c_norm[-1]])
    chord = end - start
    chord_len = np.linalg.norm(chord) + 1e-12

    points = np.stack([k_norm, c_norm], axis=1)

    gaps = np.abs(
        chord[0] * (start[1] - points[:, 1]) -
        chord[1] * (start[0] - points[:, 0])
    ) / chord_len

    # The chord endpoints are never eligible.
    gaps[0] = -np.inf
    gaps[-1] = -np.inf

    best = int(np.nanargmax(gaps))
    print(f"Elbow ditemukan pada k = {int(k_vals[best])} dengan jarak = {gaps[best]}")
    return int(k_vals[best])



# Defaults for silhouette_kprototypes_mixed: the largest number of rows used
# for the pairwise dissimilarity matrix (larger inputs are randomly
# subsampled down to this size) and the seed used for that subsampling.
MAX_SILHOUETTE_SAMPLES = 2000
RANDOM_STATE_SIL = 42

def _split_cols_positions(n_total_cols: int, cat_idx: List[int]):
    cat_idx = np.array(sorted(cat_idx), dtype=int)
    all_idx = np.arange(n_total_cols, dtype=int)
    num_idx = np.setdiff1d(all_idx, cat_idx, assume_unique=True)
    return num_idx, cat_idx

def _kproto_pairwise_dissim_matrix(X: np.ndarray, cat_idx: List[int], gamma: float) -> np.ndarray:
    """Pairwise mixed dissimilarity between all rows of ``X``.

    The numeric part is the squared Euclidean distance over the non-categorical
    columns (negative round-off is clamped to zero). The categorical part is
    the number of mismatching categorical columns multiplied by ``gamma``.
    Returns an ``(m, m)`` float matrix, ``m`` being the number of rows.
    """
    n_rows, n_cols = X.shape
    num_pos, cat_pos = _split_cols_positions(n_cols, cat_idx)

    numeric_part = np.zeros((n_rows, n_rows), dtype=float)
    if num_pos.size > 0:
        Xn = X[:, num_pos].astype(float)
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
        sq_norms = np.sum(Xn * Xn, axis=1, keepdims=True)
        numeric_part = sq_norms + sq_norms.T - 2.0 * (Xn @ Xn.T)
        numeric_part[numeric_part < 0] = 0.0

    categorical_part = np.zeros((n_rows, n_rows), dtype=float)
    if cat_pos.size > 0:
        cat_block = X[:, cat_pos]
        for j in range(cat_block.shape[1]):
            column = cat_block[:, j]
            same = (column.reshape(-1, 1) == column.reshape(1, -1))
            categorical_part += (~same).astype(float)
        categorical_part *= float(gamma)

    return numeric_part + categorical_part

def _mode_series(values):
    cnt = Counter(values)
    return cnt.most_common(1)[0][0] if values else None

def _prototype_for_cluster(df_like: pd.DataFrame, labels: np.ndarray, cid: int,
                           cat_cols: List[str], num_cols: List[str]) -> pd.Series:
    sub = df_like.loc[labels == cid]
    proto = {}
    for c in num_cols:
        proto[c] = float(sub[c].astype(float).mean()) if c in sub.columns else np.nan
    for c in cat_cols:
        proto[c] = _mode_series(sub[c].tolist()) if c in sub.columns else None
    return pd.Series(proto)

def _mixed_dissim_point_to_proto(x_row: pd.Series, proto: pd.Series,
                                 cat_cols: List[str], num_cols: List[str], gamma: float) -> float:
    d_num = 0.0
    if num_cols:
        x_num = x_row[num_cols].astype(float).to_numpy()
        p_num = pd.to_numeric(proto[num_cols], errors="coerce").astype(float).to_numpy()
        diff = x_num - p_num
        d_num = float(np.nansum(diff * diff))
    d_cat = 0.0
    for c in cat_cols:
        d_cat += 0.0 if x_row.get(c, None) == proto.get(c, None) else 1.0
    return d_num + gamma * d_cat

def davies_bouldin_mixed(df_like: pd.DataFrame, labels: np.ndarray,
                         cat_cols: List[str], num_cols: List[str], gamma: float) -> float:
    """Davies-Bouldin-style index computed with the mixed dissimilarity.

    For every cluster a prototype is built and the cluster's scatter is the
    mean dissimilarity of its rows to that prototype (0 for clusters with a
    single row). Each cluster then contributes the largest
    ``(scatter_i + scatter_j) / separation_ij`` over the other clusters, pairs
    with zero separation being skipped. The index is the mean of those
    contributions. Returns NaN when fewer than two clusters are present.
    """
    labels = np.asarray(labels)
    cluster_ids = np.unique(labels)
    if cluster_ids.size < 2:
        return np.nan

    prototypes = {
        cid: _prototype_for_cluster(df_like, labels, cid, cat_cols, num_cols)
        for cid in cluster_ids
    }

    # Within-cluster scatter.
    scatter = {}
    for cid in cluster_ids:
        member_rows = np.where(labels == cid)[0]
        if member_rows.size <= 1:
            scatter[cid] = 0.0
        else:
            to_proto = [
                _mixed_dissim_point_to_proto(
                    df_like.iloc[row], prototypes[cid], cat_cols, num_cols, gamma
                )
                for row in member_rows
            ]
            scatter[cid] = float(np.mean(to_proto))

    def _separation(first: pd.Series, second: pd.Series) -> float:
        # Same mixed dissimilarity, applied to two prototypes.
        numeric_term = 0.0
        if num_cols:
            a = pd.to_numeric(first[num_cols], errors="coerce").astype(float).to_numpy()
            b = pd.to_numeric(second[num_cols], errors="coerce").astype(float).to_numpy()
            delta = a - b
            numeric_term = float(np.nansum(delta * delta))
        mismatch_count = 0.0
        for name in cat_cols:
            if first.get(name, None) == second.get(name, None):
                mismatch_count += 0.0
            else:
                mismatch_count += 1.0
        return numeric_term + gamma * mismatch_count

    # Worst (largest) similarity ratio per cluster.
    worst_ratios = []
    for ci in cluster_ids:
        worst = -np.inf
        for cj in cluster_ids:
            if ci == cj:
                continue
            gap = _separation(prototypes[ci], prototypes[cj])
            if gap == 0:
                continue
            ratio = (scatter[ci] + scatter[cj]) / gap
            if ratio > worst:
                worst = ratio
        worst_ratios.append(worst if np.isfinite(worst) else 0.0)

    return float(np.mean(worst_ratios))

from sklearn.metrics import silhouette_score  # pastikan ini ada di atas

def silhouette_kprototypes_mixed(
    df_like: pd.DataFrame,
    labels: np.ndarray,
    cat_idx: List[int],
    gamma: float,
    max_samples: int = MAX_SILHOUETTE_SAMPLES,
    random_state: int = RANDOM_STATE_SIL
) -> float:
    """Silhouette score computed on the mixed K-Prototypes dissimilarity.

    When ``df_like`` has more than ``max_samples`` rows, a random subset of
    that size (seeded by ``random_state``) is scored instead of the full data.
    Returns NaN when there are fewer than two rows or when the scored rows
    fall into fewer than two clusters.
    """
    labels = np.asarray(labels)
    n_samples = df_like.shape[0]
    if n_samples < 2:
        return np.nan

    rng = np.random.RandomState(random_state)

    features = df_like.to_numpy()
    labels_used = labels
    if n_samples > max_samples:
        keep = np.sort(rng.choice(n_samples, size=max_samples, replace=False))
        features = features[keep]
        labels_used = labels[keep]

    if np.unique(labels_used).size < 2:
        return np.nan

    # Mixed dissimilarity matrix for the rows being scored, made non-negative,
    # symmetric and zero on the diagonal before it is handed over as
    # "precomputed" distances.
    dissim = _kproto_pairwise_dissim_matrix(features, cat_idx, gamma)
    dissim = np.asarray(dissim, dtype=float)
    dissim[dissim < 0] = 0.0
    dissim = 0.5 * (dissim + dissim.T)
    np.fill_diagonal(dissim, 0.0)

    return float(silhouette_score(dissim, labels_used, metric="precomputed"))



In [None]:

# Per-sheet clustering pipeline: for every sheet in PERIOD_SHEETS, preprocess,
# pick k by the elbow rule, fit the final K-Prototypes model, score it, and
# write "<sheet>_clustered" and "<sheet>_metrics" sheets into one workbook.

# Options forwarded as keyword arguments to prepare_for_kprototypes.
PREPROCESS_OPTS = dict(
    preprocess=True,
    scale_method="minmax",
    clip_outliers=True,
    q_low=0.01, q_high=0.99,
    fill_num="median",
    fill_cat_label="MISSING",
    rare_thresh=0.01,
    other_label="OTHER",
    force_upper=True,
    strip_space=True
)

# main
processed_any = False  # set to True once at least one sheet is fully written
errors = []            # one {"sheet", "error"} record per sheet that failed

# Configuration
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
FILE_PATH = r"D:/DATA SKRIPSI/kontrak_sewa_bersih.xlsx"  # input file
PERIOD_SHEETS = ["Monthly_Fixed", "Daily_Fixed"]  # sheet names
CATEGORICAL_COLS = [
    "BusinessType","LeaseYearStart","LeaseMonthStart","LeaseDayStart",
    "LeaseYearEnd","LeaseMonthEnd","LeaseDayEnd","TranCode",
    "ContractPeriod","ContractType","Building","UnitArea","UnitFloor"
]
NUMERIC_COLS = ["BuildingArea","LeaseDurationDays","LeaseDurationMonths","n_subunit"]
K_RANGE = range(2, 7)
DROP_COLS = ["UnitNum"]

# Output workbook sits next to the input: "<stem>_clustered<suffix>".
in_path = Path(FILE_PATH)
out_path = in_path.with_name(f"{in_path.stem}_clustered{in_path.suffix}")

if out_path.exists():
    print(f"File output sudah ada, menghapus: {out_path}")
    out_path.unlink()

with pd.ExcelWriter(out_path, engine="openpyxl") as ew:
    for SHEET_NAME in PERIOD_SHEETS:
        try:
            print(f"\n=== Proses periode: {SHEET_NAME} ===")
            df_raw_full = pd.read_excel(FILE_PATH, sheet_name=SHEET_NAME)

            if df_raw_full.shape[0] < 2:
                raise ValueError("Baris data < 2")

            # Identifier columns are excluded from the features but kept in the output.
            df_raw = df_raw_full.drop(columns=["UnitID"], errors="ignore").copy()

            df_raw = df_raw.drop(columns=[c for c in DROP_COLS if c in df_raw.columns],
                                 errors="ignore")

            # Force the numeric features to float; unparseable values become NaN.
            for col in NUMERIC_COLS:
                if col in df_raw.columns:
                    df_raw[col] = pd.to_numeric(df_raw[col], errors="coerce").astype(float)

            CATEG_COLS_USED = [c for c in CATEGORICAL_COLS if c in df_raw.columns]
            NUMERIC_COLS_USED = [c for c in NUMERIC_COLS if c in df_raw.columns]

            # NOTE(review): columns that are neither listed nor dropped stay in
            # df_prep untouched and therefore enter X as non-categorical,
            # unscaled features (gamma and the Davies-Bouldin score only look
            # at num_cols). Confirm that this is intended.
            df_prep, cat_idx, cat_cols, num_cols = prepare_for_kprototypes(
                df_raw,
                categorical_cols=CATEG_COLS_USED,
                numeric_cols=NUMERIC_COLS_USED,
                force_cat_as_str=True,
                **PREPROCESS_OPTS
            )

            gamma = auto_gamma(df_prep, num_cols)
            X = df_prep.to_numpy()

            # Elbow search: one fit per candidate k, keeping only cost and iterations.
            elbow_rows = []
            for k in K_RANGE:
                mk = KPrototypes(
                    n_clusters=int(k),
                    init="Cao",
                    n_init=10,
                    max_iter=100,
                    random_state=42,
                    gamma=gamma
                )
                mk.fit_predict(X, categorical=cat_idx)
                elbow_rows.append({"k": int(k), "cost": float(mk.cost_), "n_iter": mk.n_iter_})

            elbow_tbl = pd.DataFrame(elbow_rows).sort_values("k")
            FINAL_K = choose_k_by_elbow(elbow_tbl["k"].to_numpy(),
                                        elbow_tbl["cost"].to_numpy())

            # Final model
            model_final = KPrototypes(
                n_clusters=FINAL_K,
                init="Cao",
                n_init=15,
                max_iter=100,
                random_state=42,
                gamma=gamma
            )
            labels = model_final.fit_predict(X, categorical=cat_idx)

            # Silhouette and Davies-Bouldin; a failing metric is reported and
            # stored as NaN instead of aborting the sheet.
            try:
                sil_score = silhouette_kprototypes_mixed(
                    df_prep,
                    labels,
                    cat_idx=cat_idx,
                    gamma=gamma,
                    max_samples=MAX_SILHOUETTE_SAMPLES,
                    random_state=RANDOM_STATE_SIL
                )
            except Exception as e:
                print(f"[WARN] Gagal hitung silhouette di sheet {SHEET_NAME}: {type(e).__name__} - {e}")
                sil_score = np.nan

            try:
                dbi_mixed = davies_bouldin_mixed(
                    df_prep,
                    labels,
                    cat_cols=cat_cols,
                    num_cols=num_cols,
                    gamma=gamma
                )
            except Exception as e:
                print(f"[WARN] Gagal hitung DBI di sheet {SHEET_NAME}: {type(e).__name__} - {e}")
                dbi_mixed = np.nan

            # Output: the full original sheet plus the cluster label.
            df_out = df_raw_full.copy()
            df_out["cluster"] = labels

            df_out.to_excel(ew, index=False, sheet_name=f"{SHEET_NAME}_clustered")

            # Metrics, including silhouette and Davies-Bouldin.
            pd.DataFrame({
                "Sheet": [SHEET_NAME],
                "FINAL_K": [FINAL_K],
                "Gamma": [gamma],
                "Rows": [df_prep.shape[0]],
                "Silhouette": [sil_score],
                "DaviesBouldin": [dbi_mixed],
            }).to_excel(ew, index=False, sheet_name=f"{SHEET_NAME}_metrics")

            processed_any = True

        except Exception as e:
            msg = f"{SHEET_NAME}: {type(e).__name__} - {e}"
            print("[ERROR]", msg)
            errors.append({"sheet": SHEET_NAME, "error": msg})

    # Record failures in the workbook. This also guarantees the workbook has at
    # least one sheet: if nothing was written, closing the writer would raise.
    if errors or not processed_any:
        pd.DataFrame(errors, columns=["sheet", "error"]).to_excel(
            ew, index=False, sheet_name="errors"
        )

if errors:
    failed = ", ".join(str(rec["sheet"]) for rec in errors)
    print(f"\n[SUMMARY] {len(errors)} sheet(s) failed: {failed}")


File output sudah ada, menghapus: D:\DATA SKRIPSI\kontrak_sewa_bersih_clustered.xlsx

=== Proses periode: Monthly_Fixed ===
Iterasi K dan Cost:
  k = 2, cost = 94559134698437.77
  k = 3, cost = 61845414922512.55
  k = 4, cost = 21833507347716.35
  k = 5, cost = 14122752653322.16
  k = 6, cost = 9275087068763.059
-----------------------------------
Elbow ditemukan pada k = 4 dengan jarak = 0.24942906269063833

=== Proses periode: Daily_Fixed ===
Iterasi K dan Cost:
  k = 2, cost = 304409666998.77313
  k = 3, cost = 172198676837.30658
  k = 4, cost = 80522341554.1603
  k = 5, cost = 57358960933.33165
  k = 6, cost = 39585334111.49639
-----------------------------------
Elbow ditemukan pada k = 4 dengan jarak = 0.24424759059880363
