In [None]:
import math
import os
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm

from sklearn.base import clone
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
# Create the data/ and models/ folders if they are missing;
# exist_ok=True makes re-running this cell harmless.
for folder in ("data", "models"):
    os.makedirs(folder, exist_ok=True)

In [3]:
# NOTE: joblib files are pickle-based, so loading them can execute arbitrary
# code. Only load artifacts that come from a trusted source.
split_parts = joblib.load("data/train_test_split.joblib")

# The saved object unpacks into four parts: train/test features and labels.
X_train, X_test, y_train, y_test = split_parts

preprocessor = joblib.load("models/preprocessor.joblib")

In [4]:
# Sanity check: dataset sizes and class balance (in percent) of both splits.
print("X_train shape:", X_train.shape)
print("X_test shape :", X_test.shape)

for header, labels in (
    ("y_train distribusi:", y_train),
    ("\ny_test distribusi:", y_test),
):
    print(header)
    print((labels.value_counts(normalize=True) * 100).round(2))

X_train shape: (1163451, 18)
X_test shape : (290863, 18)
y_train distribusi:
Arrest
False    74.09
True     25.91
Name: proportion, dtype: float64

y_test distribusi:
Arrest
False    74.09
True     25.91
Name: proportion, dtype: float64


### Definisikan model-model yang akan dipakai (baseline)

In [5]:
# Baseline estimators to compare, keyed by the name used in the results table.
# Every estimator that has a seed uses random_state=42 so runs are repeatable.
models = {
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        n_jobs=-1
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    ),
    "LightGBM": LGBMClassifier(
        random_state=42,
        n_estimators=200
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=200,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",  # avoids a warning in newer versions
        n_jobs=-1
    ),
    "KNN": KNeighborsClassifier(
        n_neighbors=5,
        # The neighbour search at predict time is the expensive part of KNN;
        # run it on all cores, like the other estimators above.
        n_jobs=-1
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42
    )
}

print("Model yang akan diuji:", list(models.keys()))

Model yang akan diuji: ['LogisticRegression', 'RandomForest', 'LightGBM', 'XGBoost', 'KNN', 'DecisionTree']


### Definisikan strategi sampling

In [6]:
# Resampling options to compare, keyed by the name used in the results table.
# A value of None stands for "no resampling".
sampling_strategies = dict(
    none=None,
    smote=SMOTE(random_state=42),
    smoteenn=SMOTEENN(random_state=42),
)

print("Strategi sampling:", list(sampling_strategies))

Strategi sampling: ['none', 'smote', 'smoteenn']


### Fungsi: evaluasi & hitung metrik klasifikasi

In [7]:
def evaluate_classification(y_true, y_pred, y_proba=None, positive_label=1):
    """Compute the standard metrics for a binary classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions.
    y_proba : array-like, optional
        Scores or probabilities for the positive class. ROC AUC is only
        computed when this is given. Note that ``roc_auc_score`` is called
        without ``positive_label``, so the scores must belong to the class
        scikit-learn itself treats as positive.
    positive_label : label, default=1
        Label counted as the positive class for precision, recall and F1.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"auc"``. ``"auc"`` is ``None`` when ``y_proba`` is not given or
        when ``roc_auc_score`` raises ``ValueError``.
    """
    # zero_division=0 returns the same 0.0 score as the default "warn" mode,
    # but without an UndefinedMetricWarning when a metric's denominator is
    # zero (e.g. the model never predicts the positive class).
    shared_kwargs = {"pos_label": positive_label, "zero_division": 0}

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, **shared_kwargs)
    rec = recall_score(y_true, y_pred, **shared_kwargs)
    f1 = f1_score(y_true, y_pred, **shared_kwargs)

    auc = None
    if y_proba is not None:
        try:
            auc = roc_auc_score(y_true, y_proba)
        except ValueError:
            # Deliberate best-effort: an undefined AUC must not discard the
            # other metrics, so it is reported as missing instead.
            auc = None

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc
    }

### Loop utama: Preprocessing + Sampling + Model dalam 1 pipeline

In [10]:
# Share of X_train used for fitting (e.g. 0.3 = 30% of the rows).
train_frac = 0.3

# Cap at 1.0, which means "use the whole training set".
train_frac = min(train_frac, 1.0)

if train_frac < 1.0:
    # Stratified draw so the subsample keeps the class balance of y_train;
    # the unused remainder of the split is discarded.
    X_train_small, _, y_train_small, _ = train_test_split(
        X_train, y_train,
        train_size=train_frac, stratify=y_train, random_state=42,
    )

    n_before, n_after = X_train.shape[0], X_train_small.shape[0]
    class_pct = (y_train_small.value_counts(normalize=True) * 100).round(2)

    print("=== Stratified subsampling dilakukan ===")
    print("Ukuran X_train awal :", n_before)
    print("Ukuran X_train_small:", n_after)
    print("\nDistribusi Arrest di y_train_small:")
    print(class_pct)

    # Later cells train on these two names.
    X_train_used, y_train_used = X_train_small, y_train_small
else:
    print("train_frac = 1.0, pakai seluruh X_train")
    X_train_used, y_train_used = X_train, y_train

=== Stratified subsampling dilakukan ===
Ukuran X_train awal : 1163451
Ukuran X_train_small: 349035

Distribusi Arrest di y_train_small:
Arrest
False    74.09
True     25.91
Name: proportion, dtype: float64


In [None]:
# One row per (sampling strategy, model) run; kept at module level so rows
# collected so far survive if the loop is interrupted.
results = []

# The metric functions need the positive class in the same form as the labels:
# True for a boolean target, 1 otherwise.
positive_label = True if y_train.dtype == 'bool' else 1

# Materialise the items so tqdm knows the totals for its progress bars.
sampling_items = list(sampling_strategies.items())
model_items = list(models.items())

for samp_name, sampler in tqdm(sampling_items, desc="Sampling strategies"):
    for model_name, model in tqdm(model_items, desc=f"Models ({samp_name})", leave=False):
        print(f"\nTraining model: {model_name} | Sampling: {samp_name}")

        start_time = time.perf_counter()

        try:
            # clone() gives every run fresh, unfitted copies. Without it the
            # shared preprocessor/sampler/model objects would be refit in
            # place by each pipeline, including the preprocessor loaded from
            # disk.
            steps = [('preprocess', clone(preprocessor))]
            if sampler is not None:
                steps.append(('sampling', clone(sampler)))
            steps.append(('model', clone(model)))
            clf = ImbPipeline(steps=steps)

            clf.fit(X_train_used, y_train_used)
            # Stop the clock right after fit so "train_time_sec" covers
            # training only, not prediction and scoring.
            train_time = time.perf_counter() - start_time

            y_pred = clf.predict(X_test)

            y_proba = None
            if hasattr(clf.named_steps['model'], "predict_proba"):
                # Column 1 is the second class in the model's class order,
                # which is the positive class for a binary False/True or 0/1
                # target.
                y_proba = clf.predict_proba(X_test)[:, 1]

            metrics = evaluate_classification(
                y_test, y_pred, y_proba, positive_label=positive_label
            )

            auc_text = "-" if metrics['auc'] is None else f"{metrics['auc']:.4f}"
            print(f"  -> F1={metrics['f1']:.4f} | "
                  f"Recall={metrics['recall']:.4f} | "
                  f"AUC={auc_text}")

            results.append({
                "model": model_name,
                "sampling": samp_name,
                "train_time_sec": train_time,
                **metrics
            })

        except Exception as e:
            # Best-effort: one failing combination must not abort the whole
            # comparison. Record it with NaN metrics and the error message.
            elapsed = time.perf_counter() - start_time
            print(f"  !! ERROR pada {model_name} dengan {samp_name}: {e}")
            results.append({
                "model": model_name,
                "sampling": samp_name,
                "train_time_sec": elapsed,
                "accuracy": np.nan,
                "precision": np.nan,
                "recall": np.nan,
                "f1": np.nan,
                "auc": np.nan,
                "error": str(e)
            })

results_df = pd.DataFrame(results)
results_df


=== Training model: LogisticRegression | Sampling: none ===

=== Training model: RandomForest | Sampling: none ===


KeyboardInterrupt: 

In [None]:
# Rebuild the results table from whatever rows are currently in `results`.
# This repeats the last step of the training-loop cell, so the rows collected
# so far can still be inspected if that loop was stopped early.
results_df = pd.DataFrame(results)
results_df