In [2]:
import os
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve, auc

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

import matplotlib.pyplot as plt


In [3]:
df = pd.read_csv("framingham.csv")


print("Ukuran dataset:", df.shape)
print("\nJumlah missing value per kolom:\n", df.isnull().sum())


print("\nInfo dataset:")
print(df.info())

if "TenYearCHD" in df.columns:
    print("\nDistribusi label target (TenYearCHD):")
    print(df["TenYearCHD"].value_counts(normalize=True))


Ukuran dataset: (4240, 16)

Jumlah missing value per kolom:
 male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

Info dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     

In [4]:
from sklearn.impute import SimpleImputer

num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

imputer_num = SimpleImputer(strategy="median")
df[num_cols] = imputer_num.fit_transform(df[num_cols])

if len(cat_cols) > 0:
    imputer_cat = SimpleImputer(strategy="most_frequent")
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

print("Jumlah missing value setelah imputasi:\n", df.isnull().sum().sum())

Jumlah missing value setelah imputasi:
 0


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop("TenYearCHD", axis=1)
y = df["TenYearCHD"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42
)

print("Distribusi kelas di train:", np.bincount(y_train))
print("Distribusi kelas di test :", np.bincount(y_test))


Distribusi kelas di train: [2517  451]
Distribusi kelas di test : [1079  193]


In [6]:
# Cell A: cek distribusi label dan baseline majority
import numpy as np
from collections import Counter
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

print("Distribusi y_train:", Counter(y_train))
print("Distribusi y_test:", Counter(y_test))

# Baseline majority
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
y_dummy = dummy.predict(X_test)
print("\nBaseline (most_frequent) Accuracy:", accuracy_score(y_test, y_dummy))
print("Baseline (most_frequent) Classification Report:\n", classification_report(y_test, y_dummy, digits=3))


Distribusi y_train: Counter({0.0: 2517, 1.0: 451})
Distribusi y_test: Counter({0.0: 1079, 1.0: 193})

Baseline (most_frequent) Accuracy: 0.8482704402515723
Baseline (most_frequent) Classification Report:
               precision    recall  f1-score   support

         0.0      0.848     1.000     0.918      1079
         1.0      0.000     0.000     0.000       193

    accuracy                          0.848      1272
   macro avg      0.424     0.500     0.459      1272
weighted avg      0.720     0.848     0.779      1272



In [7]:
# Cell B: scaler yang benar (StandardScaler contoh)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # FIT hanya di train
X_test_scaled  = scaler.transform(X_test)        # TRANSFORM di test

# Cek shape
print(X_train.shape, '->', X_train_scaled.shape)
print(X_test.shape, '->', X_test_scaled.shape)

(2968, 15) -> (2968, 15)
(1272, 15) -> (1272, 15)


In [8]:
# Cell C: SVM tanpa class_weight (mungkin mirip dengan baseline dosen)
from sklearn.svm import SVC

svm_plain = SVC(kernel='linear', C=1.0, probability=True, random_state=42)  # gamma dihapus karena linear
svm_plain.fit(X_train_scaled, y_train)
y_pred_plain = svm_plain.predict(X_test_scaled)

print("SVM (no class_weight) Classification Report:\n", classification_report(y_test, y_pred_plain, digits=3))
print("Accuracy:", accuracy_score(y_test, y_pred_plain))
print("F1-score (pos=1):", f1_score(y_test, y_pred_plain, pos_label=1))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_plain))



SVM (no class_weight) Classification Report:
               precision    recall  f1-score   support

         0.0      0.848     1.000     0.918      1079
         1.0      0.000     0.000     0.000       193

    accuracy                          0.848      1272
   macro avg      0.424     0.500     0.459      1272
weighted avg      0.720     0.848     0.779      1272

Accuracy: 0.8482704402515723
F1-score (pos=1): 0.0
Confusion Matrix:
 [[1079    0]
 [ 193    0]]


In [10]:
# SMOTE + SVM
import numpy as np
from sklearn.metrics import classification_report, f1_score

# Terapkan SMOTE pada data training
smote = SMOTE(sampling_strategy=0.5, random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Jumlah setelah SMOTE:", X_resampled.shape, np.bincount(y_resampled))

# SVM dengan parameter yang lebih optimal
svm_smote = SVC(kernel='linear', C=0.5, random_state=42)  # Lebih simpel dan cepat
svm_smote.fit(X_resampled, y_resampled)
y_pred_smote = svm_smote.predict(X_test_scaled)

print("\n=== HASIL SVM + SMOTE ===")
print("Classification Report:\n", classification_report(y_test, y_pred_smote, digits=3))
print("F1-Score:", f1_score(y_test, y_pred_smote))

# Bandingkan dengan baseline SVM
print("\n=== PERBANDINGAN ===")
print("F1-Score Baseline SVM:", f1_score(y_test, y_pred_plain))
print("F1-Score SVM + SMOTE:", f1_score(y_test, y_pred_smote))
print("Improvement:", f1_score(y_test, y_pred_smote) - f1_score(y_test, y_pred_plain))

Jumlah setelah SMOTE: (3775, 15) [2517 1258]

=== HASIL SVM + SMOTE ===
Classification Report:
               precision    recall  f1-score   support

         0.0      0.868     0.930     0.898      1079
         1.0      0.350     0.212     0.265       193

    accuracy                          0.821      1272
   macro avg      0.609     0.571     0.581      1272
weighted avg      0.790     0.821     0.802      1272

F1-Score: 0.2645161290322581

=== PERBANDINGAN ===
F1-Score Baseline SVM: 0.0
F1-Score SVM + SMOTE: 0.2645161290322581
Improvement: 0.2645161290322581

=== HASIL SVM + SMOTE ===
Classification Report:
               precision    recall  f1-score   support

         0.0      0.868     0.930     0.898      1079
         1.0      0.350     0.212     0.265       193

    accuracy                          0.821      1272
   macro avg      0.609     0.571     0.581      1272
weighted avg      0.790     0.821     0.802      1272

F1-Score: 0.2645161290322581

=== PERBANDINGAN =

In [11]:
from sklearn.base import BaseEstimator
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

class SMOTE_IPF(BaseEstimator):
    """
    SMOTE-IPF: SMOTE dengan Iterative Partitioning Filter
    Optimized version untuk skripsi
    """
    
    def __init__(self, smote_k=5, ipf_k=5, max_iter=10, sampling_strategy=0.5,
                 random_state=None, verbose=False, remove_only_synthetic=True):
        # Parameter validation
        if smote_k < 1 or ipf_k < 1:
            raise ValueError("k values must be >= 1")
        if max_iter < 1:
            raise ValueError("max_iter must be >= 1")
            
        self.smote_k = smote_k
        self.ipf_k = min(ipf_k, smote_k)  # IPF k tidak boleh > SMOTE k
        self.max_iter = max_iter
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.verbose = verbose
        self.remove_only_synthetic = remove_only_synthetic
        
    def fit_resample(self, X, y):
        """
        Fit dan resample data menggunakan SMOTE-IPF
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y).astype(int).ravel()
        
        if len(np.unique(y)) < 2:
            raise ValueError("Need at least 2 classes")
            
        n_orig = X.shape[0]
        
        # Step 1: Apply SMOTE
        smote = SMOTE(
            k_neighbors=self.smote_k,
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state
        )
        
        X_res, y_res = smote.fit_resample(X, y)
        
        # Step 2: Track synthetic samples
        n_after_smote = X_res.shape[0]
        n_synthetic = n_after_smote - n_orig
        
        # Boolean mask untuk synthetic samples
        is_synthetic = np.zeros(n_after_smote, dtype=bool)
        if n_synthetic > 0:
            is_synthetic[n_orig:] = True
            
        if self.verbose:
            unique, counts = np.unique(y_res, return_counts=True)
            print(f"[SMOTE] Total: {n_after_smote}, Synthetic: {n_synthetic}")
            print(f"[SMOTE] Class distribution: {dict(zip(unique, counts))}")
        
        # Step 3: Iterative Partitioning Filter
        X_current = X_res.copy()
        y_current = y_res.copy()
        synthetic_current = is_synthetic.copy()
        
        for iteration in range(self.max_iter):
            if len(X_current) == 0:
                break
                
            # Train KNN classifier
            n_neighbors = min(self.ipf_k, len(X_current) - 1)
            if n_neighbors < 1:
                break
                
            clf = KNeighborsClassifier(n_neighbors=n_neighbors)
            clf.fit(X_current, y_current)
            y_pred = clf.predict(X_current)
            
            # Find misclassified samples
            misclassified = (y_pred != y_current)
            n_misclassified = misclassified.sum()
            
            if n_misclassified == 0:
                if self.verbose:
                    print(f"[IPF] Iter {iteration+1}: No misclassified -> STOP")
                break
            
            # Determine which samples to remove
            if self.remove_only_synthetic:
                # Only remove synthetic misclassified samples
                to_remove = misclassified & synthetic_current
                n_removed = to_remove.sum()
                
                if n_removed == 0:
                    if self.verbose:
                        print(f"[IPF] Iter {iteration+1}: Only original misclassified -> STOP")
                    break
            else:
                # Remove all misclassified samples
                to_remove = misclassified
                n_removed = to_remove.sum()
            
            # Update arrays
            keep_mask = ~to_remove
            X_current = X_current[keep_mask]
            y_current = y_current[keep_mask]
            synthetic_current = synthetic_current[keep_mask]
            
            if self.verbose:
                unique, counts = np.unique(y_current, return_counts=True)
                print(f"[IPF] Iter {iteration+1}: Removed {n_removed}/{n_misclassified}, "
                      f"Remaining: {len(X_current)}, "
                      f"Distribution: {dict(zip(unique, counts))}")
        
        if self.verbose:
            unique, counts = np.unique(y_current, return_counts=True)
            synthetic_remaining = synthetic_current.sum()
            print(f"[FINAL] Total: {len(X_current)}, Synthetic remaining: {synthetic_remaining}")
            print(f"[FINAL] Class distribution: {dict(zip(unique, counts))}")
        
        return X_current, y_current

In [12]:
# Test SMOTE-IPF yang sudah dioptimalkan
print("=== TESTING SMOTE-IPF ===")

# Inisialisasi SMOTE-IPF dengan parameter optimal
smote_ipf = SMOTE_IPF(
    smote_k=5, 
    ipf_k=3, 
    max_iter=10, 
    sampling_strategy=0.5,
    random_state=42, 
    verbose=True,
    remove_only_synthetic=True
)

# Apply SMOTE-IPF
X_ipf, y_ipf = smote_ipf.fit_resample(X_train_scaled, y_train)

print(f"\nDistribusi akhir SMOTE-IPF: {np.bincount(y_ipf)}")

# Train SVM dengan data SMOTE-IPF
svm_ipf = SVC(kernel='linear', C=1.0, random_state=42)
svm_ipf.fit(X_ipf, y_ipf)
y_pred_ipf = svm_ipf.predict(X_test_scaled)

print("\n=== HASIL SVM + SMOTE-IPF ===")
print("Classification Report:\n", classification_report(y_test, y_pred_ipf, digits=3))
print("F1-Score:", f1_score(y_test, y_pred_ipf))

# Perbandingan semua metode
print("\n=== PERBANDINGAN LENGKAP ===")
print("F1-Score Baseline SVM  :", f1_score(y_test, y_pred_plain))
print("F1-Score SVM + SMOTE   :", f1_score(y_test, y_pred_smote))
print("F1-Score SVM + SMOTE-IPF:", f1_score(y_test, y_pred_ipf))
print("Improvement SMOTE-IPF vs SMOTE:", f1_score(y_test, y_pred_ipf) - f1_score(y_test, y_pred_smote))

=== TESTING SMOTE-IPF ===
[SMOTE] Total: 3775, Synthetic: 807
[SMOTE] Class distribution: {np.int64(0): np.int64(2517), np.int64(1): np.int64(1258)}
[IPF] Iter 1: Removed 1/352, Remaining: 3774, Distribution: {np.int64(0): np.int64(2517), np.int64(1): np.int64(1257)}
[IPF] Iter 2: Only original misclassified -> STOP
[FINAL] Total: 3774, Synthetic remaining: 806
[FINAL] Class distribution: {np.int64(0): np.int64(2517), np.int64(1): np.int64(1257)}

Distribusi akhir SMOTE-IPF: [2517 1257]

=== HASIL SVM + SMOTE-IPF ===
Classification Report:
               precision    recall  f1-score   support

         0.0      0.870     0.927     0.897      1079
         1.0      0.352     0.223     0.273       193

    accuracy                          0.820      1272
   macro avg      0.611     0.575     0.585      1272
weighted avg      0.791     0.820     0.803      1272

F1-Score: 0.273015873015873

=== PERBANDINGAN LENGKAP ===
F1-Score Baseline SVM  : 0.0
F1-Score SVM + SMOTE   : 0.264516129032

In [15]:
print("=== EKSPERIMEN MENGURANGI BIAS ===")

# 1. Coba sampling_strategy lebih agresif (lebih balanced)
smote_ipf_balanced = SMOTE_IPF(
    smote_k=5, 
    ipf_k=3, 
    max_iter=10, 
    sampling_strategy=0.8,  # Lebih agresif
    random_state=42, 
    verbose=True
)

X_ipf_bal, y_ipf_bal = smote_ipf_balanced.fit_resample(X_train_scaled, y_train)

# 2. SVM dengan class_weight balanced
svm_balanced = SVC(kernel='linear', C=1.0, class_weight='balanced', random_state=42)
svm_balanced.fit(X_ipf_bal, y_ipf_bal)
y_pred_balanced = svm_balanced.predict(X_test_scaled)

print("\n=== HASIL SVM + SMOTE-IPF + CLASS_WEIGHT ===")
print("Classification Report:\n", classification_report(y_test, y_pred_balanced, digits=3))
print("F1-Score:", f1_score(y_test, y_pred_balanced))

# 3. Coba decision threshold tuning dengan probability
svm_prob = SVC(kernel='linear', C=1.0, probability=True, random_state=42)
svm_prob.fit(X_ipf, y_ipf)
y_prob = svm_prob.predict_proba(X_test_scaled)[:, 1]

# Tuning threshold untuk balance precision-recall
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
print("\n=== THRESHOLD TUNING ===")
for thresh in thresholds:
    y_pred_thresh = (y_prob >= thresh).astype(int)
    f1 = f1_score(y_test, y_pred_thresh)
    recall_1 = (y_pred_thresh[y_test == 1] == 1).sum() / (y_test == 1).sum()
    precision_1 = (y_test[y_pred_thresh == 1] == 1).sum() / max(1, (y_pred_thresh == 1).sum())
    print(f"Threshold {thresh}: F1={f1:.3f}, Recall={recall_1:.3f}, Precision={precision_1:.3f}")

# Hitung semua recall terlebih dahulu
recall_0_plain = (y_pred_plain[y_test == 0] == 0).sum() / (y_test == 0).sum() if (y_test == 0).sum() > 0 else 0
recall_1_plain = (y_pred_plain[y_test == 1] == 1).sum() / (y_test == 1).sum() if (y_test == 1).sum() > 0 else 0

recall_0_smote = (y_pred_smote[y_test == 0] == 0).sum() / (y_test == 0).sum()
recall_1_smote = (y_pred_smote[y_test == 1] == 1).sum() / (y_test == 1).sum()

recall_0_ipf = (y_pred_ipf[y_test == 0] == 0).sum() / (y_test == 0).sum()
recall_1_ipf = (y_pred_ipf[y_test == 1] == 1).sum() / (y_test == 1).sum()

recall_0_bal = (y_pred_balanced[y_test == 0] == 0).sum() / (y_test == 0).sum()
recall_1_bal = (y_pred_balanced[y_test == 1] == 1).sum() / (y_test == 1).sum()

# Fungsi untuk menentukan status bias
def get_bias_status(recall_0, recall_1):
    if recall_1 == 0:
        return "Ekstrem"
    diff = abs(recall_0 - recall_1)
    if diff > 0.3:
        return "Tinggi"
    elif diff > 0.1:
        return "Sedang"
    else:
        return "Rendah"

bias_plain = get_bias_status(recall_0_plain, recall_1_plain)
bias_smote = get_bias_status(recall_0_smote, recall_1_smote)
bias_ipf = get_bias_status(recall_0_ipf, recall_1_ipf)
bias_bal = get_bias_status(recall_0_bal, recall_1_bal)

# Perbandingan akhir LENGKAP
print("\n=== PERBANDINGAN BIAS (LENGKAP) ===")
print("Method                    | F1-Score | Recall-0 | Recall-1 | Bias")
print("-" * 68)
print(f"Baseline SVM              | {f1_score(y_test, y_pred_plain):.3f}    | {recall_0_plain:.3f}    | {recall_1_plain:.3f}    | {bias_plain}")
print(f"SVM + SMOTE               | {f1_score(y_test, y_pred_smote):.3f}    | {recall_0_smote:.3f}    | {recall_1_smote:.3f}    | {bias_smote}")
print(f"SVM + SMOTE-IPF           | {f1_score(y_test, y_pred_ipf):.3f}    | {recall_0_ipf:.3f}    | {recall_1_ipf:.3f}    | {bias_ipf}")
print(f"SVM + SMOTE-IPF + Balanced| {f1_score(y_test, y_pred_balanced):.3f}    | {recall_0_bal:.3f}    | {recall_1_bal:.3f}    | {bias_bal}")

# Analisis improvement bias
print(f"\n=== ANALISIS BIAS REDUCTION ===")
print(f"Baseline: Recall gap = {abs(recall_0_plain - recall_1_plain):.3f} ({bias_plain} bias)")
print(f"SMOTE: Recall gap = {abs(recall_0_smote - recall_1_smote):.3f} ({bias_smote} bias)")
print(f"SMOTE-IPF: Recall gap = {abs(recall_0_ipf - recall_1_ipf):.3f} ({bias_ipf} bias)")
print(f"Balanced: Recall gap = {abs(recall_0_bal - recall_1_bal):.3f} ({bias_bal} bias)")

print(f"\nðŸŽ¯ KESIMPULAN:")
print(f"â€¢ Baseline: F1={f1_score(y_test, y_pred_plain):.3f}, Miss {100*(1-recall_1_plain):.1f}% kasus CHD")
print(f"â€¢ SMOTE: F1={f1_score(y_test, y_pred_smote):.3f}, Miss {100*(1-recall_1_smote):.1f}% kasus CHD")
print(f"â€¢ SMOTE-IPF: F1={f1_score(y_test, y_pred_ipf):.3f}, Miss {100*(1-recall_1_ipf):.1f}% kasus CHD")
print(f"â€¢ Balanced: F1={f1_score(y_test, y_pred_balanced):.3f}, Miss {100*(1-recall_1_bal):.1f}% kasus CHD âœ…")

print(f"\nðŸ“Š IMPROVEMENT:")
print(f"â€¢ SMOTE-IPF vs SMOTE: +{(f1_score(y_test, y_pred_ipf) - f1_score(y_test, y_pred_smote)):.3f} F1-score")
print(f"â€¢ Balanced vs SMOTE-IPF: +{(f1_score(y_test, y_pred_balanced) - f1_score(y_test, y_pred_ipf)):.3f} F1-score")
print(f"â€¢ Total improvement: +{(f1_score(y_test, y_pred_balanced) - f1_score(y_test, y_pred_plain)):.3f} F1-score")

=== EKSPERIMEN MENGURANGI BIAS ===
[SMOTE] Total: 4530, Synthetic: 1562
[SMOTE] Class distribution: {np.int64(0): np.int64(2517), np.int64(1): np.int64(2013)}
[IPF] Iter 1: Removed 3/407, Remaining: 4527, Distribution: {np.int64(0): np.int64(2517), np.int64(1): np.int64(2010)}
[IPF] Iter 2: Only original misclassified -> STOP
[FINAL] Total: 4527, Synthetic remaining: 1559
[FINAL] Class distribution: {np.int64(0): np.int64(2517), np.int64(1): np.int64(2010)}
[IPF] Iter 2: Only original misclassified -> STOP
[FINAL] Total: 4527, Synthetic remaining: 1559
[FINAL] Class distribution: {np.int64(0): np.int64(2517), np.int64(1): np.int64(2010)}

=== HASIL SVM + SMOTE-IPF + CLASS_WEIGHT ===
Classification Report:
               precision    recall  f1-score   support

         0.0      0.909     0.641     0.752      1079
         1.0      0.243     0.642     0.352       193

    accuracy                          0.642      1272
   macro avg      0.576     0.642     0.552      1272
weighted avg