In [12]:
# 1. Import library
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer


# Single seed reused by the train/test split, the SMOTE resamplers and the SVC models below.
RANDOM_STATE = 42
# None lifts pandas' column-count limit so wide frames are displayed without truncation.
pd.set_option("display.max_columns", None)

In [13]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("framingham.csv")

# The label must never be imputed: a median-filled label would be a fabricated
# outcome. Rows without a label are dropped instead.
df = df.dropna(subset=['TenYearCHD']).reset_index(drop=True)

# Median-impute missing values in the numeric *feature* columns only.
# NOTE(review): the imputer is fit on the full dataset, before the train/test
# split done in a later cell, so test rows contribute to the medians. Consider
# fitting it on the training split only.
imputer = SimpleImputer(strategy="median")
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_feature_cols = [col for col in numeric_cols if col != 'TenYearCHD']
df[numeric_feature_cols] = imputer.fit_transform(df[numeric_feature_cols])

# Report how imbalanced the two outcome classes are.
class_dist = df['TenYearCHD'].value_counts()
print("Class distribution:")
print(f"0 (No CHD): {class_dist[0]} ({class_dist[0]/len(df)*100:.1f}%)")
print(f"1 (CHD)   : {class_dist[1]} ({class_dist[1]/len(df)*100:.1f}%)")
print(f"Imbalance ratio: {class_dist[0]/class_dist[1]:.1f}:1")

Class distribution:
0 (No CHD): 3596 (84.8%)
1 (CHD)   : 644 (15.2%)
Imbalance ratio: 5.6:1


In [14]:
# ================== SELECT THE TOP 8 FEATURES ==================
feature_columns = [col for col in df.columns if col != 'TenYearCHD']

# Rank every numeric column by the absolute value of its correlation with the
# label (DataFrame.corr with its default method). Restricting to numeric dtypes
# keeps this cell working if the frame ever carries a non-numeric column, in
# line with the imputation cell, which also selects numeric columns only.
# NOTE(review): the ranking is computed on all rows, i.e. before the train/test
# split, so the test set influences which features are selected.
correlations = (
    df.select_dtypes(include=[np.number])
      .corr()['TenYearCHD']
      .drop('TenYearCHD')
      .abs()
      .sort_values(ascending=False)
)

# Take the top-8 slice once and reuse it for both the list and the report.
top_correlations = correlations.head(8)
selected_features = list(top_correlations.index)

print("\nSelected features (Top 8 korelasi):")
for i, (feat, corr) in enumerate(top_correlations.items(), 1):
    print(f"{i}. {feat}: {corr:.3f}")

# Model inputs: the selected feature columns and the integer-coded label.
X = df[selected_features]
y = df['TenYearCHD'].astype(int)


Selected features (Top 8 korelasi):
1. age: 0.225
2. sysBP: 0.216
3. prevalentHyp: 0.177
4. diaBP: 0.145
5. glucose: 0.121
6. diabetes: 0.097
7. male: 0.088
8. BPMeds: 0.086


In [15]:
# ================== SPLIT + SCALING ==================
# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

print("\nTrain shape:", X_train_scaled.shape)
print("Test shape :", X_test_scaled.shape)


Train shape: (3392, 8)
Test shape : (848, 8)


In [16]:
# ================== SMOTE-IPF ==================
class SMOTE_IPF(BaseEstimator):
    """SMOTE oversampling followed by an iterative k-NN filter on synthetic rows.

    ``fit_resample`` oversamples with ``SMOTE`` and then, for at most
    ``max_iter`` rounds, fits a ``KNeighborsClassifier`` on the current data,
    predicts that same data, and discards the *synthetic* rows whose predicted
    label differs from their own label. Rows of the original input are never
    discarded. Filtering stops early once a round discards nothing.

    Parameters
    ----------
    sampling_strategy : forwarded to ``SMOTE(sampling_strategy=...)``.
    smote_k : forwarded to ``SMOTE(k_neighbors=...)``.
    ipf_k : neighbour count for the filtering classifier, capped at
        ``n_rows - 1`` of the data being filtered.
    max_iter : upper bound on the number of filtering rounds.
    random_state : forwarded to ``SMOTE(random_state=...)``.
    verbose : when true, print sample counts after SMOTE, per round, and at
        the end.
    """

    def __init__(self, sampling_strategy=1.0, smote_k=5, ipf_k=3,
                 max_iter=10, random_state=None, verbose=False):
        self.sampling_strategy = sampling_strategy
        self.smote_k = smote_k
        self.ipf_k = ipf_k
        self.max_iter = max_iter
        self.random_state = random_state
        self.verbose = verbose

    def _noisy_synthetic_rows(self, X_cur, y_cur, synthetic):
        """Return a boolean mask of synthetic rows the k-NN vote disagrees with."""
        # NOTE(review): the classifier predicts the very rows it was fit on, so
        # each row presumably counts among its own neighbours -- confirm that
        # this resubstitution vote is the intended filter.
        n_neighbors = min(self.ipf_k, len(X_cur) - 1)
        voter = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_cur, y_cur)
        disagrees = voter.predict(X_cur) != y_cur
        return disagrees & synthetic

    def fit_resample(self, X, y):
        """Oversample ``(X, y)`` with SMOTE, then filter noisy synthetic rows.

        ``X`` is converted to a float64 array and ``y`` is flattened; ``y``
        must hold whole-number values and is cast to ``int``.

        Returns the filtered feature matrix and the matching label vector.

        Raises
        ------
        ValueError
            If ``y`` contains a value with a fractional part.
        """
        features = np.asarray(X, dtype=np.float64)
        labels = np.asarray(y).ravel()

        # Reject labels such as 0.5 before the cast to int can truncate them.
        if np.any(np.mod(labels, 1) != 0):
            raise ValueError(
                f"y harus label integer (misal 0 dan 1), "
                f"ditemukan nilai: {np.unique(labels)}"
            )
        labels = labels.astype(int)

        oversampler = SMOTE(
            sampling_strategy=self.sampling_strategy,
            k_neighbors=self.smote_k,
            random_state=self.random_state,
        )
        X_res, y_res = oversampler.fit_resample(features, labels)

        if self.verbose:
            dist_after_smote = np.bincount(y_res)
            print(f"After SMOTE: {X_res.shape[0]} samples, dist: {dist_after_smote}")

        # Rows at index >= the original row count are treated as synthetic.
        # Assumes SMOTE returns the original rows first and appends the
        # generated ones after them -- TODO confirm against imblearn.
        synthetic = np.arange(len(X_res)) >= features.shape[0]

        X_kept = X_res.copy()
        y_kept = y_res.copy()

        for round_no in range(1, self.max_iter + 1):
            drop = self._noisy_synthetic_rows(X_kept, y_kept, synthetic)
            n_dropped = int(drop.sum())

            if n_dropped == 0:
                if self.verbose:
                    print(f"IPF converged at iter {round_no}")
                break

            survivors = np.logical_not(drop)
            X_kept = X_kept[survivors]
            y_kept = y_kept[survivors]
            synthetic = synthetic[survivors]

            if self.verbose:
                print(f"IPF iter {round_no}: removed {n_dropped} samples")

        if self.verbose:
            dist_final = np.bincount(y_kept)
            print(f"Final: {X_kept.shape[0]} samples, dist: {dist_final}")

        return X_kept, y_kept

print("\nSMOTE-IPF class ready")



SMOTE-IPF class ready


In [20]:
# ================== EVALUATION ==================
# Metrics of every evaluated model, keyed by model name (reset when this cell is re-run).
results = {}

def evaluate_model(name, y_true, y_pred):
    """Print and record binary-classification metrics for one model.

    Labels are expected to be 0 (negative) and 1 (positive).

    Parameters
    ----------
    name : key under which the metrics are stored in ``results`` and the title
        printed above the report.
    y_true : ground-truth labels.
    y_pred : predicted labels, aligned with ``y_true``.

    Side effects
    ------------
    Stores ``{"f1", "recall_0", "recall_1", "bias_gap"}`` in ``results[name]``
    and prints the metrics, the confusion matrix and the classification report.
    Returns ``None``.
    """
    # Pin the label order so the matrix is always 2x2, even when one class is
    # absent from both y_true and y_pred; otherwise the 4-way unpack would fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Per-class recall. A class with no true samples gets 0.0 (the same
    # convention as zero_division=0 above) instead of a NaN from 0/0.
    n_true_0 = tn + fp
    n_true_1 = tp + fn
    recall_0 = tn / n_true_0 if n_true_0 else 0.0
    recall_1 = tp / n_true_1 if n_true_1 else 0.0
    # Absolute recall difference between the classes; 0 means both classes are
    # recalled equally well.
    bias_gap = abs(recall_0 - recall_1)

    results[name] = {
        "f1": f1,
        "recall_0": recall_0,
        "recall_1": recall_1,
        "bias_gap": bias_gap
    }

    print(f"\n=========== {name} ===========")
    print(f"F1-score      : {f1:.3f}")
    print(f"Recall kelas 0: {recall_0:.3f}")
    print(f"Recall kelas 1: {recall_1:.3f}")
    print(f"Bias gap      : {bias_gap:.3f}")

    print("\nConfusion Matrix (lengkap):")
    print(cm)

    print(f"\nDetail:")
    print(f"TN = {tn}, FP = {fp}, FN = {fn}, TP = {tp}")

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))


In [21]:
# ================== 1) BASELINE SVM ==================
# Linear SVM trained on the scaled training split without any resampling.
# NOTE(review): no class_weight is set here, whereas the two resampled models
# below use class_weight='balanced'; differences against this baseline
# therefore mix the effect of resampling with the effect of class weighting.
svm_baseline = SVC(kernel='linear', C=1.0, random_state=RANDOM_STATE).fit(
    X_train_scaled, y_train
)
y_pred_baseline = svm_baseline.predict(X_test_scaled)

evaluate_model("Baseline SVM", y_test, y_pred_baseline)


F1-score      : 0.000
Recall kelas 0: 1.000
Recall kelas 1: 0.000
Bias gap      : 1.000

Confusion Matrix (lengkap):
[[719   0]
 [129   0]]

Detail:
TN = 719, FP = 0, FN = 129, TP = 0

Classification Report:
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       719
           1       0.00      0.00      0.00       129

    accuracy                           0.85       848
   macro avg       0.42      0.50      0.46       848
weighted avg       0.72      0.85      0.78       848



In [22]:
# ================== 2) SVM + SMOTE ==================
# Oversample the training split only; the test split is left untouched.
# NOTE(review): sampling_strategy is 0.5 here but 1.0 in the SMOTE-IPF cell, so
# the two resampled models differ in more than the filtering step -- confirm
# this is intended before attributing the score difference to the filter.
smote = SMOTE(sampling_strategy=0.5, random_state=RANDOM_STATE)
X_smote, y_smote = smote.fit_resample(X_train_scaled, y_train)

# Same linear SVM settings as the SMOTE-IPF model, fitted on the resampled data.
svm_smote = SVC(
    kernel='linear', C=1.0, class_weight='balanced', random_state=RANDOM_STATE
).fit(X_smote, y_smote)
y_pred_smote = svm_smote.predict(X_test_scaled)

evaluate_model("SVM + SMOTE", y_test, y_pred_smote)


F1-score      : 0.343
Recall kelas 0: 0.636
Recall kelas 1: 0.628
Bias gap      : 0.008

Confusion Matrix (lengkap):
[[457 262]
 [ 48  81]]

Detail:
TN = 457, FP = 262, FN = 48, TP = 81

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.64      0.75       719
           1       0.24      0.63      0.34       129

    accuracy                           0.63       848
   macro avg       0.57      0.63      0.54       848
weighted avg       0.80      0.63      0.69       848



In [23]:
# ================== 3) SVM + SMOTE-IPF ==================
# Oversample the training split with SMOTE and filter the synthetic rows;
# verbose=True prints the sample counts after SMOTE and after each round.
smote_ipf = SMOTE_IPF(
    sampling_strategy=1.0, smote_k=5, ipf_k=3, max_iter=10,
    random_state=RANDOM_STATE, verbose=True,
)
X_smote_ipf, y_smote_ipf = smote_ipf.fit_resample(X_train_scaled, y_train)

# Linear SVM with balanced class weights, fitted on the filtered resampled data.
svm_smote_ipf = SVC(
    kernel='linear', C=1.0, class_weight='balanced', random_state=RANDOM_STATE
).fit(X_smote_ipf, y_smote_ipf)
y_pred_smote_ipf = svm_smote_ipf.predict(X_test_scaled)

evaluate_model("SVM + SMOTE-IPF (Proposed)", y_test, y_pred_smote_ipf)

After SMOTE: 5754 samples, dist: [2877 2877]
IPF iter 1: removed 56 samples
IPF iter 2: removed 3 samples
IPF converged at iter 3
Final: 5695 samples, dist: [2877 2818]

F1-score      : 0.351
Recall kelas 0: 0.644
Recall kelas 1: 0.636
Bias gap      : 0.008

Confusion Matrix (lengkap):
[[463 256]
 [ 47  82]]

Detail:
TN = 463, FP = 256, FN = 47, TP = 82

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.64      0.75       719
           1       0.24      0.64      0.35       129

    accuracy                           0.64       848
   macro avg       0.58      0.64      0.55       848
weighted avg       0.81      0.64      0.69       848

