In [20]:
!pip install -U imbalanced-learn



In [21]:
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

root_path = 'gdrive/My Drive/Colab Notebooks/Deep Learning Labs/Framingham/'

print("Path root:", root_path)

Mounted at /content/gdrive
Path root: gdrive/My Drive/Colab Notebooks/Deep Learning Labs/Framingham/


In [22]:
# -------------------------
# Imports rekomendasi untuk eksperimen SMOTE / SMOTE-IPF / CV / tuning
# -------------------------
import os
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Reproducibility: gunakan nilai tunggal untuk random_state sepanjang notebook
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# sklearn: preprocessing, model, evaluasi, CV, dan util lainnya
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve, auc

# imbalanced-learn: pipeline + samplers (SMOTE + variants)
from imblearn.pipeline import Pipeline as ImbPipeline   # alias supaya tidak bingung dengan sklearn.pipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

# plotting (opsional tapi berguna untuk presentasi)
import matplotlib.pyplot as plt

# untuk implementasi custom SMOTE-IPF, pastikan class/func SMOTE_IPF sudah didefinisikan/imported
# from your_module import SMOTE_IPF   # contoh jika kamu punya file implementasi SMOTE_IPF


In [23]:
# -------------------------
# Load dataset
# -------------------------
df = pd.read_csv(os.path.join(root_path, "framingham.csv"))

# Info dasar dataset
print("Ukuran dataset:", df.shape)
print("\nJumlah missing value per kolom:\n", df.isnull().sum())

# Cek tipe data
print("\nInfo dataset:")
print(df.info())

# (Opsional) lihat distribusi target
if "TenYearCHD" in df.columns:
    print("\nDistribusi label target (TenYearCHD):")
    print(df["TenYearCHD"].value_counts(normalize=True))


Ukuran dataset: (4240, 16)

Jumlah missing value per kolom:
 male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

Info dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     

In [24]:
# -------------------------
# Tangani missing values
# -------------------------
from sklearn.impute import SimpleImputer

# Pisahkan kolom numerik dan kategorikal
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

# Imputer untuk numerik (pakai median biar robust ke outlier)
imputer_num = SimpleImputer(strategy="median")
df[num_cols] = imputer_num.fit_transform(df[num_cols])

# Imputer untuk kategorikal (kalau ada, pakai modus)
if len(cat_cols) > 0:
    imputer_cat = SimpleImputer(strategy="most_frequent")
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

print("Jumlah missing value setelah imputasi:\n", df.isnull().sum().sum())


Jumlah missing value setelah imputasi:
 0


In [25]:
from sklearn.model_selection import train_test_split

X = df.drop("TenYearCHD", axis=1)
y = df["TenYearCHD"]

# Stratify untuk menjaga proporsi kelas tetap sama di train & test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42
)

print("Distribusi kelas di train:", np.bincount(y_train))
print("Distribusi kelas di test :", np.bincount(y_test))


Distribusi kelas di train: [2517  451]
Distribusi kelas di test : [1079  193]


In [26]:
from sklearn.preprocessing import StandardScaler

# Normalisasi fitur agar semua punya skala yang sebanding
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)  # scaler "belajar" dari train
X_test_scaled = scaler.transform(X_test)        # hanya transform, tidak fit ulang


In [27]:
print("Jumlah sampel sebelum SMOTE-IPF:", X_train.shape)
print("Distribusi kelas sebelum SMOTE-IPF:", np.bincount(y_train))

Jumlah sampel sebelum SMOTE-IPF: (2968, 15)
Distribusi kelas sebelum SMOTE-IPF: [2517  451]


In [28]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Baseline SVM (tanpa SMOTE/IPF)
svm = SVC(random_state=42)  # random_state biar konsisten
svm.fit(X_train_scaled, y_train)
y_pred = svm.predict(X_test_scaled)

print("=== Baseline SVM (Tanpa SMOTE-IPF) ===")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


=== Baseline SVM (Tanpa SMOTE-IPF) ===
Classification Report:
               precision    recall  f1-score   support

         0.0       0.85      1.00      0.92      1079
         1.0       0.62      0.03      0.05       193

    accuracy                           0.85      1272
   macro avg       0.74      0.51      0.48      1272
weighted avg       0.82      0.85      0.79      1272

F1-Score: 0.04975124378109453
Confusion Matrix:
 [[1076    3]
 [ 188    5]]


In [29]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.pipeline import Pipeline

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipeline_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

scores = cross_val_score(pipeline_svm, X, y, cv=cv, scoring="f1")
print("Rata-rata F1-Score CV (Tanpa SMOTE-IPF):", scores.mean())


Rata-rata F1-Score CV (Tanpa SMOTE-IPF): 0.03892389763269924


In [30]:
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# --- Oversampling dengan SMOTE ---
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Jumlah setelah SMOTE:", X_resampled.shape)
print("Distribusi kelas setelah SMOTE:", np.bincount(y_resampled))

# --- Training SVM dengan data hasil SMOTE ---
svm_smote = SVC(kernel='linear', C=10, gamma=0.1, class_weight='balanced', random_state=42)
svm_smote.fit(X_resampled, y_resampled)
y_pred_smote = svm_smote.predict(X_test_scaled)

print("\n=== SVM + SMOTE (Tuned) ===")
print("Classification Report:\n", classification_report(y_test, y_pred_smote))
print("F1-Score:", f1_score(y_test, y_pred_smote))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))

# --- Cross-validation dengan pipeline ---
pipeline_smote_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("svm", SVC(random_state=42))
])

scores_smote = cross_val_score(pipeline_smote_svm, X, y, cv=cv, scoring="f1")
print("\nRata-rata F1-Score CV (SVM + SMOTE):", scores_smote.mean())


Jumlah setelah SMOTE: (5034, 15)
Distribusi kelas setelah SMOTE: [2517 2517]

=== SVM + SMOTE (Tuned) ===
Classification Report:
               precision    recall  f1-score   support

         0.0       0.90      0.64      0.75      1079
         1.0       0.24      0.62      0.34       193

    accuracy                           0.64      1272
   macro avg       0.57      0.63      0.55      1272
weighted avg       0.80      0.64      0.69      1272

F1-Score: 0.3438395415472779
Confusion Matrix:
 [[694 385]
 [ 73 120]]

Rata-rata F1-Score CV (SVM + SMOTE): 0.32868597964721846


In [31]:
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

class SMOTE_IPF:
    def __init__(self, smote_k=15, ipf_k=10, max_iter=5, random_state=42, verbose=False):
        self.smote_k = smote_k
        self.ipf_k = ipf_k
        self.max_iter = max_iter
        self.verbose = verbose
        self.smote = SMOTE(k_neighbors=self.smote_k, random_state=random_state)

    def fit_resample(self, X, y):
        # Step 1: SMOTE
        X_resampled, y_resampled = self.smote.fit_resample(X, y)

        # Gabungkan X dan y
        y_array = np.array(y_resampled).reshape(-1, 1)
        data = np.hstack((X_resampled.copy(), y_array))

        # Step 2: Iterative Partitioning Filter (IPF)
        for i in range(self.max_iter):
            clf = KNeighborsClassifier(n_neighbors=self.ipf_k)
            X_curr = data[:, :-1]
            y_curr = data[:, -1]

            clf.fit(X_curr, y_curr)
            y_pred = clf.predict(X_curr)

            # Temukan sample yang salah klasifikasi
            misclassified = y_pred != y_curr

            if self.verbose:
                print(f"Iterasi {i+1}: {misclassified.sum()} data dibuang")

            # Jika tidak ada yang misclassified â†’ stop
            if not np.any(misclassified):
                break

            # Hapus sample yang salah klasifikasi
            data = data[~misclassified]

        return data[:, :-1], data[:, -1].astype(int)


In [32]:
# --- Oversampling dengan SMOTE-IPF ---
smote_ipf = SMOTE_IPF(smote_k=5, ipf_k=3, max_iter=5, random_state=42, verbose=True)
X_resampled_ipf, y_resampled_ipf = smote_ipf.fit_resample(X_train_scaled, y_train)

print("Jumlah setelah SMOTE-IPF:", X_resampled_ipf.shape)
print("Distribusi kelas setelah SMOTE-IPF:", np.bincount(y_resampled_ipf))


Iterasi 1: 451 data dibuang
Iterasi 2: 93 data dibuang
Iterasi 3: 12 data dibuang
Iterasi 4: 1 data dibuang
Iterasi 5: 0 data dibuang
Jumlah setelah SMOTE-IPF: (4477, 15)
Distribusi kelas setelah SMOTE-IPF: [1966 2511]


In [33]:
print("Jumlah sesudah SMOTE-IPF:", X_resampled_ipf.shape, np.bincount(y_resampled_ipf))


Jumlah sesudah SMOTE-IPF: (4477, 15) [1966 2511]


In [34]:
# Asumsikan X_resampled_ipf, y_resampled_ipf sudah dibuat
# Jika belum, ganti X_resampled_ipf/y_resampled_ipf dengan X_resampled/y_resampled (tapi usahakan beri nama ipf)

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# 1) Train SVM pada data hasil SMOTE-IPF
svm_resampled = SVC(kernel='linear', C=10, gamma=0.1, class_weight='balanced', random_state=42)   # tambahkan random_state
svm_resampled.fit(X_resampled_ipf, y_resampled_ipf)

# 2) Prediksi di test set (gunakan X_test_scaled yang sudah kamu buat)
y_pred_resampled = svm_resampled.predict(X_test_scaled)

# 3) Evaluasi lengkap
print("=== SVM + SMOTE-IPF (Test set) ===")
print("Classification Report:\n", classification_report(y_test, y_pred_resampled))
print("F1-Score (pos=1):", f1_score(y_test, y_pred_resampled, pos_label=1))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_resampled))


=== SVM + SMOTE-IPF (Test set) ===
Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.63      0.75      1079
         1.0       0.24      0.64      0.35       193

    accuracy                           0.64      1272
   macro avg       0.57      0.64      0.55      1272
weighted avg       0.81      0.64      0.69      1272

F1-Score (pos=1): 0.34831460674157305
Confusion Matrix:
 [[684 395]
 [ 69 124]]


In [None]:
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Stratified CV dengan shuffle biar merata
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Pipeline: scaler -> SMOTE-IPF -> SVM
pipeline_resampled = make_pipeline_imb(
    StandardScaler(),
    SMOTE_IPF(smote_k=5, ipf_k=3, max_iter=5),  # param bisa kamu tune
    SVC(random_state=42)
)

# Cross-validation
scores_resampled = cross_val_score(pipeline_resampled, X, y, cv=cv, scoring="f1", n_jobs=-1)

print("Rata-rata F1-Score CV (SMOTE-IPF + SVM):", scores_resampled.mean())
print("Std dev F1-Score CV:", scores_resampled.std())
