In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN 

# ---------------------------------------------------------
# 3.1 & 3.3.1 Pengumpulan dan Explorasi Data Awal
# ---------------------------------------------------------
# Pastikan file 'framingham.csv' ada di direktori yang sama
df = pd.read_csv('framingham.csv')

print("Info Dataset:")
print(df.info())
print("\nDistribusi Kelas Target (TenYearCHD):")
print(df['TenYearCHD'].value_counts())

# ---------------------------------------------------------
# 3.3.4 Pre-Processing Data
# ---------------------------------------------------------

# 1. Penanganan Missing Values (Imputasi Mean)
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# 2. Normalisasi (Min-Max Scaling)
# Sesuai teks: dilakukan sebelum split
X = df_imputed.drop('TenYearCHD', axis=1)
y = df_imputed['TenYearCHD']

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# ---------------------------------------------------------
# 3.3.5 Split Data
# ---------------------------------------------------------
# Proporsi 80:20, Stratified
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nJumlah Data Latih: {X_train.shape[0]}")
print(f"Jumlah Data Uji: {X_test.shape[0]}")

# Fungsi untuk melatih dan evaluasi model
def evaluate_model(X_tr, y_tr, X_te, y_te, scenario_name):
    print(f"\n{'='*20} {scenario_name} {'='*20}")
    
    # Inisialisasi SVM
    svm_model = SVC(kernel='rbf', random_state=42)
    
    # Pelatihan
    svm_model.fit(X_tr, y_tr)
    
    # Prediksi
    y_pred = svm_model.predict(X_te)
    
    # Evaluasi
    print("Confusion Matrix:")
    print(confusion_matrix(y_te, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_te, y_pred))
    print(f"Accuracy: {accuracy_score(y_te, y_pred):.4f}")

# ---------------------------------------------------------
# 3.3.7 Evaluasi Model (3 Skenario)
# ---------------------------------------------------------

# Skenario 1: Model Baseline (Tanpa Oversampling)
evaluate_model(X_train, y_train, X_test, y_test, "Skenario 1: Baseline (Tanpa Oversampling)")

# Skenario 2: Model SMOTE
# Diterapkan hanya pada Data Latih (Post-split)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"\n[Info] Data Latih setelah SMOTE: {X_train_smote.shape}")
evaluate_model(X_train_smote, y_train_smote, X_test, y_test, "Skenario 2: SMOTE")

# Skenario 3: Model SMOTE-IPF (Diwakili oleh SMOTE + Cleaning)
# Menggunakan SMOTEENN untuk simulasi SMOTE + Filtering outlier
smote_ipf = SMOTEENN(random_state=42)
X_train_ipf, y_train_ipf = smote_ipf.fit_resample(X_train, y_train)

print(f"\n[Info] Data Latih setelah SMOTE-Filtering: {X_train_ipf.shape}")
evaluate_model(X_train_ipf, y_train_ipf, X_test, y_test, "Skenario 3: SMOTE + Filtering (IPF Approach)")

Info Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB

In [2]:

# ---------------------------------------------------------
# FILTERING FITUR (Hanya 4 Fitur Utama + Target)
# ---------------------------------------------------------
# Kita memilih fitur yang umumnya memiliki korelasi tertinggi
selected_features = ['age', 'sysBP', 'cigsPerDay', 'totChol', 'TenYearCHD']
df_selected = df[selected_features]

print(f"Dimensi Data Baru: {df_selected.shape}")
print("Fitur yang digunakan:", list(df_selected.columns[:-1]))

# ---------------------------------------------------------
# Pre-Processing
# ---------------------------------------------------------
# 1. Imputasi Mean
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df_selected), columns=selected_features)

# 2. Normalisasi MinMax
X = df_imputed.drop('TenYearCHD', axis=1)
y = df_imputed['TenYearCHD']

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 3. Split Data (80:20 Stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------------------------------------
# Fungsi Evaluasi
# ---------------------------------------------------------
def run_experiment(X_tr, y_tr, X_te, y_te, scenario):
    print(f"\n>>> {scenario}")
    # Menggunakan class_weight='balanced' untuk membantu menaikkan sensitivitas ke kelas minoritas
    model = SVC(kernel='rbf', C=1.0, random_state=42, class_weight='balanced') 
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    
    print(classification_report(y_te, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_te, y_pred))

# ---------------------------------------------------------
# 1. Tanpa Oversampling (4 Fitur)
# ---------------------------------------------------------
run_experiment(X_train, y_train, X_test, y_test, "Skenario 1: 4 Fitur - Tanpa SMOTE")

# ---------------------------------------------------------
# 2. SMOTE (4 Fitur)
# ---------------------------------------------------------
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

run_experiment(X_train_smote, y_train_smote, X_test, y_test, "Skenario 2: 4 Fitur - SMOTE")

# ---------------------------------------------------------
# 3. SMOTE-IPF / SMOTEENN (4 Fitur)
# ---------------------------------------------------------
# SMOTEENN digunakan sebagai implementasi teknis untuk SMOTE + Cleaning
smote_ipf = SMOTEENN(random_state=42)
X_train_ipf, y_train_ipf = smote_ipf.fit_resample(X_train, y_train)

run_experiment(X_train_ipf, y_train_ipf, X_test, y_test, "Skenario 3: 4 Fitur - SMOTE-IPF")

Dimensi Data Baru: (4240, 5)
Fitur yang digunakan: ['age', 'sysBP', 'cigsPerDay', 'totChol']

>>> Skenario 1: 4 Fitur - Tanpa SMOTE
              precision    recall  f1-score   support

         0.0       0.90      0.63      0.74       719
         1.0       0.23      0.62      0.34       129

    accuracy                           0.63       848
   macro avg       0.57      0.63      0.54       848
weighted avg       0.80      0.63      0.68       848

Confusion Matrix:
 [[453 266]
 [ 49  80]]

>>> Skenario 2: 4 Fitur - SMOTE
              precision    recall  f1-score   support

         0.0       0.90      0.62      0.74       719
         1.0       0.23      0.63      0.34       129

    accuracy                           0.62       848
   macro avg       0.57      0.62      0.54       848
weighted avg       0.80      0.62      0.67       848

Confusion Matrix:
 [[446 273]
 [ 48  81]]

>>> Skenario 3: 4 Fitur - SMOTE-IPF
              precision    recall  f1-score   support

     