# Import Library

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load Data

In [54]:
df = pd.read_csv("stroke-final-fixed3.csv")
print("Preview Data:")
print(df.head())

Preview Data:
   sex   age  hypertension  heart_disease  ever_married  work_type  \
0  1.0  43.0             0              0             0          4   
1  1.0  47.0             0              0             1          4   
2  0.0  35.0             0              0             1          4   
3  0.0  62.0             1              0             1          2   
4  0.0  88.0             0              0             1          3   

   Residence_type  avg_glucose_level   bmi  smoking_status  stroke  
0               0              69.38  28.4               0       0  
1               0              96.59  26.4               0       1  
2               1             104.51  27.3               1       1  
3               0             116.55  31.1               1       1  
4               0              90.00  32.0               0       1  


# Split Fitur and Target

In [55]:
X = df.drop(columns=['stroke'])
y = df['stroke']

print("\nJumlah fitur:", X.shape[1])
print("Distribusi kelas:\n", y.value_counts())


Jumlah fitur: 10
Distribusi kelas:
 stroke
0    5000
1    5000
Name: count, dtype: int64


# Split Train and Test

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nUkuran data:")
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


Ukuran data:
Train: (8000, 10), Test: (2000, 10)


# Latih Model

In [57]:
model = SVC(kernel='rbf', probability=True)
model.fit(X_train, y_train)

# Prediksi dan evaluasi

In [58]:
y_pred = model.predict(X_test)

print("\nAkurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Akurasi: 0.6405

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.79      0.69      1000
           1       0.70      0.49      0.58      1000

    accuracy                           0.64      2000
   macro avg       0.65      0.64      0.63      2000
weighted avg       0.65      0.64      0.63      2000


Confusion Matrix:
[[789 211]
 [508 492]]


In [59]:
def evaluate(model, name=""):
    y_pred = model.predict(X_test)
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Tuning hyperparameter dengan GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.01, 0.001],
    'kernel': ['rbf']
}

print("\nMelakukan pencarian hyperparameter terbaik dengan GridSearchCV...")
grid = GridSearchCV(SVC(random_state=42), param_grid, cv=5, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

# Tampilkan hasil terbaik
print("\n--- Hasil Grid Search ---")
print("Best Parameters:", grid.best_params_)

# Evaluasi model terbaik
best_model = grid.best_estimator_
evaluate(best_model, name="Tuned SVM (rbf)")


Melakukan pencarian hyperparameter terbaik dengan GridSearchCV...
Fitting 5 folds for each of 9 candidates, totalling 45 fits

--- Hasil Grid Search ---
Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}

Tuned SVM (rbf) Accuracy: 0.7660
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      1000
           1       0.76      0.77      0.77      1000

    accuracy                           0.77      2000
   macro avg       0.77      0.77      0.77      2000
weighted avg       0.77      0.77      0.77      2000

Confusion Matrix:
[[758 242]
 [226 774]]


# Testing

In [62]:
import numpy as np
import pandas as pd

# Set seed agar hasil reproducible
np.random.seed(42)

# Buat 300 data berisiko tinggi stroke
high_risk = pd.DataFrame({
    'sex': np.random.randint(0, 2, 300),
    'age': np.random.uniform(60, 80, 300),  # usia tua
    'hypertension': np.ones(300),          # semua hipertensi
    'heart_disease': np.ones(300),         # semua punya penyakit jantung
    'ever_married': np.ones(300),          # sudah menikah (sering diasosiasikan dengan usia tua)
    'work_type': np.random.randint(0, 5, 300),
    'Residence_type': np.random.randint(0, 2, 300),
    'avg_glucose_level': np.random.uniform(180, 300, 300),  # glukosa tinggi
    'bmi': np.random.uniform(30, 40, 300),  # obesitas
    'smoking_status': np.random.randint(1, 3, 300),  # merokok (1 atau 2)
})

# Buat 700 data berisiko rendah stroke
low_risk = pd.DataFrame({
    'sex': np.random.randint(0, 2, 700),
    'age': np.random.uniform(20, 45, 700),
    'hypertension': np.zeros(700),
    'heart_disease': np.zeros(700),
    'ever_married': np.zeros(700),
    'work_type': np.random.randint(0, 5, 700),
    'Residence_type': np.random.randint(0, 2, 700),
    'avg_glucose_level': np.random.uniform(70, 100, 700),
    'bmi': np.random.uniform(18.5, 24.9, 700),
    'smoking_status': np.zeros(700),  # tidak merokok
})

# Gabungkan
X_test_custom = pd.concat([high_risk, low_risk], ignore_index=True)

# Lakukan prediksi
predictions = best_model.predict(X_test_custom)

# Hitung hasil distribusi prediksi
unique, counts = np.unique(predictions, return_counts=True)
prediction_distribution = dict(zip(unique, counts))

print("Distribusi Hasil Prediksi:")
print(prediction_distribution)


Distribusi Hasil Prediksi:
{0: 704, 1: 296}
