02_modeling.ipynb (Pemodelan dan Evaluasi)
Fokus: Melatih algoritma dan mengukur performa.

In [5]:
# --- SEL 1: Load Cleaned Data & Split ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib
import os

# Make sure the models folder exists before anything is written into it
os.makedirs('../models', exist_ok=True)

# Load the cleaned dataset, then separate the 'Outcome' target column
# from the remaining feature columns
df_clean = pd.read_csv('../data/processed/diabetes_cleaned.csv')
y = df_clean['Outcome']
X = df_clean.drop(columns='Outcome')

# --- CELL 2: Split, Preprocessing & Save Preprocessing Pipeline ---
# Split BEFORE scaling. Fitting the scaler on the full dataset lets the test
# rows contribute to the mean/variance used to transform the training rows
# (data leakage), which can make the hold-out evaluation optimistic.
# Same test_size / random_state / stratify as before, so the row assignment
# to train and test is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix transformed with the train-fitted scaler; kept so any
# code that still refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

# Save the preprocessing pipeline (fitted scaler plus feature metadata) so
# the exact same transformation can be re-applied to new data later.
preprocessing_pipeline = {
    'scaler': scaler,
    'feature_names': X.columns.tolist(),
    'n_features': X.shape[1]
}
joblib.dump(preprocessing_pipeline, '../models/preprocessing.pkl')

# --- CELL 3: Model Training ---
# Random forest hyperparameters, collected in one place
rf_params = {
    'n_estimators': 100,
    'random_state': 42,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'n_jobs': -1,
}
# fit() returns the estimator itself, so construction and training are chained
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# --- CELL 4: Evaluation ---
# Predict on the test split and report accuracy, confusion matrix and the
# per-class classification report
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("=== Model Evaluation ===")
print(f"Accuracy: {test_accuracy:.4f}")
# sep="\n" puts each heading and its metric on consecutive lines, exactly as
# two separate print() calls would
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# --- CELL 5: Save Best Model ---
# Bundle the trained model with its metadata and persist it in one file
test_set_accuracy = accuracy_score(y_test, y_pred)
feature_columns = X.columns.tolist()

best_model = {
    'model': model,
    'accuracy': test_set_accuracy,
    'features': feature_columns,
    'n_features': len(feature_columns),
    'random_state': 42,
    'model_type': 'RandomForestClassifier',
}

joblib.dump(best_model, '../models/best_model.pkl')

# --- CELL 6: Verification ---
# Summarize what was written to disk
print("\n=== Files Saved Successfully ===")
print("1. preprocessing.pkl saved in ../models/")
print("   - Contains: scaler, feature_names, n_features")
print("2. best_model.pkl saved in ../models/")
print("   - Contains: trained model, accuracy, features metadata")

# Check that both files were actually created on disk.
# `os` is already imported at the top of the file, so it is not re-imported here.
print("\nFile exists in models directory:")
for saved_path in ('../models/preprocessing.pkl', '../models/best_model.pkl'):
    print(f"- {os.path.basename(saved_path)}: {os.path.exists(saved_path)}")

=== Model Evaluation ===
Accuracy: 0.7403

Confusion Matrix:
[[85 15]
 [25 29]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       100
           1       0.66      0.54      0.59        54

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154


=== Files Saved Successfully ===
1. preprocessing.pkl saved in ../models/
   - Contains: scaler, feature_names, n_features
2. best_model.pkl saved in ../models/
   - Contains: trained model, accuracy, features metadata

File exists in models directory:
- preprocessing.pkl: True
- best_model.pkl: True
