# In [None]:

import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import shap
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Set up the output directories (no error if they already exist)
for output_dir in ('plots', 'reports', 'models'):
    os.makedirs(output_dir, exist_ok=True)

# Path of the text report that the rest of the script writes to
report_file = 'reports/model_evaluation.txt'

def write_report(content):
    """Append ``content`` to the report file as one entry.

    Args:
        content: Any object; its ``str()`` form is written, followed by a
            newline.
    """
    with open(report_file, 'a') as report:
        # print() converts with str() and appends the trailing newline.
        print(content, file=report)

# Simple report header; mode 'w' truncates any report left by a previous run
with open(report_file, 'w') as f:
    f.write("EVALUASI MODEL DIABETES\n" + "=" * 50 + "\n\n")

# Load data (semicolon-separated CSV)
df = pd.read_csv('diabetes.csv', delimiter=';')
# Remove leading/trailing whitespace from the column names so the lookups
# below match.
df.columns = df.columns.str.strip()

# Missing-value check: record the per-column NaN count in the report
write_report("JUMLAH MISSING VALUES PER KOLOM")
write_report(df.isna().sum())

# Data preparation: predictor columns used throughout the script
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Data cleaning: rows without a label cannot be used for supervised training.
# X and y are built from the cleaned frame only, so no intermediate
# assignment from the raw frame is needed.
df_clean = df.dropna(subset=['Outcome'])
X_clean = df_clean[features]
y_clean = df_clean['Outcome']
X, y = X_clean, y_clean

# Exploratory analysis: glucose against age, coloured by outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_clean, x='Age', y='Glucose', hue='Outcome',
                palette='Set1', ax=scatter_ax)
scatter_ax.set_title('Age vs Glucose by Outcome')
scatter_fig.tight_layout()
scatter_fig.savefig('plots/scatter_age_glucose.png', dpi=300)
plt.close(scatter_fig)

# Outcome distribution: pie chart (left) and bar chart (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# sort_index() orders the counts by class label so they line up with the
# hard-coded label/colour lists below.
outcome_counts = y.value_counts().sort_index()

axes[0].pie(outcome_counts, labels=['Non-Diabetes', 'Diabetes'],
            autopct='%1.1f%%', colors=['lightblue', 'salmon'], startangle=90)
axes[0].set_title('Distribusi Outcome (Pie)')

sns.barplot(x=outcome_counts.index, y=outcome_counts.values,
            ax=axes[1], palette=['lightblue', 'salmon'])
axes[1].set_xticklabels(['Non-Diabetes', 'Diabetes'])
axes[1].set_title('Distribusi Outcome (Bar)')
axes[1].set_ylabel('Jumlah')
# Label every bar. Depending on the seaborn version the bars are grouped in
# one container or in one container per bar (seaborn >= 0.13 maps `palette`
# through an implicit hue), so labelling only containers[0] can leave bars
# without a count.
for bar_container in axes[1].containers:
    axes[1].bar_label(bar_container, fmt='%d')

plt.tight_layout()
plt.savefig('plots/pie_outcome.png', dpi=300)
plt.close()

# Histogram (with KDE overlay) of every feature on a 3x3 grid
dist_fig = plt.figure(figsize=(12, 10))
for position, column in enumerate(features, start=1):
    dist_ax = dist_fig.add_subplot(3, 3, position)
    sns.histplot(data=df_clean, x=column, kde=True, bins=15, ax=dist_ax)
    dist_ax.set_title(f'Distribusi {column}')
dist_fig.tight_layout()
dist_fig.savefig('plots/feature_distributions.png', dpi=300)
plt.close(dist_fig)

# Correlation heatmap of the features together with the target column
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
corr_matrix = df_clean[[*features, 'Outcome']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Matriks Korelasi')
heatmap_fig.tight_layout()
heatmap_fig.savefig('plots/correlation_matrix.png', dpi=300)
plt.close(heatmap_fig)

# Data processing (SMOTE)
write_report("\nPEMROSESAN DATA (SMOTE)")
write_report("Menerapkan SMOTE untuk menangani imbalance class...")

# Hold out the test set BEFORE oversampling. SMOTE synthesises new minority
# samples by interpolating between existing rows; if it runs on the full
# dataset, information from rows that later land in the test set leaks into
# training and the test set itself contains synthetic rows, which inflates
# the reported accuracy. The test split therefore stays untouched, real data.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

try:
    smote = SMOTE(random_state=42)
    # Oversample the training portion only.
    X_res, y_res = smote.fit_resample(X_train_raw, y_train_raw)
    write_report("Setelah SMOTE:")
    write_report(pd.Series(y_res).value_counts())
except Exception as e:
    # Record the failure in the report, then abort: training cannot proceed.
    write_report(f"Error dalam SMOTE: {str(e)}")
    raise

X_train, y_train = X_res, y_res

# Model building
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyper-parameter grid; the 'classifier__' prefix routes each parameter to
# the pipeline step of that name.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

# NOTE(review): the cross-validation folds are still drawn from the
# oversampled training set, so the CV scores used for model selection remain
# optimistic. Moving SMOTE into an imblearn.pipeline.Pipeline step would
# resample inside each fold; the held-out test metrics are unaffected.
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

model = grid.best_estimator_

# Model evaluation on the held-out split
write_report("\nEVALUASI MODEL")
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
write_report(f"Akurasi: {acc:.4f}")

# Each remaining section is a heading followed by the metric's output.
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("Classification Report:", classification_report),
):
    write_report(heading)
    write_report(metric(y_test, y_pred))

# Feature importance taken from the fitted random forest
importances = model.named_steps['classifier'].feature_importances_
feature_importance = pd.Series(importances, index=features).sort_values(ascending=False)

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
# Ascending order for the plot so the largest bar ends up at the top.
feature_importance.sort_values().plot.barh(color='teal', ax=importance_ax)
importance_ax.set_title('Feature Importance')
importance_fig.tight_layout()
importance_fig.savefig('plots/feature_importance.png', dpi=300)
plt.close(importance_fig)

# SHAP analysis (best effort: a failure is printed and the script continues)
try:
    explainer = shap.TreeExplainer(model.named_steps['classifier'])
    shap_values = explainer.shap_values(X_test)

    # Pick the SHAP values for class index 1 (presumably the diabetes class;
    # verify against the classifier's classes_). Older SHAP releases return
    # a list with one (n_samples, n_features) array per class, newer ones a
    # single array with the class on the last axis. Indexing the latter with
    # [1] would select the second *sample* instead of the second class.
    if isinstance(shap_values, list):
        positive_class_shap = shap_values[1]
    elif getattr(shap_values, 'ndim', None) == 3:
        positive_class_shap = shap_values[:, :, 1]
    else:
        # Already a single 2-D array; use it as is.
        positive_class_shap = shap_values

    plt.figure()
    shap.summary_plot(positive_class_shap, X_test, feature_names=features, show=False)
    plt.tight_layout()
    plt.savefig('plots/shap_summary.png', bbox_inches='tight', dpi=300)
except Exception as e:
    print(f"Error dalam membuat SHAP plot: {str(e)}")
finally:
    # Close the figure on failure too, so it does not stay open; this is a
    # no-op when no figure exists.
    plt.close()

# Persist the best pipeline found by the grid search
model_path = 'models/diabetes_model.pkl'
joblib.dump(model, model_path)

# Final notice: close the report and tell the user where to find it
for closing_line in ("\nSELESAI", f"Akurasi akhir model: {acc:.4f}"):
    write_report(closing_line)
print(f"\n✅ Proses selesai! Laporan tersedia di: {report_file}")
