In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# === 1. Load Dataset ===
# Read the feature table, then split it into the target column ("Class")
# and the six feature columns used as model input.
df = pd.read_csv("../Data/combine-feature.csv")
y = df["Class"]
X = df.loc[:, ["LF", "HF", "LF/HF Ratio", "SDNN", "RMSSD", "pNN50"]]

# === 2. Scaling ===
# The scaler is only declared here. During cross-validation it must be fitted
# on the training part of each fold only; fitting it on the full dataset first
# would leak the held-out samples' mean/variance into training and make the
# cross-validated scores optimistic.
scaler = StandardScaler()

# === 3. Model ===
# NOTE: per the scikit-learn docs, n_components only affects transform(),
# so it has no influence on the predictions/scores computed below.
lda = LinearDiscriminantAnalysis(
    solver='lsqr',
    shrinkage=0.9,
    priors=[0.3, 0.7],
    n_components=1
)

# Scaling + LDA bundled as one estimator. The cross-validation helpers clone
# it for every fold, so the scaler is re-fitted on each training split and
# `scaler` / `lda` themselves stay unfitted until section 6.
cv_model = make_pipeline(scaler, lda)

# === 4. Cross-validation setup ===
# A fixed random_state makes both calls below iterate over the same 5 folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === 5. Cross-validated predictions & evaluation ===
# Raw (unscaled) X is passed in; scaling happens inside the pipeline per fold.
y_pred = cross_val_predict(cv_model, X, y, cv=skf)
cv_results = cross_validate(
    cv_model, X, y,
    cv=skf,
    scoring=["accuracy", "precision", "recall", "f1"],
    return_train_score=True
)

# === 6. Feature Coefficients ===
# Final model for interpretation: fitted on the complete dataset (no held-out
# data involved here, so scaling on all rows is fine). Coefficients refer to
# the standardised features.
X_scaled = scaler.fit_transform(X)
lda.fit(X_scaled, y)
coef = lda.coef_[0]
feat_imp = pd.Series(coef, index=X.columns).sort_values()

# === 7. Confusion Matrix ===
# Rows are the true classes, columns the cross-validated predictions.
cm = confusion_matrix(y_true=y, y_pred=y_pred)

# === 8. Train vs Cross-Val Performance ===
# Collapse the per-fold score arrays in cv_results into one mean per metric
# (train and test) plus the across-fold standard deviation of the test score.
metrics = ["accuracy", "precision", "recall", "f1"]
train_means = [cv_results[f"train_{name}"].mean() for name in metrics]
test_means = [cv_results[f"test_{name}"].mean() for name in metrics]
test_stds = [cv_results[f"test_{name}"].std() for name in metrics]

# === 9. Plotting ===
# One figure with three panels: (A) LDA coefficients, (B) confusion matrix,
# (C) mean train vs. cross-validated test scores.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle("LDA Performance Overview - Breast Cancer HRV", fontsize=15, weight='bold')

# --- (A) Feature Coefficients ---
# NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette` without
# `hue`; the suggested replacement (hue=..., legend=False) requires
# seaborn >= 0.13, so confirm the installed version before switching.
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=axes[0], palette="coolwarm")
axes[0].set_title("Feature Coefficients (LDA)")
axes[0].set_xlabel("Coefficient Value")

# --- (B) Confusion Matrix ---
# Label the ticks with the actual class values instead of positional indices.
# The sorted union of true and predicted labels matches the row/column order
# confusion_matrix uses when no explicit `labels` are given.
class_labels = list(np.unique(np.concatenate([np.asarray(y), np.asarray(y_pred)])))
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=axes[1],
    xticklabels=class_labels, yticklabels=class_labels,
)
axes[1].set_title("Confusion Matrix (5-Fold CV)")
axes[1].set_xlabel("Predicted")
axes[1].set_ylabel("Actual")

# --- (C) Train vs Cross-Validation Performance ---
x = np.arange(len(metrics))
bar_width = 0.35
half_width = bar_width / 2  # horizontal distance from a tick to each bar's centre
axes[2].bar(x - half_width, train_means, bar_width, label="Train", color="skyblue")
axes[2].bar(x + half_width, test_means, bar_width, yerr=test_stds, label="Test (CV)", color="salmon", capsize=4)
axes[2].set_xticks(x)
axes[2].set_xticklabels(["Accuracy", "Precision", "Recall", "F1"])
axes[2].set_title("Train vs Cross-Validation Performance")
axes[2].set_ylabel("Score")
axes[2].legend()

# Add the numeric value above each bar. Labels are centred on their bar
# (offset derived from bar_width) and the test label is lifted above the
# error-bar cap so the text does not overlap it.
label_pad = 0.02
label_tops = []
for i, (train_val, test_val, test_std) in enumerate(zip(train_means, test_means, test_stds)):
    train_y = train_val + label_pad
    test_y = test_val + test_std + label_pad
    axes[2].text(i - half_width, train_y, f"{train_val:.3f}", ha='center', fontweight='bold')
    axes[2].text(i + half_width, test_y, f"{test_val:.3f}", ha='center', fontweight='bold')
    label_tops.extend([train_y, test_y])

# Keep the usual 0-1.05 range, but grow it when a label would otherwise be
# pushed past the top of the axes.
axes[2].set_ylim(0, max(1.05, max(label_tops, default=0) + 0.06))

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
