# In [None]:
"""AAI 500 - Final Project
"""
# ===========================
# Random Forest Analysis on Obesity Levels
# ===========================

# ===========================
# Feature Importance Analysis
# ===========================

## Per-feature importance scores read from the previously fitted forest
feature_importances = rf_clf.feature_importances_
## NOTE(review): assumes X has the same columns, in the same order, that
## rf_clf was trained on -- confirm against the training cell.
features = X.columns

## Pair each score with its column name and rank from highest to lowest
sorted_importances = pd.Series(feature_importances, index=features).sort_values(ascending=False)

## Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(x=sorted_importances, y=sorted_importances.index)
plt.title('Feature Importances from Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
## Downloads path
## Resolved from the current user's home directory rather than a hard-coded
## account name, so the script is not tied to a single machine.
downloads_path = os.path.join(os.path.expanduser('~'), 'Downloads')
## Make sure the target folder exists before writing the image into it
os.makedirs(downloads_path, exist_ok=True)
plt.savefig(os.path.join(downloads_path, 'feature_importances_rf.png'))
plt.show()

# ===========================
# Confusion Matrix Visualization
# ===========================
from sklearn.metrics import (
    confusion_matrix as sk_confusion_matrix,
    roc_auc_score,
    roc_curve,
)

## Tabulate test-set outcomes: true labels against the forest's predictions.
## The aliased name keeps this call distinct from any other `confusion_matrix`.
conf_matrix = sk_confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

## Annotated heatmap of the counts, using the label encoder's class names as
## tick labels on both axes.
## NOTE(review): assumes le.classes_ is in the same order as the matrix
## rows/columns -- confirm against how y_test was encoded.
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, **heatmap_options)
plt.title('Confusion Matrix for Random Forest Classifier')
plt.xlabel('Predicted Obesity Level')
plt.ylabel('Actual Obesity Level')

## Write the figure into the Downloads folder, then display it
confusion_matrix_file = os.path.join(downloads_path, 'confusion_matrix.png')
plt.savefig(confusion_matrix_file)
plt.show()

# ==========================
# ROC Curve and AUC Calculation
# ==========================

## Import from necessary library
from sklearn.metrics import auc, roc_auc_score, roc_curve

## Class-probability estimates for the test set, computed once and reused
## below. Column i holds the probability of rf_clf.classes_[i].
y_proba_rf = rf_clf.predict_proba(X_test_scaled)

## Overall one-vs-rest AUC for the multiclass problem. This is a single
## summary across all classes, not the AUC of any individual class.
roc_auc = roc_auc_score(y_test, y_proba_rf, multi_class="ovr")

## Per-class one-vs-rest results, keyed by probability-column index
## False positive rate
false_positive_rates = {}
## True positive rate
true_positive_rates = {}
roc_thresholds = {}
## Area under each individual class curve
roc_auc_per_class = {}
num_classes = len(le.classes_)

## ROC curve for each class: the class belonging to column i is treated as
## the positive label and every other class as negative. Taking the label
## from rf_clf.classes_ keeps the column and the positive label in sync even
## if the labels are not exactly 0..n-1.
for i, class_label in enumerate(rf_clf.classes_):
    false_positive_rates[i], true_positive_rates[i], roc_thresholds[i] = roc_curve(
        y_test, y_proba_rf[:, i], pos_label=class_label)
    roc_auc_per_class[i] = auc(false_positive_rates[i], true_positive_rates[i])

## Plot the ROC curve for class 0, labelled with that class's own AUC
## (not the all-class summary in roc_auc)
plt.figure(figsize=(8, 6))
plt.plot(false_positive_rates[0], true_positive_rates[0], color='green',
         label='Class 0 ROC Curve (AUC = {:.2f})'.format(roc_auc_per_class[0]), linewidth=2)
## Diagonal reference: the curve of a classifier that guesses at random
plt.plot([0, 1], [0, 1], color='blue', linestyle='--', label='Random Guessing Line')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Class 0')
plt.legend()
plt.savefig(os.path.join(downloads_path, 'roc_curve.png'))
plt.show()

# =========================
# Cross Validation Scores
# =========================

## Import from the necessary library
from sklearn.model_selection import train_test_split, cross_val_score

## 5-fold cross-validated accuracy of the Random Forest on the training split.
## NOTE(review): X_train_scaled looks like it was scaled before this cell, so
## the scaler presumably saw every fold -- confirm whether that matters here.
cross_val_scores = cross_val_score(rf_clf, X_train_scaled, y_train, cv=5, scoring='accuracy')

## Print the mean accuracy and its standard deviation across the folds.
## The plus-minus sign is written as an escape so it cannot be garbled by a
## file-encoding mismatch (it previously appeared as a two-character mojibake).
print(f"Random Forest Cross Validation Accuracy: {cross_val_scores.mean():.4f} \u00b1 {cross_val_scores.std():.4f}")