# In [3]: (Jupyter notebook cell prompt kept from the export)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# Function to load and explore data
def load_and_explore_data(filepath):
    """Load a CSV dataset and print/plot a quick exploratory overview.

    Prints the first rows, the ``DataFrame.info()`` report and the summary
    statistics, then shows a histogram grid and a correlation heatmap.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``, unmodified.
    """
    data = pd.read_csv(filepath)
    print("First few rows of the dataset:")
    print(data.head())
    print("\nDataset Information:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    data.info()
    print("\nSummary statistics of the dataset:")
    print(data.describe())

    # Plotting distributions
    data.hist(figsize=(12, 10), bins=20)
    plt.suptitle("Feature Distributions")
    plt.show()

    # Plotting correlation heatmap
    plt.figure(figsize=(10, 8))
    # Restrict to numeric columns: with pandas >= 2.0, corr() raises on
    # non-numeric columns instead of silently dropping them.
    numeric_corr = data.select_dtypes(include="number").corr()
    sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()

    return data

# Function to split data
def split_data(data, target_column, *, test_size=0.2, random_state=42, stratify=False):
    """Split a DataFrame into train/test features and targets.

    Args:
        data: DataFrame containing both the features and the target column.
        target_column: Name of the column to predict; every other column is
            used as a feature.
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.2.
        random_state: Seed forwarded to ``train_test_split``; defaults to the
            previously hard-coded 42.
        stratify: When true, the split is stratified on the target column.
            Defaults to False, which reproduces the original behavior.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        # train_test_split expects the label array (or None), not a flag.
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

# Function to preprocess data
def preprocess_data(X_train, X_test):
    """Standardize both feature sets using statistics from the training set.

    The scaler is fitted on ``X_train`` only and then applied to both sets.

    Args:
        X_train: Training features.
        X_test: Test features.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled

# Function to perform grid search for SVM
def grid_search_svm(X_train, y_train):
    """Tune an SVM classifier with a 5-fold grid search scored by ROC-AUC.

    Args:
        X_train: Training features (``main`` passes the standardized ones).
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.

    Note:
        The grid is split per kernel, so when a linear kernel wins,
        ``best_params_`` contains no ``'gamma'`` entry.
    """
    # Seed the estimator so the probability estimates enabled by
    # probability=True are reproducible, matching the seeded Random Forest.
    svm = SVC(probability=True, random_state=42)
    # gamma has no effect on the linear kernel; crossing it with 'linear'
    # only fits every linear candidate twice. Use one sub-grid per kernel.
    c_values = [0.1, 1, 10]
    param_grid = [
        {'kernel': ['linear'], 'C': c_values},
        {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 'auto']},
    ]
    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='roc_auc', verbose=1)
    grid_search.fit(X_train, y_train)
    return grid_search

# Function to perform grid search for Random Forest
def grid_search_rf(X_train, y_train):
    """Tune a Random Forest classifier with a 5-fold grid search scored by ROC-AUC.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
    )
    forest = RandomForestClassifier(random_state=42)
    search = GridSearchCV(
        forest,
        search_space,
        cv=5,
        scoring='roc_auc',
        verbose=1,
    )
    # fit() returns the search object itself.
    return search.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on the test set, print and plot the results.

    Prints accuracy, precision, recall, F1 and ROC-AUC, then shows the
    confusion matrix and the ROC curve.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Test features, preprocessed the same way as the training data.
        y_test: True test labels.

    Returns:
        A tuple ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    predictions = model.predict(X_test)
    # Compute the probabilities once and reuse them for both the ROC-AUC
    # score and the ROC plot instead of calling predict_proba twice.
    # Assumes a binary problem whose positive class is column 1 - TODO confirm.
    positive_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, positive_proba)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")

    # Plotting confusion matrix
    plot_confusion_matrix(y_test, predictions)

    # Plotting ROC Curve
    plot_roc_curve(y_test, positive_proba, roc_auc)

    return accuracy, precision, recall, f1, roc_auc

# Function to plot confusion matrix
def plot_confusion_matrix(y_test, predictions):
    """Show the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_test: True labels.
        predictions: Predicted labels.
    """
    matrix = confusion_matrix(y_test, predictions)

    plt.figure(figsize=(6, 4))
    # fmt='d' renders the cell counts as integers.
    sns.heatmap(data=matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Class')
    plt.ylabel('Actual Class')
    plt.show()

# Function to plot ROC Curve
def plot_roc_curve(y_test, y_proba, roc_auc):
    """Show the ROC curve for the given scores, with a diagonal reference line.

    Args:
        y_test: True labels.
        y_proba: Scores passed to ``roc_curve`` (``evaluate_model`` supplies
            column 1 of ``predict_proba``).
        roc_auc: Area under the curve, shown in the legend with two decimals.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_proba)

    plt.figure(figsize=(8, 6))
    curve_label = 'ROC curve (area = %0.2f)' % roc_auc
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=curve_label,
    )
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # The y-axis limit extends slightly above 1.0.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.title('Receiver Operating Characteristic')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

# Main function to run the whole process
def main():
    """Run the full workflow: load, split, scale, tune, evaluate and compare.

    Reads ``diabetes.csv`` and predicts the ``Outcome`` column. The SVM is
    trained and evaluated on the standardized features; the Random Forest
    uses the unscaled ones. Ends by printing a side-by-side metric table.
    """
    dataset = load_and_explore_data("diabetes.csv")
    X_train, X_test, y_train, y_test = split_data(dataset, "Outcome")
    X_train_scaled, X_test_scaled = preprocess_data(X_train, X_test)

    # --- SVM (standardized features) ---
    print("\nTraining SVM model...")
    svm_search = grid_search_svm(X_train_scaled, y_train)
    print("\nBest SVM Parameters:", svm_search.best_params_)
    print("\nEvaluating SVM model...")
    svm_scores = evaluate_model(svm_search.best_estimator_, X_test_scaled, y_test)

    # --- Random Forest (raw features) ---
    print("\nTraining Random Forest model...")
    rf_search = grid_search_rf(X_train, y_train)
    print("\nBest Random Forest Parameters:", rf_search.best_params_)
    print("\nEvaluating Random Forest model...")
    rf_scores = evaluate_model(rf_search.best_estimator_, X_test, y_test)

    # --- Side-by-side comparison table ---
    print("\nComparison of SVM and Random Forest Models:")
    print(f"{'Metric':<10} {'SVM':<10} {'Random Forest':<15}")
    # Same order as the tuple returned by evaluate_model.
    metric_names = ("Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC")
    for name, svm_score, rf_score in zip(metric_names, svm_scores, rf_scores):
        print(f"{name:<10} {svm_score:<10.4f} {rf_score:<15.4f}")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()