In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [32]:
# Read the raw dataset from 'data.csv' (resolved relative to the current
# working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='data.csv')

In [34]:
# Split the frame positionally: every column except the last forms the
# feature matrix X, and the last column alone forms the target y.
# NOTE: this assumes the target is stored in the final column -- adjust the
# slices if the dataset is laid out differently.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [36]:
# Treat every feature stored with the 'object' dtype as categorical and keep
# only the column labels; they are what the preprocessing step needs.
object_typed_features = X.select_dtypes(include=['object'])
categorical_columns = object_typed_features.columns

In [38]:
# Build the preprocessing step shared by every model:
#   * 'cat' one-hot encodes the columns listed in `categorical_columns`;
#     handle_unknown='ignore' lets categories that were not seen during fit
#     pass through transform without raising.
#   * 'num' standardises the numeric columns with StandardScaler.
#
# Numeric columns are selected with the generic 'number' dtype instead of the
# hard-coded ['int64', 'float64'] pair. ColumnTransformer's default
# remainder='drop' discards any column that no transformer claims, so the
# narrower selector would silently remove numeric features stored with another
# width (e.g. int32, float32, uint64) from the model inputs.
numeric_columns = X.select_dtypes(include='number').columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numeric_columns),
    ])

In [40]:
# Candidate classifiers, keyed by the display name used when reporting results.
#
# Every estimator with a stochastic component is given a fixed random_state so
# that a fresh "Restart & Run All" reproduces the same metrics:
#   * SVC uses randomness for its probability estimates (probability=True),
#   * Random Forest and Gradient Boosting randomise tree construction,
#   * the MLP randomises weight initialisation and batch shuffling.
# KNeighborsClassifier is deterministic and takes no random_state.
RANDOM_STATE = 42  # same seed value the train/test split uses

models = {
    'SVM': SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'k-NN': KNeighborsClassifier(n_neighbors=5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=RANDOM_STATE)
}

In [42]:
# Evaluate each model using cross-validation and additional hold-out metrics.
# `results` maps model name -> dict of metric name -> value (None when a
# metric could not be computed for that model).
results = {}

# Split once, before the loop: the split is seeded and does not depend on the
# model, so every model is trained and tested on exactly the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for model_name, model in models.items():
    # Create a pipeline that first transforms the data then fits the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # 5-fold cross-validation on the training portion only, so the hold-out
    # test rows never influence these scores
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

    # Fit on the full training portion and predict the hold-out rows
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Calculate label-based evaluation metrics (class-frequency weighted)
    test_accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # ROC-AUC needs probability scores. Taking column 1 is only valid for a
    # binary target; for more than two classes the full probability matrix is
    # scored one-vs-rest, and with fewer than two classes the metric is
    # undefined and left as None.
    y_proba = None
    roc_auc = None
    if hasattr(model, "predict_proba"):
        class_proba = pipeline.predict_proba(X_test)
        n_classes = class_proba.shape[1]
        if n_classes == 2:
            y_proba = class_proba[:, 1]
            roc_auc = roc_auc_score(y_test, y_proba)
        elif n_classes > 2:
            y_proba = class_proba
            roc_auc = roc_auc_score(
                y_test,
                y_proba,
                multi_class='ovr',
                average='weighted',
                labels=pipeline.classes_,  # column order of predict_proba
            )

    # Store the results
    results[model_name] = {
        'Cross-Validation Mean Accuracy': np.mean(cv_scores),
        'Test Set Accuracy': test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc
    }

In [43]:
# Start the search for the best model with every model name as a candidate
best_models = list(models)

In [46]:
# Metric names used to rank the models, listed from the first criterion
# applied to the last tie-breaker
criteria = [
    'Test Set Accuracy',
    'Cross-Validation Mean Accuracy',
    'Precision',
    'Recall',
    'F1-Score',
    'ROC-AUC',
]

In [48]:
# Iterate through the criteria to narrow down the best model(s).
# For each criterion only the candidates that reach the maximum value survive;
# the loop stops as soon as a single candidate is left.
for criterion in criteria:
    # Values of this criterion for the remaining candidates, skipping models
    # for which the metric was not computed (stored as None)
    available_scores = {
        name: results[name][criterion]
        for name in best_models
        if results[name][criterion] is not None
    }

    # Nothing to compare (no candidates, or the metric is None for all of
    # them): calling max() on an empty sequence would raise ValueError, so
    # leave the candidate list unchanged and move on to the next criterion.
    if not available_scores:
        continue

    max_value = max(available_scores.values())
    # Candidates without a value for this criterion are dropped here, because
    # .get() yields None for them and None never equals max_value.
    best_models = [name for name in best_models if available_scores.get(name) == max_value]
    if len(best_models) == 1:
        break

In [50]:
# Report every stored metric for every evaluated model, one paragraph per
# model, with values rounded to two decimals
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric, value in metrics.items():
        if value is None:
            # Metric was not computed for this model
            print(f"  {metric}: N/A")
        else:
            print(f"  {metric}: {value:.2f}")
    print()

SVM:
  Cross-Validation Mean Accuracy: 0.98
  Test Set Accuracy: 1.00
  Precision: 1.00
  Recall: 1.00
  F1-Score: 1.00
  ROC-AUC: 1.00

Random Forest:
  Cross-Validation Mean Accuracy: 1.00
  Test Set Accuracy: 1.00
  Precision: 1.00
  Recall: 1.00
  F1-Score: 1.00
  ROC-AUC: 1.00

Gradient Boosting:
  Cross-Validation Mean Accuracy: 1.00
  Test Set Accuracy: 1.00
  Precision: 1.00
  Recall: 1.00
  F1-Score: 1.00
  ROC-AUC: 1.00

k-NN:
  Cross-Validation Mean Accuracy: 0.77
  Test Set Accuracy: 0.75
  Precision: 0.84
  Recall: 0.75
  F1-Score: 0.75
  ROC-AUC: 0.90

Neural Network:
  Cross-Validation Mean Accuracy: 0.96
  Test Set Accuracy: 0.95
  Precision: 0.96
  Recall: 0.95
  F1-Score: 0.95
  ROC-AUC: 1.00



In [52]:
# Announce the winner(s): more than one surviving name means the criteria
# could not separate the candidates, so they are reported as a tie
print("Best Models (tie):" if len(best_models) > 1 else "Best Model:")

# Repeat the full metric summary for each winning model
for model in best_models:
    print(f"  {model}")
    print(f"  Test Set Accuracy: {results[model]['Test Set Accuracy']:.2f}")
    print(f"  Cross-Validation Mean Accuracy: {results[model]['Cross-Validation Mean Accuracy']:.2f}")
    print(f"  Precision: {results[model]['Precision']:.2f}")
    print(f"  Recall: {results[model]['Recall']:.2f}")
    print(f"  F1-Score: {results[model]['F1-Score']:.2f}")
    # ROC-AUC is the only metric that may have been stored as None
    if results[model]['ROC-AUC'] is None:
        print("  ROC-AUC: N/A")
    else:
        print(f"  ROC-AUC: {results[model]['ROC-AUC']:.2f}")
    print()

Best Models (tie):
  Random Forest
  Test Set Accuracy: 1.00
  Cross-Validation Mean Accuracy: 1.00
  Precision: 1.00
  Recall: 1.00
  F1-Score: 1.00
  ROC-AUC: 1.00

  Gradient Boosting
  Test Set Accuracy: 1.00
  Cross-Validation Mean Accuracy: 1.00
  Precision: 1.00
  Recall: 1.00
  F1-Score: 1.00
  ROC-AUC: 1.00

