Creating a Python function that takes in actual labels and predicted labels and returns these key metrics (Accuracy, Precision, Recall, F1-score, ROC-AUC, and the Confusion Matrix).

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def evaluate_model(y_true, y_pred, y_pred_proba=None):

# Initialize dictionary to hold all evaluation metrics
    metrics = {}

    # Calculate and store the evaluation metrics
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['Recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['F1-Score'] = f1_score(y_true, y_pred, zero_division=0)

    # Confusion matrix: True Positives, False Positives, True Negatives, False Negatives
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['Confusion Matrix'] = {
        'True Positives': tp,
        'False Positives': fp,
        'True Negatives': tn,
        'False Negatives': fn
    }
    
    # ROC-AUC Score (only if predicted probabilities are provided)
    if y_pred_proba is not None:
        metrics['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba)
    
    return metrics


Example of using it in practice:

In [None]:
# Assuming `y_test` are actual labels and `y_pred` are predicted labels
# If you have predicted probabilities (for ROC-AUC), include `y_pred_proba` as well

# Example
y_pred = rf_model.predict(X_test_scaled)
y_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1]  # Probabilities for the positive class (fraud)

# Call the evaluation function
evaluation_results = evaluate_model(y_test, y_pred, y_pred_proba)

# Print results
for metric, value in evaluation_results.items():
    print(f"{metric}: {value}")

Visualising Confusion Matrix:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Non-Fraud', 'Fraud'],
                yticklabels=['Non-Fraud', 'Fraud'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

# Plot confusion matrix
plot_confusion_matrix(y_test, y_pred)

In [11]:
import import_ipynb 
import main as m


[9.80444220e-02 4.62091097e-05 6.34295397e-02 ... 1.23963710e-01
 4.62467775e-01 4.05377998e-01]
[0.1122873  0.00246492 0.28776439 ... 0.08284157 0.18002027 0.50429234]


In [13]:
data = m.data_processing()
data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,target
0,-0.313846,1.79858,-0.504239,-1.341465,0.278369,-1.478695,0.680652,-0.82565,0.2892,0.184575,0.178209,0.51627,-0.194868,-0.234672,-0.388698,-0.58851,1
1,0.358223,-0.832108,-0.034217,1.080296,-0.858422,1.413999,-0.200257,-0.601431,0.142532,-0.349596,0.097097,0.666427,-0.212606,-0.172135,-0.090313,0.006921,1
2,-1.216144,-0.884816,-0.509105,-0.426303,-0.990615,0.986792,0.26667,0.037183,0.117513,0.196744,0.092343,0.03109,0.831808,0.292165,0.168038,0.039964,0
3,-0.039088,1.365297,-0.331501,-0.789622,0.158097,-0.944652,0.314758,-0.460765,-0.155372,0.381961,0.606867,-0.833744,-0.365964,-0.157098,-0.231825,0.063099,1
4,-1.44544,-1.640353,-0.361803,-0.028223,-1.127603,1.511789,0.089378,-0.179459,0.42718,-0.21368,-0.198707,0.57519,-0.437308,0.221262,0.104004,-0.201012,0


Evaluation of models:

In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def evaluate_models_auc(models, params, logs=False):
    """
    Evaluates a list of models by training each model with the corresponding parameters 
    and calculates ROC-AUC, returning models sorted by their AUC-ROC score in descending order.

    Parameters:
        models (list): A list of model functions (e.g., LogisticRegression, RandomForestClassifier).
        params (list): A list of dictionaries containing the parameters for each model.
        logs (bool): Whether to log the data processing steps (default: False).
        
    Returns:
        list of tuples: A list of (model_name, ROC-AUC score) tuples, sorted by AUC-ROC score in descending order.
    """
    
    # Use the data_processing function to load and preprocess the dataset
    df = data_processing(logs=logs)

    # Separate the PCA components (PC1, PC2, ..., PC16) as features and target as the response variable
    X = df.drop(columns=['target'])  # All columns except 'target' are PCA components
    y = df['target']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    
    # List to store the model names and AUC-ROC scores
    roc_auc_results = []
    
    # Iterate through the models and params lists
    for model_func, param_dict in zip(models, params):
        
        # Initialize the model with the given parameters
        model = model_func(**param_dict)
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Get predicted probabilities (for ROC-AUC, we need probabilities, not class labels)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class
        
        # Calculate the ROC-AUC score
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        
        # Store the model name and its ROC-AUC score
        roc_auc_results.append((model_func.__name__, roc_auc))
    
    # Sort the models by their AUC-ROC score in descending order
    roc_auc_results.sort(key=lambda x: x[1], reverse=True)
    
    return roc_auc_results

# Example usage:

# Import some classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Define the models (as functions) and corresponding parameters
models = [LogisticRegression, RandomForestClassifier, SVC]
params = [
    {'random_state': 42, 'max_iter': 1000},  # Parameters for Logistic Regression
    {'n_estimators': 100, 'random_state': 42},  # Parameters for Random Forest
    {'probability': True, 'random_state': 42}  # Parameters for SVC (needs probability=True for predict_proba)
]

# Call the evaluation function with PCA-transformed data
roc_auc_results = evaluate_models_auc(models, params, logs=True)

# Print the sorted ROC-AUC scores
print("Models sorted by ROC-AUC score:")
for model_name, auc_score in roc_auc_results:
    print(f"{model_name}: {auc_score}")


NameError: name 'data_processing' is not defined