## Project Overview: Multi-Algorithm ML Pipeline

This notebook benchmarks several machine learning algorithms within a unified pipeline structure. It evaluates model performance under consistent preprocessing, validation, and metric reporting workflows.

Key aspects:
- Hyperparameter tuning across Logistic Regression, Decision Trees, SVMs, Gradient Boosting and Neural Network models.
- Tracks key metrics: accuracy, precision, recall, F1, AUC.
- Outputs SHAP values for feature importance exploration.
- Built for extensibility and reproducibility.

## SageMaker Pipeline

In [None]:
!pip install xgboost

In [None]:
pip install --upgrade scikit-learn

In [None]:
pip install catboost

In [None]:
pip install shap

In [None]:
# Import packages
import os
import boto3
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import argparse
import xgboost as xgb
import logging
import pipeline
import importlib
importlib.reload(pipeline)
import joblib
import tarfile
import gc # garbage collection (reclaim memory marked for deletion)
import math
import datetime
import sklearn
import fnmatch
#import lightgbm as lgb - can't use because of problem with dependency and Pandas. Investigate later.
import catboost as cb
import model_pipeline
importlib.reload(model_pipeline)
import logging
import shap
from sagemaker import get_execution_role
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.transformer import Transformer
from sagemaker.model import Model
from sagemaker import image_uris
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, confusion_matrix, roc_curve, auc, classification_report, roc_auc_score, precision_score as sk_precision_score, 
    recall_score, f1_score, log_loss, precision_recall_curve, average_precision_score
)
from sklearn.utils import resample
from sklearn.feature_selection import mutual_info_classif
from sklearn.base import TransformerMixin, BaseEstimator
from io import BytesIO
from scipy import sparse 
from scipy.sparse import save_npz
from scipy.stats import randint, uniform
from typing import List, Tuple, Optional
from itertools import product
from lightgbm import LGBMClassifier

In [None]:
# Initialise SageMaker
sagemaker_session = None  # Removed for public version
role = get_execution_role()

# Global variables
# NOTE(review): several of these are not referenced by the cells shown in this
# notebook; they may be consumed by the imported project modules — confirm.
client_name = "Generic_Client"  # Used as the S3 key prefix when uploading the CSV files
training_task_name = "Generic_Client_Cloud"
target_column_name = "MODEL_TARGET"  # Name of the target column
partition_column_name = "TVH_PARTITION"  # Name of the train/validation/holdout partition column
dataset_filename = "Generic_Client_Latest_DC.csv" # Update with the actual path
database_schema = ""
database_feature_table = ""
use_rodbc = False 
problem_type = "binary_classification"
bEnsemble = False
ENQ_KEY_VAR = "IDENTIFIER"  # Name of the row identifier column
rand_runs = 1
high_card_cut_off = 100  # Cut-off for high-cardinality columns — presumably max distinct values; confirm
ds_ratio = 3  # Passed to the undersampling helper and embedded in the output file names
positive_target = 1
negative_target = 0

# SageMaker-specific variables
s3_client = boto3.client('s3')
bucket = 'Generic_Bucket' # Replace with your S3 bucket name
prefix = 'Generic_Client' # Replace with the desired S3 prefix

# Setting up paths
train_data_s3_path = f's3://{bucket}/{prefix}/train_Generic_Client.csv'
validation_data_s3_path = f's3://{bucket}/{prefix}/validation_Generic_Client.csv'
test_data_s3_path = f's3://{bucket}/{prefix}/test_Generic_Client.csv'
model_output_s3_path = f's3://{bucket}/{prefix}/model_output'

In [None]:
pd.set_option('display.max_columns', None)

### Read from CSV

In [None]:
# Generate the S3 URL for the file
s3_uri = f's3://{bucket}/{prefix}/{dataset_filename}'

# Read data from S3 into a pandas DataFrame
df = pd.read_csv(s3_uri)

In [None]:
df.head()

### Exploratory Data Analysis (Optional)

In [None]:
df.info()

In [None]:
# Descriptive statistics
df.describe(include='all')

In [None]:
# Return tuple (number of rows, number of columns)
df.shape

In [None]:
# Perform a full EDA on the raw DataFrame loaded from S3
pipeline.perform_eda(df, save_plots=True)

# Or, use individual functions for specific analyses.
# These operate on the same DataFrame (`df`) as the full EDA above.
print(pipeline.summarize_data(df))
pipeline.plot_histograms(df, save_plots=True)

### Preprocessing

In [None]:
# Remove all columns not to be used for modelling, except for IDENTIFIER and TVH_PARTITION, which will be removed below.
df2 = df.drop(["column_1", "column_2"], axis=1)

In [None]:
df2.head()

In [None]:
df2.info()

In [None]:
# Remove columns with only one unique value (nulls count as a value).
# The counts are taken from df2 itself, so columns that were already removed
# from df2 are neither considered nor reported.
unique_counts = df2.nunique(dropna=False)
print("Columns with only one unique value:\n", unique_counts.index[unique_counts == 1].tolist())
df2 = df2.loc[:, unique_counts > 1]

In [None]:
# Upper-case every column name of the DataFrame
df2.columns = [name.upper() for name in df2.columns]

# Both MODEL_TARGET and TVH_PARTITION must be present in df2.
# TVH_PARTITION = train (value of 1), validation (value of 2) and holdout (value of 3) partitions.
required_columns = ['MODEL_TARGET', 'TVH_PARTITION']

# Collect the required columns that are absent and raise if there are any
missing_cols = [col for col in required_columns if col not in df2.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

In [None]:
# MODEL_TARGET may only hold 0 or 1; anything else is an error.
target_is_binary = df2['MODEL_TARGET'].isin([0, 1]).all()
if not target_is_binary:
    raise ValueError("MODEL_TARGET contains invalid values (not 0 or 1).")

In [None]:
# Convert 'MODEL_TARGET' and 'TVH_PARTITION' columns to integer
df2['MODEL_TARGET'] = df2['MODEL_TARGET'].astype(int)
df2['TVH_PARTITION'] = df2['TVH_PARTITION'].astype(int)

In [None]:
# Drop high-cardinality columns using the configured cut-off
# (high_card_cut_off is set with the other global variables) instead of a
# literal, so the threshold is changed in exactly one place.
df_high_card, high_card_cols = pipeline.drop_high_cardinality_columns(df2, max_unique_values=high_card_cut_off)
print("High cardinality columns: \n", high_card_cols)

In [None]:
df_high_card.info()

In [None]:
# Split data into train (1), validation (2) and test (3) sets via TVH_PARTITION
partition_values = df_high_card['TVH_PARTITION']
train_df = df_high_card[partition_values == 1]
validation_df = df_high_card[partition_values == 2]
test_df = df_high_card[partition_values == 3] # ADD value of 4 in future update!

# Report the size of each split
for set_name, subset in (("Training", train_df), ("Validation", validation_df), ("Test", test_df)):
    print(f"{set_name} set shape (rows, columns):", subset.shape)

In [None]:
df_high_card[df_high_card['TVH_PARTITION'] == 1].head()

In [None]:
train_undersampled = pipeline.undersample_classification_task(train_df, ds_ratio)
print("Shape of undersampled training set (rows, columns): ", train_undersampled.shape)

# Number of columns in undersampled training set must be the same as in the original training set
assert train_undersampled.shape[1] == train_df.shape[1], f"Number of columns must be the same: {train_undersampled.shape[1]}, {train_df.shape[1]}"

In [None]:
# Define local file paths
train_path = f'/home/sagemaker-user/Product/train_Generic_Client_DS_{ds_ratio}.csv'
validation_path = f'/home/sagemaker-user/Product/validation_Generic_Client_DS_{ds_ratio}.csv'
test_path = f'/home/sagemaker-user/Product/test_Generic_Client_DS_{ds_ratio}.csv'

train_identifiers_path = f'/home/sagemaker-user/Product/train_identifiers_Generic_Client_DS_{ds_ratio}.csv'
validation_identifiers_path = f'/home/sagemaker-user/Product/validation_identifiers_Generic_Client_DS_{ds_ratio}.csv'
test_identifiers_path = f'/home/sagemaker-user/Product/test_identifiers_Generic_Client_DS_{ds_ratio}.csv'

In [None]:
# Columns that identify a row or its partition; they are not model features.
non_feature_columns = ['IDENTIFIER', 'TVH_PARTITION']
# Columns kept in the side files so predictions can be joined back later.
identifier_columns = ['IDENTIFIER', 'TVH_PARTITION', 'MODEL_TARGET']

# Shuffle the undersampled training rows, then reset the index and drop the
# original one (otherwise it would be written as a separate column).
# A fixed random_state makes the written training file reproducible between runs.
train_undersampled_shuffled = train_undersampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Save feature data for modelling (excluding IDENTIFIER and TVH_PARTITION)
train_undersampled_shuffled.drop(non_feature_columns, axis=1).to_csv(train_path, index=False)
validation_df.drop(non_feature_columns, axis=1).to_csv(validation_path, index=False)
test_df.drop(non_feature_columns, axis=1).to_csv(test_path, index=False)

# Keep only the identifiers, partition and target variable for later use.
# Row order matches the feature files written above.
train_identifiers = train_undersampled_shuffled[identifier_columns]
validation_identifiers = validation_df[identifier_columns]
test_identifiers = test_df[identifier_columns]

# Write the identifier files next to the feature files
train_identifiers.to_csv(train_identifiers_path, index=False)
validation_identifiers.to_csv(validation_identifiers_path, index=False)
test_identifiers.to_csv(test_identifiers_path, index=False)

#### OPTIONAL (run if you want to load train, val and test without first running all previous steps, except for installs, imports and instantiation of global variables.)

##### Run if you want to load downsampled data

In [None]:
# Define local file paths
train_path = f'/home/sagemaker-user/Product/train_Generic_Client_DS_{ds_ratio}.csv'
validation_path = f'/home/sagemaker-user/Product/validation_Generic_Client_DS_{ds_ratio}.csv'
test_path = f'/home/sagemaker-user/Product/test_Generic_Client_DS_{ds_ratio}.csv'

##### Run if you want to load original data

In [None]:
# Define local file paths
train_path = f'/home/sagemaker-user/Product/train_Generic_Client.csv'
validation_path = f'/home/sagemaker-user/Product/validation_Generic_Client.csv'
test_path = f'/home/sagemaker-user/Product/test_Generic_Client.csv'

In [None]:
# Load the datasets
train_undersampled = pd.read_csv(train_path)
validation_df = pd.read_csv(validation_path)
test_df = pd.read_csv(test_path)

In [None]:
# Define file names
train_file = f'{client_name}/train_Generic_Client_DS_{ds_ratio}.csv'
validation_file = f'{client_name}/validation_Generic_Client_DS_{ds_ratio}.csv'
test_file = f'{client_name}/test_Generic_Client_DS_{ds_ratio}.csv'

In [None]:
# Upload files
pipeline.upload_to_s3(train_path, bucket, train_file)
pipeline.upload_to_s3(validation_path, bucket, validation_file)
pipeline.upload_to_s3(test_path, bucket, test_file)

In [None]:
# Delete df to save memory
del df

# Manually call garbage collector
gc.collect()

#### Preprocessing

In [None]:
# Define local file paths
train_path = f'/home/sagemaker-user/Product/train_Generic_Client_DS_{ds_ratio}.csv'
validation_path = f'/home/sagemaker-user/Product/validation_Generic_Client_DS_{ds_ratio}.csv'
test_path = f'/home/sagemaker-user/Product/test_Generic_Client_DS_{ds_ratio}.csv'

train_identifiers_path = f'/home/sagemaker-user/Product/train_identifiers_Generic_Client_DS_{ds_ratio}.csv'
validation_identifiers_path = f'/home/sagemaker-user/Product/validation_identifiers_Generic_Client_DS_{ds_ratio}.csv'
test_identifiers_path = f'/home/sagemaker-user/Product/test_identifiers_Generic_Client_DS_{ds_ratio}.csv'

In [None]:
print("Loading training, validation and test data...")
train_data = pd.read_csv(train_path)
validation_data = pd.read_csv(validation_path)
test_data = pd.read_csv(test_path)
print("Data loaded.")

In [None]:
print("Defining column types...")
numeric_columns = train_data.drop('MODEL_TARGET', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = train_data.drop('MODEL_TARGET', axis=1).select_dtypes(include=['object', 'category']).columns.tolist()
print("Column types defined.")
print(f'Number of numeric columns: {len(numeric_columns)} \n Numeric columns: {numeric_columns})')
print(f'Number of categorical columns: {len(categorical_columns)} \n categorical columns: {categorical_columns})')

In [None]:
# Debugging step 2: Ensure 'MODEL_TARGET' is not dropped
if 'MODEL_TARGET' not in train_data.columns:
    raise ValueError("'MODEL_TARGET' column not found in training data")
if 'MODEL_TARGET' not in validation_data.columns:
    raise ValueError("'MODEL_TARGET' column not found in validation data")

In [None]:
print("Preparing training data...")
X_train = train_data.drop('MODEL_TARGET', axis=1)
y_train = train_data['MODEL_TARGET']
print(f"Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}")

In [None]:
print("Preparing validation data...")
X_validation = validation_data.drop('MODEL_TARGET', axis=1)
y_validation = validation_data['MODEL_TARGET']
print(f"Shape of X_validation: {X_validation.shape}, Shape of y_validation: {y_validation.shape}")

In [None]:
print("Preparing test data...")
X_test = test_data.drop('MODEL_TARGET', axis=1)
y_test = test_data['MODEL_TARGET']
print(f"Shape of X_test: {X_test.shape}, Shape of y_test: {y_test.shape}")

### Pipeline from model_pipeline.py

In [None]:
def _get_numeric_transformer(preprocessor):
    """Return the transformer registered under the name 'num' in *preprocessor*.

    ``named_transformers_`` only exists once the ColumnTransformer has been
    fitted, so an unfitted preprocessor is searched through its ``transformers``
    list instead.

    Raises:
        KeyError: If the preprocessor has no transformer named 'num'.
    """
    if hasattr(preprocessor, 'transformers_'):
        return preprocessor.named_transformers_['num']
    for name, transformer, _ in preprocessor.transformers:
        if name == 'num':
            return transformer
    raise KeyError("No transformer named 'num' found in the preprocessor")


def preprocess_multi_model(algorithm: str, data: pd.DataFrame, preprocessor: ColumnTransformer, partition: str) -> Tuple[object, Optional[List[int]]]:
    """
    Preprocess one data partition for the selected algorithm.

    For the 'train' partition the preprocessing is fitted on the data before
    transforming it; for any other partition the data is only transformed.

    For 'catboost' only the numeric columns are passed through the
    preprocessor's 'num' transformer; the categorical columns are converted to
    strings (missing values become the string 'nan') and appended after the
    numeric ones. The column lists come from the module-level
    ``numeric_columns`` and ``categorical_columns``.

    Args:
        algorithm (str): Algorithm name; 'catboost' selects the special path.
        data (pd.DataFrame): Feature data of the partition.
        preprocessor (ColumnTransformer): Preprocessor taken from the pipeline.
        partition (str): 'train' to fit and transform, anything else to transform only.

    Returns:
        tuple: ``(preprocessed_data, cat_features_indices)``. The second item is
        the list of categorical column positions for CatBoost and ``None`` for
        every other algorithm.
    """
    is_training = partition == 'train'
    if is_training:
        logger.info(f'Preprocessing training data for {algorithm}.')
    else:
        logger.info(f'Preprocessing {partition} data for {algorithm}.')

    if algorithm != 'catboost':
        if is_training:
            data_preprocessed = preprocessor.fit_transform(data)
        else:
            data_preprocessed = preprocessor.transform(data)
        logger.info('Data preprocessed.')
        return data_preprocessed, None  # Two values for consistency with the CatBoost path

    # CatBoost: preprocess the numeric columns only
    numeric_transformer = _get_numeric_transformer(preprocessor)
    data_numeric = data[numeric_columns]
    if is_training:
        data_numeric_preprocessed = numeric_transformer.fit_transform(data_numeric)
    else:
        data_numeric_preprocessed = numeric_transformer.transform(data_numeric)

    # Categorical columns are passed to CatBoost as strings
    data_categorical = data[categorical_columns].astype(str)

    # Numeric columns first, categorical columns last
    data_combined = np.hstack([data_numeric_preprocessed, data_categorical])

    # The categorical features occupy the positions after the numeric features
    cat_features_indices = list(range(data_numeric_preprocessed.shape[1], data_combined.shape[1]))
    logger.info('Data preprocessed.')
    return data_combined, cat_features_indices

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [None]:
print("Defining column types...")
numeric_columns = train_data.drop('MODEL_TARGET', axis=1).select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = train_data.drop('MODEL_TARGET', axis=1).select_dtypes(include=['object', 'category']).columns.tolist()
print("Column types defined.")
print(f'Number of numeric columns: {len(numeric_columns)} \n Numeric columns: {numeric_columns})')
print(f'Number of categorical columns: {len(categorical_columns)} \n categorical columns: {categorical_columns})')

In [None]:
# Create model pipeline
algorithm = 'logistic_regression'  # Change this to try different algorithms
use_grid_search = False  # Set True to use GridSearchCV, False for standard pipeline

# Generate training pipeline
training_pipeline = model_pipeline.create_pipeline(numeric_columns, categorical_columns, algorithm, use_grid_search=use_grid_search)

# Extract the preprocessor from the pipeline
preprocessor = training_pipeline.named_steps['preprocessor']

# Preprocess training data ('train' fits the preprocessing before transforming)
X_train_preprocessed, cat_features_indices = preprocess_multi_model(algorithm, X_train, preprocessor, 'train')

# Preprocess validation data (transform only)
X_val_preprocessed, _ = preprocess_multi_model(algorithm, X_validation, preprocessor, 'validation')

# Preprocess test data (transform only)
X_test_preprocessed, _ = preprocess_multi_model(algorithm, X_test, preprocessor, 'test')

# Extract the classifier from the pipeline; it is fitted directly on the
# preprocessed arrays in the cells below
classifier = training_pipeline.named_steps['classifier']
#classifier = training_pipeline # for GridSearchCV

#### Training and Validation (No Hyperparameter Tuning)

In [None]:
# Fit the model on the preprocessed training data
logger.info("Fitting the model...")
if algorithm == 'catboost':
    # CatBoost needs the positions of the categorical columns
    classifier.fit(X_train_preprocessed, y_train, cat_features=cat_features_indices)
else:
    classifier.fit(X_train_preprocessed, y_train)
# Persist the fitted model (same call for every algorithm)
model_pipeline.save_model(classifier, algorithm=algorithm, run=1)

# Evaluate the model on the validation set
logger.info("Making predictions on validation data...")
y_pred_validation = classifier.predict(X_val_preprocessed)

# Threshold-free metrics (ROC AUC, ROC / Precision-Recall curves, average
# precision) need continuous scores. Computing them from hard 0/1 predictions
# collapses each curve to a single operating point and misreports the area.
if hasattr(classifier, 'predict_proba'):
    y_pred_proba = classifier.predict_proba(X_val_preprocessed)[:, 1]
else:
    logger.warning("Classifier has no predict_proba; ranking metrics fall back to hard predictions.")
    y_pred_proba = y_pred_validation

# Compute and display metrics
accuracy = accuracy_score(y_validation, y_pred_validation)
roc_auc = roc_auc_score(y_validation, y_pred_proba)
precision = sk_precision_score(y_validation, y_pred_validation)
recall = recall_score(y_validation, y_pred_validation)
f1 = f1_score(y_validation, y_pred_validation)

logger.info(f"Accuracy: {accuracy}")
logger.info(f"ROC AUC: {roc_auc}")
logger.info(f"Precision: {precision}")
logger.info(f"Recall: {recall}")
logger.info(f"F1 Score: {f1} \n")

# Detailed classification report
report = classification_report(y_validation, y_pred_validation)
print("Classification Report for Validation Data:")
print(report)

# Generate the confusion matrix
# sklearn confusion matrix layout:
# TN  FP
# FN  TP
cm = confusion_matrix(y_validation, y_pred_validation)
# Display the confusion matrix
print("Confusion Matrix:")
print(cm)

# Compute ROC curve and ROC area from the scores
fpr, tpr, _ = roc_curve(y_validation, y_pred_proba)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Compute Precision-Recall curve and average precision from the scores.
# Precision-Recall curve robust with imbalanced datasets!
# Separate names keep the scalar precision/recall computed above intact.
precision_curve, recall_curve, _ = precision_recall_curve(y_validation, y_pred_proba)
average_precision = average_precision_score(y_validation, y_pred_proba)

# Plot the Precision-Recall curve
plt.figure()
plt.step(recall_curve, precision_curve, color='b', alpha=0.2, where='post')
plt.fill_between(recall_curve, precision_curve, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve: AP={average_precision:0.2f}')
plt.show()

logger.info(f"Average Precision: {average_precision:.2f}")

print("-" * 20)

logger.info("Model training and evaluation completed successfully.")


#### Training and Validation (Hyperparameter Tuning)

In [None]:
# Define hyperparameter grid for each model.
# `algorithm` selects which entry of the grid is searched below.
algorithm = 'logistic_regression'

# For classifiers with "scale_pos_weight" argument, compute "scale_pos_weight"
# as the ratio of negative to positive training rows.
# This argument helps the model handle class imbalances by giving more weight to the positive (fraud) cases.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos

# One entry per algorithm; every value is a list of candidates, and the search
# below tries the full Cartesian product of the lists for the chosen algorithm.
hyperparameter_grid = {
    "random_forest": {
        "n_estimators": [1500],
        "max_depth": [10, 20],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    },
    "xgb": {
        "max_depth": [6, 10],
        "learning_rate": [0.01, 0.05],
        "n_estimators": [1500],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        "scale_pos_weight": [scale_pos_weight]
    },
    "catboost": {
        "depth": [6, 10],
        "learning_rate": [0.01, 0.1],
        "iterations": [1000, 1200]
    },
    "logistic_regression": {
        "C": [0.01, 0.1, 0.5, 1.0, 5.0],
        "solver": ["liblinear", "lbfgs", "saga"],
        "penalty": ["l2"],
        "class_weight": ["balanced"],
        "max_iter": [200, 500]
    },
    "svm": {
        "C": [0.1, 1.0],
        "kernel": ["rbf"],
        "gamma": ["scale"],
        "max_iter": [1000, 2000]
    },
    "neural_network": {
        "hidden_layer_sizes": [(256, 128), (128, 64)],
        "alpha": [0.0001, 0.001],
        "max_iter": [500, 1000]
    },
    "light_gbm": {
        "objective": ["binary"],
        "num_leaves": [31],               # Controls complexity
        "max_depth": [10, 20],            # Maximum tree depth (LightGBM uses -1 for no limit)
        "learning_rate": [0.01, 0.05],   # Typical values
        "n_estimators": [1500],          # Number of boosting rounds
        "min_child_samples": [20, 40],        # Minimum data in a leaf
        "subsample": [0.8, 1.0],              # Row sampling (bagging)
        "colsample_bytree": [0.8, 1.0],       # Column sampling per tree
        "scale_pos_weight": [scale_pos_weight]
        #"reg_alpha": [0.0, 0.1],              # L1 regularization
        #"reg_lambda": [0.0, 0.1]              # L2 regularization
    }
}

# Get hyperparameter combinations for selected model:
# param_names holds the keys, param_combinations one tuple of values per combination
if algorithm in hyperparameter_grid:
    param_combinations = list(product(*hyperparameter_grid[algorithm].values()))
    param_names = list(hyperparameter_grid[algorithm].keys())
else:
    raise ValueError(f"No hyperparameter grid found for {algorithm}")

# Constructors keyed by algorithm name; each takes the current hyperparameter
# dict and returns an unfitted classifier.
model_builders = {
    'random_forest': lambda p: model_pipeline.CustomRandomForest(**p),
    'xgb': lambda p: xgb.XGBClassifier(**p, use_label_encoder=False, eval_metric="logloss", random_state=42),
    'catboost': lambda p: cb.CatBoostClassifier(**p, verbose=False, random_state=42),
    'logistic_regression': lambda p: model_pipeline.CustomLogisticRegression(**p),
    'svm': lambda p: model_pipeline.CustomSVM(**p),
    'neural_network': lambda p: model_pipeline.CustomNeuralNetwork(**p),
    'light_gbm': lambda p: LGBMClassifier(**p),
}

# Best candidate seen so far (selected by validation ROC AUC)
best_model, best_params = None, None
best_score = -np.inf

# Try every hyperparameter combination in turn
for param_values in param_combinations:
    params = dict(zip(param_names, param_values))
    logger.info(f"Training with parameters: {params}")

    # Build the classifier for the current combination
    if algorithm not in model_builders:
        raise ValueError(f"Unsupported algorithm: {algorithm}")
    classifier = model_builders[algorithm](params)

    # Fit on the training data; only CatBoost receives the categorical positions
    logger.info("Fitting the model...")
    fit_kwargs = {'cat_features': cat_features_indices} if algorithm == 'catboost' else {}
    classifier.fit(X_train_preprocessed, y_train, **fit_kwargs)

    # Score the validation set: hard labels plus class-1 probabilities for AUC-ROC
    logger.info("Making predictions on validation data...")
    y_pred_validation = classifier.predict(X_val_preprocessed)
    y_pred_proba = classifier.predict_proba(X_val_preprocessed)[:, 1]

    # Metrics for this combination
    accuracy = accuracy_score(y_validation, y_pred_validation)
    roc_auc = roc_auc_score(y_validation, y_pred_proba)
    precision = sk_precision_score(y_validation, y_pred_validation)
    recall = recall_score(y_validation, y_pred_validation)
    f1 = f1_score(y_validation, y_pred_validation)

    logger.info(f"Results for {params} - Accuracy: {accuracy}, ROC AUC: {roc_auc}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

    # Keep the combination with the strictly highest ROC AUC
    if roc_auc > best_score:
        best_score, best_model, best_params = roc_auc, classifier, params

# Report and persist the winning model
logger.info(f"Best parameters found: {best_params}")
model_pipeline.save_model(best_model, algorithm=algorithm, run=1)

# Evaluate best model on validation set
logger.info("Evaluating best model on validation set...")
y_pred_validation = best_model.predict(X_val_preprocessed)  # hard class labels
y_pred_proba = best_model.predict_proba(X_val_preprocessed)[:, 1]  # probability of class 1

# Compute and display metrics for best model.
# ROC AUC uses the probabilities; the other metrics use the hard labels.
accuracy = accuracy_score(y_validation, y_pred_validation)
roc_auc = roc_auc_score(y_validation, y_pred_proba)
precision = sk_precision_score(y_validation, y_pred_validation)
recall = recall_score(y_validation, y_pred_validation)
f1 = f1_score(y_validation, y_pred_validation)

logger.info(f"Best Model - Accuracy: {accuracy}, ROC AUC: {roc_auc}, Precision: {precision}, Recall: {recall}, F1 Score: {f1} \n")

# Print classification report
report = classification_report(y_validation, y_pred_validation)
print("Classification Report for Validation Data:")
print(report)

# Display confusion matrix (sklearn layout: rows = actual, columns = predicted)
cm = confusion_matrix(y_validation, y_pred_validation)
print("Confusion Matrix:")
print(cm)

# Compute and plot ROC curve
fpr, tpr, _ = roc_curve(y_validation, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Compute and plot Precision-Recall curve.
# NOTE: `precision` and `recall` are rebound here from scalars to the curve arrays.
precision, recall, _ = precision_recall_curve(y_validation, y_pred_proba)
average_precision = average_precision_score(y_validation, y_pred_proba)

plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(f'Precision-Recall curve: AP={average_precision:0.2f}')
plt.show()

logger.info(f"Average Precision: {average_precision:.2f}")
logger.info("Model training and evaluation completed successfully.")

### Test

In [None]:
# Path to the 'models' folder
models_folder = '/home/sagemaker-user/Product'

In [None]:
# Suffix of the saved model file to evaluate (update to the run of interest)
target_model_suffix = 'model_logistic_regression_1_2025-04-22 09:58:38.pkl'

# Loop through the model files in the 'models' folder
for model_file in os.listdir(models_folder):
    if not model_file.endswith(target_model_suffix):
        continue

    model_path = os.path.join(models_folder, model_file)

    # Exactly one loader runs per algorithm. With two separate `if` statements
    # the CatBoost predictions were overwritten by the pickle branch.
    if algorithm == 'catboost':
        # Load the CatBoost model
        catboost_model = cb.CatBoostClassifier()
        catboost_model.load_model(model_path)
        # predict_proba returns one column per class; keep the class 1 column
        y_pred_prob_test = catboost_model.predict_proba(X_test_preprocessed)
        y_pred_test = y_pred_prob_test[:, 1]
    elif algorithm == 'xgb':
        # Load the booster and predict on the test data
        bst = xgb.Booster(model_file=model_path)
        y_pred_test = bst.predict(xgb.DMatrix(X_test_preprocessed))
    else: # has to be an sklearn model, saved using pickle
        model = model_pipeline.load_model(model_path, algorithm)
        # Make predictions
        y_pred_test = model_pipeline.make_predictions(model, algorithm, X_test_preprocessed)

    # Convert the predictions to binary values (0 or 1)
    y_pred_rounded = np.round(y_pred_test)

    print(f"Model: {model_file}")

    # Compute and display metrics.
    # ROC AUC is computed from the unrounded predictions so that it reflects
    # ranking quality; the remaining metrics use the rounded labels.
    accuracy = accuracy_score(y_test, y_pred_rounded)
    roc_auc = roc_auc_score(y_test, y_pred_test)
    precision = sk_precision_score(y_test, y_pred_rounded)
    recall = recall_score(y_test, y_pred_rounded)
    f1 = f1_score(y_test, y_pred_rounded)

    print(f"Accuracy: {accuracy}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Get the classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred_rounded))

    # Generate the confusion matrix
    cm = confusion_matrix(y_test, y_pred_rounded)

    # Display the confusion matrix
    print("Confusion Matrix:")
    print(cm)

    # Compute ROC curve and ROC area from the unrounded predictions
    fpr, tpr, _ = roc_curve(y_test, y_pred_test)
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # Convert the predictions array into a DataFrame
    df_preds = pd.DataFrame(y_pred_test, columns=['Prediction'])

    # Join predictions to enquiry keys
    # Enquiry keys, model target and tvh partition contained in test_identifiers_df
    test_identifiers_path = f'/home/sagemaker-user/Product/test_identifiers_Generic_Client_DS_{ds_ratio}.csv'
    test_identifiers_df = pd.read_csv(test_identifiers_path)

    # The identifier file was written in the same row order as the test
    # features, so the join is positional; refuse to write misaligned output.
    if len(test_identifiers_df) != len(df_preds):
        raise ValueError(
            f"Row count mismatch: {len(test_identifiers_df)} identifiers vs {len(df_preds)} predictions"
        )
    results_df = pd.concat([test_identifiers_df.reset_index(drop=True), df_preds], axis=1)

    # Saving the combined DataFrame
    results_df.to_csv(f'/home/sagemaker-user/Product/Generic_Client_Output_{model_file}.csv', index=False)
    print("-" * 40)

### Logistic Regression Coefficients

In [None]:
# Get feature names for preprocessed features
# NOTE: get_transformed_feature_names is defined in the SHAP section further
# down this notebook; that cell must have been executed before this one.
feature_names = get_transformed_feature_names(preprocessor)

In [None]:
# Get coefficients from the logistic regression model
# NOTE(review): `model` is the object loaded in the Test section, not `best_model`
# from the tuning cell; it presumably wraps the fitted sklearn estimator in its
# `.model` attribute — confirm.
coefficients = model.model.coef_[0]  # shape: (n_features,)

In [None]:
coefficients

In [None]:
# Create DataFrame of feature importance
coef_df = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients,
    "AbsCoefficient": np.abs(coefficients)
})

In [None]:
# Sort by absolute value
coef_df_sorted = coef_df.sort_values(by="AbsCoefficient", ascending=False)
coef_df_sorted

In [None]:
top_n = 20  # Number of features (largest absolute coefficient first) to plot
top_features = coef_df_sorted.head(top_n)

plt.figure(figsize=(10, 6))
plt.barh(top_features["Feature"], top_features["Coefficient"])
plt.axvline(x=0, color='black', linestyle='--')  # separates negative from positive impact
# The title follows top_n so it stays correct when the number is changed
plt.title(f"Top {top_n} Logistic Regression Coefficients (Impact on Fraud Likelihood)")
plt.xlabel("Coefficient Value")
plt.ylabel("Feature")
plt.gca().invert_yaxis()  # largest absolute coefficient at the top
plt.tight_layout()
plt.show()

### SHAP

In [None]:
def _resolve_column_names(column_transformer, columns):
    """Return *columns* as names, mapping positional indices where possible."""
    if isinstance(columns, str):
        # A single column given as a bare string; avoid iterating its characters
        return [columns]
    names_in = getattr(column_transformer, 'feature_names_in_', None)
    if names_in is None:
        return columns
    try:
        selected = list(columns)
    except TypeError:  # e.g. a slice, which cannot be listed
        return columns
    is_positional = bool(selected) and all(
        isinstance(col, (int, np.integer)) and not isinstance(col, bool) for col in selected
    )
    if is_positional:
        return [names_in[col] for col in selected]
    return columns


def _pipeline_feature_names(pipeline_transformer, columns):
    """Return the output feature names of a fitted Pipeline fed with *columns*."""
    try:
        # Preferred: let the pipeline chain the input names through every step,
        # so one-to-one steps keep the real column names instead of x0, x1, ...
        return list(pipeline_transformer.get_feature_names_out(columns))
    except (AttributeError, ValueError, TypeError):
        pass  # a step cannot report names; fall back to the last step that can

    for _, step_transformer in reversed(pipeline_transformer.steps):
        if hasattr(step_transformer, 'get_feature_names_out'):
            if isinstance(step_transformer, OneHotEncoder):
                return list(step_transformer.get_feature_names_out(columns))
            return list(step_transformer.get_feature_names_out())
    # No transformer in pipeline supports get_feature_names_out
    return list(columns)


def get_transformed_feature_names(column_transformer):
    """
    Extract transformed feature names from a fitted ColumnTransformer.
    Handles pipelines and transformers with get_feature_names_out.

    Entries whose transformer is 'drop' (including a dropped remainder) and
    entries with an empty column selection contribute no names, because they
    produce no output columns. Positional column indices are mapped to names
    when the ColumnTransformer exposes ``feature_names_in_``.

    Args:
        column_transformer (ColumnTransformer): Fitted ColumnTransformer.

    Returns:
        List[str]: List of transformed feature names.
    """
    feature_names = []

    for name, transformer, original_features in column_transformer.transformers_:
        # Dropped columns never reach the output
        if isinstance(transformer, str) and transformer == 'drop':
            continue

        columns = _resolve_column_names(column_transformer, original_features)
        if hasattr(columns, '__len__') and len(columns) == 0:
            continue  # nothing selected, so nothing was fitted or emitted

        # Passed-through columns keep their original names
        if isinstance(transformer, str) and transformer == 'passthrough':
            feature_names.extend(columns)
            continue

        if isinstance(transformer, Pipeline):
            feature_names.extend(_pipeline_feature_names(transformer, columns))
        elif hasattr(transformer, 'get_feature_names_out'):
            # Transformer supports get_feature_names_out directly
            feature_names.extend(transformer.get_feature_names_out(columns))
        else:
            # Fallback to original feature names
            feature_names.extend(columns)

    return feature_names

In [None]:
# Get feature names for preprocessed features
feature_names = get_transformed_feature_names(preprocessor)

In [None]:
# --- SHAP works best on raw, unscaled features ---
# Use X_train_preprocessed *only if* preprocessing is minimal
# Otherwise use original features: X_train, X_validation

# The preprocessor may return a scipy sparse matrix, which pandas cannot turn
# into a regular column-per-feature DataFrame. Densify it first.
# NOTE: this materialises the full validation matrix in memory.
X_val_dense = X_val_preprocessed.toarray() if sparse.issparse(X_val_preprocessed) else X_val_preprocessed

# Construct a DataFrame with correct column names and index
X_explain = pd.DataFrame(X_val_dense, columns=feature_names, index=X_validation.index)

In [None]:
X_explain.head()

In [None]:
# NOTE(review): in the code shown, `bst` is only assigned in the Test section
# when algorithm == 'xgb'; with any other algorithm this name is undefined here.
explainer = shap.Explainer(bst, X_explain) # change 'bst' to be model agnostic
# Compute SHAP values
shap_values = explainer(X_explain)

In [None]:
# --- Global Summary Plot ---
shap.summary_plot(shap_values, X_explain, show=True)

In [None]:
# Use this when you want to rank features by importance (average impact)
shap.plots.bar(shap_values)

#### Explain One Fraud Case

In [None]:
# For one fraud case: take the *position* of the first validation row whose
# target is 1. shap_values is indexed by position, so an index label is only
# correct while the index is the default 0..n-1 range.
fraud_positions = np.flatnonzero(y_validation.to_numpy() == 1)
if fraud_positions.size == 0:
    raise ValueError("No fraud cases (MODEL_TARGET == 1) found in the validation set.")
fraud_index = int(fraud_positions[0])

# Show feature contributions using a waterfall plot (no JavaScript)
shap.plots.waterfall(shap_values[fraud_index])