# Model devepment

In this document we develop and compare different models for our model devepment. We have the following sections:

1. Model creation
2. Model evaluation
3. Model implementation on test data

Note that for model creation instead of running the code each time one can load the best model.



### Import libraries

In [4]:
from preprocessing import preprocessor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import uniform, randint
import joblib

### Data collection

In [None]:
# Load the dataset
total_df = pd.read_csv('../Data/Base.csv')

# Define features (X) and target (y)
X = total_df.drop(columns=['fraud_bool'])
y = total_df['fraud_bool']

# Split the data into training and test sets using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Data has been loaded")

# Apply the preprocessor to the training and test datasets
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


Data has been loaded


## 1. Model Creation

### 1a. Hyperparameter specification

We decided to use Logistic Regression, Random Forests, Support Vector Classifier, KNN, Gradient Boosting, XGBoost, LightGBM and Naive Bayes. The following hyperparameters were what was decided to be best.

In [44]:
# Pipeline with placeholder classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression())])

# Define models and their hyperparameters
models = {
    # 'Logistic Regression': (
    #     LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
    #     {'classifier__C': uniform(0.01, 10)}
    #  ), 
    # 'Random Forest': (
    #     RandomForestClassifier(class_weight='balanced', random_state=42),
    #     {
    #         'classifier__n_estimators': randint(50, 150),
    #         'classifier__max_depth': randint(3, 10)
    #     }
    # ),
    # 'SVC': (
    #     SVC(class_weight='balanced', probability=True, random_state=42),
    #     {
    #         'classifier__C': uniform(0.01, 10),
    #         'classifier__kernel': ['linear', 'rbf']
    #     }
    # ),
    # 'KNN': (
    #     KNeighborsClassifier(),
    #     {'classifier__n_neighbors': randint(3, 10)}
    # ),
    # 'Gradient Boosting': (
    #     GradientBoostingClassifier(random_state=42),
    #     {
    #         'classifier__n_estimators': randint(50, 150),
    #         'classifier__learning_rate': uniform(0.01, 0.2)
    #     }
    # ),
    'XGBoost': (
        XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', tree_method='gpu_hist'),
        {
            'classifier__n_estimators': randint(50, 150),
            'classifier__learning_rate': uniform(0.01, 0.2),
            'classifier__max_depth': randint(3, 10),
            'classifier__gamma': uniform(0, 0.5),
        }
    ),
    'LightGBM': (
        LGBMClassifier(random_state=42, device='gpu'),
        {
            'classifier__n_estimators': randint(50, 150),
            'classifier__learning_rate': uniform(0.01, 0.2),
            'classifier__max_depth': randint(3, 10),
            'classifier__num_leaves': randint(20, 50),
        }
    ),
    # 'Naive Bayes': (
    #     GaussianNB(),
    #     {}  # No hyperparameters for Naive Bayes
    # )
}

### 1b. Hyperparameter searching 

The next step is to do the hyperparameter search and we decided to do 3 random searches per model to keep the time complexity low. This code takes a lot of time to run but is a sacrifice our computers are willing to take.

### (WARNING: DON'T RUN CELL, LOAD SEARCH INSTEAD!!!)

In [45]:
import time
import joblib
import os
from datetime import datetime

# Create directory for saving models if it doesn't exist
os.makedirs('saved_models', exist_ok=True)

# Stratified K-Fold Cross-Validation
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Number of random searches per model
n_iter_per_model = 50  # Adjust as needed
best_models = []

# Dictionary to store all RandomizedSearchCV objects
search_results = {}

# Iterate through each model
version_counter = 1
for name, (model, params) in models.items():
    print(f"\n--- Starting RandomizedSearchCV for {name} (Version {version_counter}) ---\n")
    
    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    # Create RandomizedSearchCV
    search = RandomizedSearchCV(
        pipeline, 
        param_distributions=params,
        n_iter=n_iter_per_model,
        cv=stratified_cv,
        n_jobs=-1,
        random_state=42,
        scoring='roc_auc',
        verbose=3
    )

    # Start timing
    start_time = time.time()

    # Fit the model
    try:
        print(f"Fitting the model {name} with {n_iter_per_model} iterations and {stratified_cv.get_n_splits()} cross-validation splits...")
        search.fit(X_train, y_train)
    except Exception as e:
        print(f"An error occurred while fitting {name}: {e}")
        continue

    # End timing
    end_time = time.time()

    # Calculate elapsed time
    elapsed_time = end_time - start_time
    elapsed_time_str = f"{elapsed_time:.2f}s"
    print(f"\n--- Finished RandomizedSearchCV for {name} in {elapsed_time_str} ---\n")

    # Store the search object in the dictionary
    search_results[name] = search

    # Get current date and time for naming
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Create a unique filename with version number, model name, date, and training time
    filename = f"saved_models/{name}_v{version_counter}_{current_datetime}_{elapsed_time_str}.joblib"
    
    # Save each RandomizedSearchCV object immediately after training
    print(f"Saving RandomizedSearchCV results for {name} as {filename}...\n")
    joblib.dump(search, filename)
    print(f"RandomizedSearchCV for {name} saved successfully.\n")

    # Increment the version number for the next model
    version_counter += 1

print("\n--- All Models Processed ---\n")



--- Starting RandomizedSearchCV for XGBoost (Version 1) ---

Fitting the model XGBoost with 50 iterations and 5 cross-validation splits...
Fitting 5 folds for each of 50 candidates, totalling 250 fits



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.




--- Finished RandomizedSearchCV for XGBoost in 334.26s ---

Saving RandomizedSearchCV results for XGBoost as saved_models/XGBoost_v1_2024-11-20_00-23-10_334.26s.joblib...

RandomizedSearchCV for XGBoost saved successfully.


--- Starting RandomizedSearchCV for LightGBM (Version 2) ---

Fitting the model LightGBM with 50 iterations and 5 cross-validation splits...
Fitting 5 folds for each of 50 candidates, totalling 250 fits



    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] Number of positive: 8823, number of negative: 791177
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2307
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 46
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 SUPER, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 22 dense feature groups (18.31 MB) transferred to GPU in 0.013352 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011029 -> initscore=-4.496160
[LightGBM] [Info] Start training from score -4.496160

--- Finished RandomizedSearchCV for LightGBM in 681.50s ---

Saving RandomizedSearchCV results for LightGBM as saved_models/LightGBM_v2_2024-11-20_00-34-32_681.50s.joblib...

RandomizedSearchCV for LightGBM saved successfully.


--- All Models Processed ---



# 1b.5 AutoML

### 1c. Search saving and loading

Save search:

In [46]:
# Define the directory containing the saved joblib files
saved_models_dir = 'saved_models'

# Initialize the final search_results dictionary to store all models
search_results = {}

# Iterate over each joblib file in the directory
for filename in os.listdir(saved_models_dir):
    if filename.endswith(".joblib"):
        # Construct the full path to the file
        filepath = os.path.join(saved_models_dir, filename)

        # Load the model information from the file
        model_info = joblib.load(filepath)

        # Use the filename (without extension) as the key in the search_results dictionary
        model_key = filename.replace('.joblib', '')
        
        # Add the loaded model information to the search_results dictionary
        search_results[model_key] = model_info

# Save the combined search_results dictionary to a new joblib file
combined_filename = 'search_results.joblib'
joblib.dump(search_results, combined_filename)
print(f"Combined search results saved successfully as {combined_filename}")


Combined search results saved successfully as search_results.joblib



    E.g. tree_method = "hist", device = "cuda"



Load search:

In [42]:
search_results = joblib.load(f"search_results.joblib")

## 2. Model evaluation

### 2a. Random search evaluation

Due to the biased data set we use the auc roc score to evaluate different models. We start by printing the best model of each model class for our search. 

In [47]:
# Create a list for storing model information
results_summary = []

# Collect the best model, score, and parameters
for model_name, search in search_results.items():
    best_score = search.best_score_
    best_params = search.best_params_
    results_summary.append({
        'Model': model_name,
        'Best Score (AUC)': f"{best_score:.4f}",
        'Best Parameters': best_params
    })

# Convert to a DataFrame and sort by AUC score
results_df = pd.DataFrame(results_summary).sort_values(by='Best Score (AUC)', ascending=False)

# Display the DataFrame in Jupyter
from IPython.display import display

display(results_df)

Unnamed: 0,Model,Best Score (AUC),Best Parameters
2,LightGBM_v2_2024-11-20_00-34-32_681.50s,0.8903,{'classifier__learning_rate': 0.17043939615080...
7,XGBoost_v1_2024-11-20_00-23-10_334.26s,0.8903,"{'classifier__gamma': 0.018443473677266398, 'c..."
8,XGBoost_v3_2024-11-20_00-08-29_19.72s,0.8882,{'classifier__learning_rate': 0.08490802376947...
3,LightGBM_v4_2024-11-20_00-08-53_24.18s,0.8854,{'classifier__learning_rate': 0.08490802376947...
0,Gradient Boosting_v2_2024-11-20_00-08-09_585.78s,0.8847,{'classifier__learning_rate': 0.08490802376947...
4,Logistic Regression_search_results,0.8729,{'classifier__C': 9.51714306409916}
6,Random Forest_search_results,0.8691,"{'classifier__max_depth': 9, 'classifier__n_es..."
5,Naive Bayes_v5_2024-11-20_00-09-05_11.48s,0.8371,{}
1,KNN_v1_2024-11-19_23-58-23_500.97s,0.6448,{'classifier__n_neighbors': 9}


We can see that the best performing model on the trainig data is "TODO" with auc_roc score being "TODO". Now we will evaluate this model closer in the following section

### 2b. Best model evaluation

We begin the evaluation of the best model by extracting it from search_results.

In [9]:
# Initialize variables to track the best model
best_model_name = None
best_model_score = -float('inf')
best_model_params = None
best_model_object = None
best_classifier = None

# Iterate through the search results to find the best model
for model_name, search in search_results.items():
    if search.best_score_ > best_model_score:
        best_model_name = model_name
        best_model_score = search.best_score_
        best_model_params = search.best_params_
        best_model_object = search.best_estimator_

        # Extract the classifier from the pipeline
        best_classifier = best_model_object.named_steps['classifier']

# Print the best model details
print(f"Best Model Name: {best_model_name}")
print(f"Best Model Score (AUC): {best_model_score:.4f}")
print(f"Best Model Parameters: {best_model_params}")

# Print the best classifier object
print(f"Best Classifier Object: {best_classifier}")


Best Model Name: Logistic Regression
Best Model Score (AUC): 0.8729
Best Model Parameters: {'classifier__C': 3.7554011884736247}
Best Classifier Object: LogisticRegression(C=3.7554011884736247, class_weight='balanced', max_iter=1000,
                   random_state=42)


Next we evaluate the model on the training data. This gives:

## 3. Test data evaluation

In [None]:
# Evaluate the best model
y_pred = best_classifier.predict(X_test)
y_pred_proba = best_classifier.predict_proba(X_test)[:, 1] if hasattr(best_model_name.named_steps['classifier'], 'predict_proba') else y_pred

# Print the best model and its parameters
print(f"\nBest Model: {best_model_name}")
print(f"Best Cross-Validation AUC Score: {best_score:.4f}")

# Print classification report
print(classification_report(y_test, y_pred))

# Calculate and print AUC score on the test set
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score on Test Set: {auc_score:.4f}")