In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Use seaborn's "whitegrid" style for the plots produced later in the notebook
sns.set_style(style="whitegrid")
print("Libraries imported successfully.")

Load the Dataset

In [None]:
# Single source of truth for the raw CSV location (relative to this notebook),
# so the path that is read and the path that is reported can never disagree.
RAW_DATA_PATH = '../data/raw/PS_20174392719_1491204439457_log.csv'

try:
    df = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError:
    # Report the path that was actually tried; the previous message pointed at
    # a different directory than the one being read.
    print(f"Error: Dataset file not found. Make sure it's at '{RAW_DATA_PATH}'.")
    # Downstream cells check `df is not None` before doing any work.
    df = None
else:
    print("Dataset loaded successfully.")
    print(f"Original dataset shape: {df.shape}")

Preprocessing and Feature Engineering

In [None]:
if df is not None:
    # EDA showed fraud only in 'TRANSFER' and 'CASH_OUT' transactions, so all
    # other transaction types are discarded before modelling.
    fraud_prone_types = ['CASH_OUT', 'TRANSFER']
    is_fraud_prone = df['type'].isin(fraud_prone_types)
    df_filtered = df[is_fraud_prone].copy()
    print(f"Shape after filtering for transaction types: {df_filtered.shape}")

    # One-hot encode 'type'; drop_first=True removes the first indicator column
    # so the remaining ones are not redundant.
    df_filtered = pd.get_dummies(
        df_filtered,
        columns=['type'],
        prefix='type',
        drop_first=True,
    )

    # Target plus the columns that are excluded from the feature matrix.
    non_feature_columns = ['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest']
    X = df_filtered.drop(columns=non_feature_columns)
    y = df_filtered['isFraud']

    print("\nFeatures selected for the model:")
    print(X.columns.tolist())
    print(f"\nFeatures shape: {X.shape}")
    print(f"Target shape: {y.shape}")

Split Data into Training and Testing Sets

In [None]:
if 'X' in locals():
    # The split happens before SMOTE is applied; stratify=y keeps the class
    # ratio the same in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=42,
    )
    print("Data split into training and testing sets.")
    for split_name, split_frame in (("X_train", X_train), ("X_test", X_test)):
        print(f"{split_name} shape: {split_frame.shape}")

Handle Class Imbalance with SMOTE

In [None]:
if 'X_train' in locals():
    # Class counts before oversampling.
    print("\nOriginal training set distribution:")
    class_counts_before = y_train.value_counts()
    print(class_counts_before)

    # Oversample the training split only, with a fixed seed for repeatability.
    smote = SMOTE(random_state=42)
    resampled = smote.fit_resample(X_train, y_train)
    X_train_resampled, y_train_resampled = resampled

    # Class counts after oversampling.
    print("\nResampled training set distribution:")
    class_counts_after = y_train_resampled.value_counts()
    print(class_counts_after)

Train the XGBoost Model

In [None]:
if 'X_train_resampled' in locals():
    # Initialize the XGBoost classifier.
    # The training data has already been resampled with SMOTE, so
    # scale_pos_weight is deliberately left at its default. If extra emphasis
    # on the positive class is ever wanted, it could be set to
    # y_train.value_counts()[0] / y_train.value_counts()[1].
    #
    # `use_label_encoder` is no longer passed: the option is deprecated in
    # recent XGBoost releases and only produces an "unused parameter" warning.
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='aucpr',  # Area Under Precision-Recall Curve
        random_state=42
    )

    print("\nTraining the XGBoost model...")
    model.fit(X_train_resampled, y_train_resampled)
    print("Model training completed.")


Evaluate the Model 

In [None]:
if 'model' in locals():
    print("\nEvaluating model performance on the test set...")
    # Hard class predictions plus the probability of the positive class
    # (column 1 of predict_proba).
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # --- Classification report ---
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # --- Confusion matrix heatmap ---
    class_labels = ['Not Fraud', 'Fraud']
    cm = confusion_matrix(y_test, y_pred)
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=cm_ax,
    )
    cm_ax.set_title('Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    plt.show()

    # --- Precision-recall curve and the area under it ---
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)

    pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
    pr_ax.plot(recall, precision, label=f'PR AUC = {pr_auc:.4f}')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.legend(loc='lower left')
    pr_ax.grid(True)
    plt.show()

    print(f"\nArea Under Precision-Recall Curve (PR AUC): {pr_auc:.4f}")

Feature Importance 

In [None]:
if 'model' in locals():
    # Create the axes explicitly and hand them to plot_importance. When no
    # `ax` is given, plot_importance opens a figure of its own, which left the
    # 12x8 figure blank and drew the chart at the default size instead.
    fig, ax = plt.subplots(figsize=(12, 8))
    xgb.plot_importance(model, ax=ax, height=0.8, max_num_features=10)
    ax.set_title('XGBoost Feature Importance')
    plt.show()

In [None]:
# Save the Model

In [None]:
if 'model' in locals():
    model_path = '../models/xgboost_model.pkl'
    columns_path = '../models/model_columns.pkl'

    # Make sure the output directory exists before writing; on a fresh
    # checkout it may be missing, and the dump would otherwise fail with
    # FileNotFoundError. Both artefacts live in the same directory.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Persist the trained classifier.
    joblib.dump(model, model_path)
    print(f"\nModel saved successfully to {model_path}")

    # Also save the feature column order to ensure consistency during prediction
    joblib.dump(X.columns.tolist(), columns_path)
    print(f"Model columns saved successfully to {columns_path}")