<a href="https://colab.research.google.com/github/Chun1225/Imperial-Research-Project/blob/main/RP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pyg-lib using the recommended method from the error message
# This ensures compatibility with your installed PyTorch and CUDA versions
import torch

# Detect PyTorch and CUDA version
TORCH_VERSION = torch.__version__.split('+')[0]
CUDA_VERSION = torch.version.cuda

if CUDA_VERSION:
    # Format as e.g. 'cu118' or 'cu121'
    CUDA_str = f"cu{CUDA_VERSION.replace('.', '')}"
else:
    # If no CUDA, use cpu version
    CUDA_str = 'cpu'

# Construct and execute the correct installation command
# !pip is the syntax to run shell commands in Jupyter/Colab environments
!pip install pyg-lib -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+{CUDA_str}.html

In [None]:
pip install category_encoders catboost dataframe_image Selenium torch_geometric

In [None]:
import pandas as pd
import numpy as np
import time
import warnings

import matplotlib.pyplot as plt
import seaborn as sns
import dataframe_image as dfi
from IPython.display import display, Image

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, average_precision_score, precision_recall_curve

import category_encoders as ce

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import torch
from torch_geometric.data import HeteroData
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear, BatchNorm, GATv2Conv
from torch_geometric.loader import LinkNeighborLoader
from tqdm.auto import tqdm

In [None]:
# Input CSV locations (Google Drive mount).
TRANSACTIONS_PATH = '/content/drive/MyDrive/credit_card_transactions-ibm_v2.csv'
CARDS_INFO_PATH = '/content/drive/MyDrive/sd254_cards.csv'
USERS_INFO_PATH = '/content/drive/MyDrive/sd254_users.csv'

# Number of transactions drawn for the experiments and the seed that makes the draw reproducible.
SAMPLE_SIZE = 1_500_000
SAMPLE_SEED = 335

full_df = pd.read_csv(TRANSACTIONS_PATH)
# Cap the request at the number of available rows so a smaller extract does not make sample() raise.
df_trans = full_df.sample(n=min(SAMPLE_SIZE, len(full_df)), random_state=SAMPLE_SEED)
df_cards = pd.read_csv(CARDS_INFO_PATH)
df_users = pd.read_csv(USERS_INFO_PATH)

# Strip the currency symbol so 'Amount' can be parsed as a float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid escape sequence.
df_trans['Amount'] = df_trans['Amount'].replace({r'\$': ''}, regex=True).astype(float)
# Binary target: 1 for 'Yes', 0 for anything else (vectorized comparison instead of a per-row lambda).
df_trans['Is Fraud?'] = (df_trans['Is Fraud?'] == 'Yes').astype(int)

# Attach the owner's profile to every card.
# NOTE(review): assumes both df_cards and df_users expose a 'User' column — confirm against the CSV headers.
df_card_owner_profile = pd.merge(
    df_cards,
    df_users,
    on='User',
    how='left'
)

# Attach card + owner attributes to every sampled transaction.
# On a column-name clash the transaction-side column gets the '_trans' suffix
# and the card/user-side column keeps its name.
df_full_context = pd.merge(
    df_trans,
    df_card_owner_profile,
    on=['User', 'Card'],
    how='left',
    suffixes=('_trans', '')
)

print("dim:", df_full_context.shape)

display(df_full_context.head())
print("Info: ")
df_full_context.info(verbose=True, show_counts=True)

In [None]:
# Bar chart of the class balance (fraud vs. non-fraud) in the merged frame.
# The plot is only drawn when the merged frame exists in this session.
if 'df_full_context' in locals():
    sns.set_style("whitegrid")
    plt.figure(figsize=(8, 6))

    class_palette = ['#66b3ff', '#ff9999']
    ax = sns.countplot(data=df_full_context, x='Is Fraud?', palette=class_palette)

    ax.set_title('Distribution of Fraud vs Non-Fraud Samples', fontsize=16, pad=20)
    ax.set_xlabel('Class', fontsize=12)
    ax.set_ylabel('Number of Transactions', fontsize=12)
    ax.set_xticklabels(['Non-Fraud (0)', 'Fraud (1)'])

    # Write the (thousands-separated) count just above each bar.
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{int(bar_height):,}',
            (bar_center_x, bar_height),
            ha='center',
            va='center',
            xytext=(0, 9),
            textcoords='offset points',
            fontsize=11,
        )

    plt.show()
else:
    print("without df_full_context。")

In [None]:
def split_composite_errors(error_string: str) -> pd.Series:
    """Split an error description into its first error and the remainder.

    The input is converted to ``str`` and stripped. If it contains a comma it
    is split at the first comma only and both parts are stripped; otherwise
    the whole text is the first element and the second is the string 'None'.

    Returns:
        A two-element ``pd.Series``: ``[first_error, remaining_errors]``.
    """
    text = str(error_string).strip()
    if ',' not in text:
        return pd.Series([text, 'None'])
    first_error, remainder = text.split(',', 1)
    return pd.Series([first_error.strip(), remainder.strip()])

def create_final_feature_set(df: pd.DataFrame) -> pd.DataFrame:
    """Build the modelling table from the merged transaction/card/user frame.

    Every step is applied only when the columns it needs are present:

    * currency-like columns are stripped of '$' and ',' and parsed as numbers;
    * 'Amount' is replaced by its absolute value;
    * identifier/categorical columns are cast to ``str``;
    * 'Card' becomes '<User>_<Card>';
    * cyclic day-of-week / hour features and the account age (in years) are
      derived from the transaction date and 'Acct Open Date';
    * home-state / home-city flags and several amount/debt ratios are added;
    * 'Errors?' is split at its first comma into 'Error1' and 'Error2'.

    Only a fixed list of feature columns is kept. Remaining missing values are
    filled with the column median (numeric columns) or 'Unknown' (all others).

    Args:
        df: Merged input frame. It is not modified.

    Returns:
        A new DataFrame containing the subset of final features present in ``df``.
    """
    df_processed = df.copy()

    cols_to_clean_numeric = ['Credit Limit', 'Yearly Income - Person', 'Total Debt', 'Per Capita Income - Zipcode']
    for col in cols_to_clean_numeric:
        if col in df_processed.columns:
            df_processed[col] = pd.to_numeric(
                df_processed[col].astype(str).str.replace(r'[$,]', '', regex=True),
                errors='coerce',
            )
    if 'Amount' in df_processed.columns:
        df_processed['Amount'] = df_processed['Amount'].abs()

    id_cols_to_str = ['User', 'Merchant Name', 'Zip', 'MCC', 'Card Brand', 'Card Type', 'Use Chip', 'Has Chip', 'State', 'Merchant State', 'Merchant City', 'City', 'Gender', 'Card on Dark Web']
    for col in id_cols_to_str:
        if col in df_processed.columns:
            # Note: astype(str) turns missing values into a literal string, so they are
            # not touched by the final 'Unknown' fill below.
            df_processed[col] = df_processed[col].astype(str)
    if 'Errors?' in df_processed.columns:
        df_processed['Errors?'] = df_processed['Errors?'].fillna('No Error').astype(str)

    # Prefix the card value with the user id so 'Card' carries '<User>_<Card>'.
    if 'User' in df_processed.columns and 'Card' in df_processed.columns:
        df_processed['Card'] = df_processed['User'] + '_' + df_processed['Card'].astype(str)

    if all(c in df_processed.columns for c in ['Year', 'Month', 'Day', 'Time']):
        # Column-wise string concatenation builds the same 'Y-M-D' text as a
        # row-wise '-'.join, without a Python-level call per row.
        date_text = (
            df_processed['Year'].astype(str) + '-'
            + df_processed['Month'].astype(str) + '-'
            + df_processed['Day'].astype(str)
        )
        df_processed['trans_datetime'] = pd.to_datetime(date_text + ' ' + df_processed['Time'], errors='coerce')
        # Sine/cosine pairs keep the ends of each cycle adjacent (period 7 days / 24 hours).
        day_of_week = df_processed['trans_datetime'].dt.dayofweek
        hour_of_day = df_processed['trans_datetime'].dt.hour
        df_processed['day_of_week_sin'] = np.sin(2 * np.pi * day_of_week / 7.0)
        df_processed['day_of_week_cos'] = np.cos(2 * np.pi * day_of_week / 7.0)
        df_processed['hour_sin'] = np.sin(2 * np.pi * hour_of_day / 24.0)
        df_processed['hour_cos'] = np.cos(2 * np.pi * hour_of_day / 24.0)
        if 'Acct Open Date' in df_processed.columns:
            acct_open_year = pd.to_datetime(df_processed['Acct Open Date'], format='%m/%Y', errors='coerce').dt.year
            df_processed['account_age_at_transaction'] = df_processed['Year'] - acct_open_year

    if 'State' in df_processed.columns and 'Merchant State' in df_processed.columns:
        df_processed['is_in_home_state'] = (df_processed['State'] == df_processed['Merchant State']).astype(int)
    if 'City' in df_processed.columns and 'Merchant City' in df_processed.columns:
        df_processed['is_in_home_city'] = (df_processed['City'] == df_processed['Merchant City']).astype(int)

    # Ratio features; a zero denominator is replaced by 1e-6 to avoid division by zero.
    if 'Amount' in df_processed.columns and 'Credit Limit' in df_processed.columns:
        df_processed['amount_to_limit_ratio'] = df_processed['Amount'] / df_processed['Credit Limit'].replace(0, 1e-6)
    if 'Amount' in df_processed.columns and 'Yearly Income - Person' in df_processed.columns:
        df_processed['amount_to_personal_income_ratio'] = df_processed['Amount'] / df_processed['Yearly Income - Person'].replace(0, 1e-6)
    if 'Total Debt' in df_processed.columns and 'Yearly Income - Person' in df_processed.columns:
        df_processed['debt_to_income_ratio'] = df_processed['Total Debt'] / df_processed['Yearly Income - Person'].replace(0, 1e-6)
    if 'Amount' in df_processed.columns and 'Per Capita Income - Zipcode' in df_processed.columns:
        df_processed['amount_to_zip_income_ratio'] = df_processed['Amount'] / df_processed['Per Capita Income - Zipcode'].replace(0, 1e-6)

    if 'Errors?' in df_processed.columns:
        # Vectorized split at the first comma: partition() always yields
        # (before, separator, after), so rows without a comma are detected by an
        # empty separator and get the placeholder 'None' as second error.
        error_parts = df_processed['Errors?'].str.strip().str.partition(',')
        df_processed['Error1'] = error_parts[0].str.strip()
        df_processed['Error2'] = error_parts[2].str.strip().where(error_parts[1] == ',', 'None')

    final_features_to_keep = [
        'User', 'Card', 'Is Fraud?', 'Amount', 'MCC', 'Card Brand', 'Card Type', 'Has Chip', 'Use Chip',
        'Cards Issued', 'Credit Limit', 'Current Age', 'Gender', 'FICO Score',
        'Num Credit Cards', 'Yearly Income - Person',
        'Merchant Name', 'Merchant State', 'Merchant City', 'State', 'City',
        'Total Debt',
        'Per Capita Income - Zipcode',
        'day_of_week_sin', 'day_of_week_cos', 'hour_sin', 'hour_cos',
        'account_age_at_transaction',
        'is_in_home_state', 'is_in_home_city',
        'amount_to_limit_ratio', 'amount_to_personal_income_ratio',
        'debt_to_income_ratio', 'amount_to_zip_income_ratio',
        'Error1', 'Error2'
    ]

    final_cols_exist = [col for col in final_features_to_keep if col in df_processed.columns]
    df_final = df_processed[final_cols_exist].copy()

    for col in df_final.columns:
        # Decide by "is numeric" rather than by dtype name so that string-typed
        # columns that are not plain 'object' never reach median().
        if pd.api.types.is_numeric_dtype(df_final[col]):
            df_final[col] = df_final[col].fillna(df_final[col].median())
        else:
            df_final[col] = df_final[col].fillna('Unknown')

    print(f"dim: {df_final.shape}")
    return df_final

In [None]:
# Build the modelling table, then report its shape and column schema.
df_final = create_final_feature_set(df_full_context)

print("info: ", df_final.shape, sep="\n")
df_final.info()

In [None]:
# Report how many distinct values each object-typed column has; columns with
# fewer than 20 distinct values also get their full value counts printed.
print("Cardinality")

categorical_features = list(df_final.select_dtypes(include=['object']).columns)

print(f" {len(categorical_features)} Categorical variables：\n")

for col in categorical_features:
    column_values = df_final[col]
    unique_count = column_values.nunique()
    print(f"'{col}': {unique_count} unique value")

    if unique_count >= 20:
        continue

    print(column_values.value_counts().to_string())
    print("-" * 30)

In [None]:
# Baseline experiment: every (categorical encoder, classifier) pair is scored with
# 5-fold stratified cross-validation. Per fold the pipeline is fit on the training
# split, and PR AUC plus the best-F1 operating point are computed on the validation split.
TARGET = 'Is Fraud?'
features_df = df_final.drop(columns=[TARGET], errors='ignore')
numerical_features = list(features_df.select_dtypes(include=np.number).columns)
categorical_features = list(features_df.select_dtypes(include=['object', 'category']).columns)

# Categoricals with at most 2 distinct values are one-hot encoded; all others
# go through the encoder under test.
category_cardinality = {col: features_df[col].nunique() for col in categorical_features}
low_cardinality_features = [col for col, n_unique in category_cardinality.items() if n_unique <= 2]
high_cardinality_features = [col for col, n_unique in category_cardinality.items() if n_unique > 2]

X = df_final.drop(columns=[TARGET])
y = df_final[TARGET]

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

high_cardinality_encoders = {
    "Ordinal": ce.OrdinalEncoder(handle_unknown='value', handle_missing='value'),
    "Frequency": ce.CountEncoder(normalize=True, handle_unknown='value', handle_missing='value'),
    "WOE": ce.WOEEncoder(regularization=40, handle_unknown='value', handle_missing='value'),
    "JamesStein": ce.JamesSteinEncoder(model='binary', handle_unknown='value', handle_missing='value'),
    "MEstimate": ce.MEstimateEncoder(m=25),
}

# Each entry is (estimator class, constructor kwargs); a fresh estimator is built per fold.
models = {
    "DecisionTree": (DecisionTreeClassifier, {'random_state': 335, 'class_weight': 'balanced'}),
    "RandomForest": (RandomForestClassifier, {'random_state': 335, 'n_jobs': -1, 'class_weight': 'balanced'}),
    "XGBoost": (xgb.XGBClassifier, {'random_state': 335, 'eval_metric': 'aucpr', 'tree_method': 'hist', 'device': 'cuda'}),
    "CatBoost": (cb.CatBoostClassifier, {'random_state': 335, 'task_type': 'GPU', 'verbose': 0}),
}

one_hot_encoder = ce.OneHotEncoder(handle_unknown='value', handle_missing='value', use_cat_names=True)

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring_metrics = {
    'pr_auc': 'average_precision',
    'f1': 'f1',
    'precision': 'precision',
    'recall': 'recall',
}

results = []

for encoder_name, encoder in high_cardinality_encoders.items():
    for model_name, (model_class, model_params) in models.items():
        start_time = time.time()
        print(f"\n{encoder_name} + {model_name}")

        fold_scores = []
        for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y)):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            # Negative/positive ratio of this fold's training labels; only XGBoost receives it.
            n_negative = (y_train == 0).sum()
            n_positive = (y_train == 1).sum()
            dynamic_scale_pos_weight = n_negative / n_positive
            current_model_params = dict(model_params)
            if model_name == "XGBoost":
                current_model_params['scale_pos_weight'] = dynamic_scale_pos_weight
            current_model = model_class(**current_model_params)

            numeric_branch = Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ])
            high_cardinality_branch = Pipeline([
                ('encoder', encoder),
                ('scaler', StandardScaler()),
            ])
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_branch, numerical_features),
                    ('cat_low', one_hot_encoder, low_cardinality_features),
                    ('cat_high', high_cardinality_branch, high_cardinality_features),
                ],
                remainder='passthrough',
            )

            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier', current_model),
            ])
            pipeline.fit(X_train, y_train)

            y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
            precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
            # F1 at every point of the PR curve; points where precision + recall == 0 get F1 = 0.
            pr_sum = precisions + recalls
            f1_scores = np.divide(2 * precisions * recalls, pr_sum, out=np.zeros_like(precisions), where=pr_sum != 0)
            best_f1_idx = np.argmax(f1_scores)

            # The PR curve has one more point than thresholds; fall back to 1.0 for that last point.
            best_threshold = thresholds[best_f1_idx] if best_f1_idx < len(thresholds) else 1.0

            fold_scores.append({
                'PR AUC': average_precision_score(y_val, y_pred_proba),
                'Best F1-Score': f1_scores[best_f1_idx],
                'Best Threshold': best_threshold,
                'Precision at Best F1': precisions[best_f1_idx],
                'Recall at Best F1': recalls[best_f1_idx],
            })

        duration = time.time() - start_time
        avg_scores = pd.DataFrame(fold_scores).mean().to_dict()

        results.append({
            "Encoder": encoder_name,
            "Model": model_name,
            "PR AUC": avg_scores['PR AUC'],
            "F1-Score": avg_scores['Best F1-Score'],
            "Precision": avg_scores['Precision at Best F1'],
            "Recall": avg_scores['Recall at Best F1'],
            "Duration (s)": duration,
        })
        print(f"time: {duration:.2f} s, F1-Score: {avg_scores['Best F1-Score']:.4f}")

# Summary table, best mean F1 first.
results_df = pd.DataFrame(results).rename(columns={
    "F1-Score": "Best F1-Score",
    "Precision": "Precision at Best F1",
    "Recall": "Recall at Best F1",
})
column_order = ['Encoder', 'Model', 'PR AUC', 'Best F1-Score', 'Precision at Best F1', 'Recall at Best F1', 'Duration (s)']
results_df_sorted = (
    results_df[column_order]
    .sort_values(by="Best F1-Score", ascending=False)
    .reset_index(drop=True)
)

print("\nResult: ")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(results_df_sorted)


In [None]:
# Encode a single feature with each encoder (fit on the first CV training split),
# rescale every encoding to [0, 1] and stack the results in long format for plotting.
feature_to_check = "State"

X = df_final.drop(TARGET, axis=1)
y = df_final[TARGET]

# Only the training indices of the first fold are used.
train_idx, _ = next(iter(cv_strategy.split(X, y)))
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]

# (display name, encoder, whether fit_transform receives the target)
encoder_specs = [
    ('M Estimate', ce.MEstimateEncoder(m=25), True),
    ('James-Stein', ce.JamesSteinEncoder(model='binary', handle_unknown='value', handle_missing='value'), True),
    ('WOE', ce.WOEEncoder(regularization=40, handle_unknown='value', handle_missing='value'), True),
    ('Ordinal', ce.OrdinalEncoder(handle_unknown='value', handle_missing='value'), False),
    ('Frequency', ce.CountEncoder(normalize=True, handle_unknown='value', handle_missing='value'), False),
]

# A single scaler object is re-fit for every encoding.
scaler = MinMaxScaler()

encoders_data = {}
for display_name, feature_encoder, uses_target in encoder_specs:
    feature_frame = X_train[[feature_to_check]]
    if uses_target:
        encoded = feature_encoder.fit_transform(feature_frame, y_train)
    else:
        encoded = feature_encoder.fit_transform(feature_frame)
    encoders_data[display_name] = scaler.fit_transform(encoded[[feature_to_check]].values).flatten()

plot_df = pd.DataFrame(encoders_data).melt(var_name='Encoder', value_name='Value')


In [None]:
# Ridge plot: one density curve per encoder for the scaled encodings in plot_df.
# Transparent axes backgrounds let the vertically overlapping facets show through.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

g = sns.FacetGrid(plot_df, row="Encoder", hue="Encoder", aspect=9, height=1.5, palette="deep")

# Filled density first, then a thin black outline on top of it.
g.map_dataframe(sns.kdeplot, x="Value", fill=True, alpha=0.8)
g.map_dataframe(sns.kdeplot, x="Value", color='black', lw=1)

def label(x, color, label):
    """Write the facet's label in bold at the left edge of the current axes.

    Called through ``g.map`` below; ``x`` and ``color`` are accepted but unused.
    """
    ax = plt.gca()
    ax.text(0, 0.2, label, fontweight="bold", color='black',
                ha="left", va="center", transform=ax.transAxes)

g.map(label, "Value")

# Negative spacing makes neighbouring facets overlap.
g.figure.subplots_adjust(hspace=-0.2)

g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

# Name the plotted feature from feature_to_check so the label stays correct
# when a different feature is selected.
plt.xlabel(f'Density functions of the encoded feature "{feature_to_check}"', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# CatBoost feature importance under different categorical encoders.
# For each encoder a pipeline is fit on the full (X, y) and the classifier's
# feature importances are collected, printed and finally plotted side by side.
print("Feature Important")

encoders_for_importance = {
    "Freq_encoder" : ce.CountEncoder(normalize=True),
    "OrdinalEncoder": ce.OrdinalEncoder(),
    "WOEEncoder": ce.WOEEncoder(),
}

# The CatBoost configuration comes from the `models` registry, whose entries are
# (estimator class, constructor kwargs) tuples; a fresh estimator is instantiated
# per encoder, so nothing has to be cloned.
catboost_class, catboost_params = models['CatBoost']

all_importance_dfs = []

for encoder_name, encoder in encoders_for_importance.items():
    print(f"\nFeature Important: CatBoost + {encoder_name} {'='*25}")

    # Each encoder instance above is used for exactly one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocessor', ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', encoder, categorical_features)
            ],
            remainder='passthrough'
        )),
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
        ('classifier', catboost_class(**catboost_params))
    ])

    pipeline.fit(X, y)

    try:
        feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
        importances = pipeline.named_steps['classifier'].feature_importances_

        importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values(by='Importance', ascending=False).reset_index(drop=True)
    except (AttributeError, ValueError) as exc:
        # Best effort: an encoder that cannot report output feature names (or a
        # name/importance length mismatch) skips this encoder instead of aborting the cell.
        print(f"Could not extract feature importances for {encoder_name}: {exc}")
        continue

    importance_df['Encoder'] = encoder_name
    all_importance_dfs.append(importance_df)

    print("Top 20:")
    print(importance_df.head(20).to_string(index=False))


if all_importance_dfs:
    combined_importance_df = pd.concat(all_importance_dfs, ignore_index=True)

    # Top 10 features per encoder, grouped by encoder and ordered by descending importance.
    top_features_df = (
        combined_importance_df
        .sort_values(['Encoder', 'Importance'], ascending=[True, False])
        .groupby('Encoder')
        .head(10)
        .reset_index(drop=True)
    )

    plt.figure(figsize=(15, 10))
    sns.barplot(
        data=top_features_df,
        x='Importance',
        y='Feature',
        hue='Encoder',
        dodge=False
    )
    plt.title('Top 10 Feature Importance for CatBoost with Different Encoders', fontsize=16)
    plt.xlabel('Importance Score', fontsize=12)
    plt.ylabel('Feature Name', fontsize=12)
    plt.legend(title='Encoder')
    plt.tight_layout()
    plt.show()

# Strategy 1

In [None]:
# Strategy 1: categoricals with at most 15 distinct values are one-hot encoded and
# the remaining ones go through the encoder under test. Every (encoder, model) pair
# is scored with 5-fold stratified cross-validation.
TARGET = 'Is Fraud?'
features_df = df_final.drop(columns=[TARGET], errors='ignore')
numerical_features = features_df.select_dtypes(include=np.number).columns.tolist()
categorical_features = features_df.select_dtypes(include=['object', 'category']).columns.tolist()

low_cardinality_features = [col for col in categorical_features if features_df[col].nunique() <= 15]
high_cardinality_features = [col for col in categorical_features if features_df[col].nunique() > 15]

X = df_final.drop(TARGET, axis=1)
y = df_final[TARGET]

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

high_cardinality_encoders = {
    "Ordinal": ce.OrdinalEncoder(handle_unknown='value', handle_missing='value'),
    "Frequency" : ce.CountEncoder(normalize=True, handle_unknown='value', handle_missing='value'),
    "WOE": ce.WOEEncoder(regularization=40, handle_unknown='value', handle_missing='value'),
    "JamesStein": ce.JamesSteinEncoder(model='binary', handle_unknown='value', handle_missing='value'),
    "MEstimate": ce.MEstimateEncoder(m=25),
}

# Each entry is (estimator class, constructor kwargs); a fresh estimator is built per fold.
models = {
    "XGBoost": (xgb.XGBClassifier, {'random_state': 335, 'eval_metric': 'aucpr', 'tree_method': 'hist', 'device': 'cuda'}),
    "CatBoost": (cb.CatBoostClassifier, {'random_state': 335, 'task_type': 'GPU', 'verbose': 0})
}

one_hot_encoder = ce.OneHotEncoder(handle_unknown='value', handle_missing='value', use_cat_names=True)

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring_metrics = {
    'pr_auc': 'average_precision',
    'f1': 'f1',
    'precision': 'precision',
    'recall': 'recall'
}

results = []

for encoder_name, encoder in high_cardinality_encoders.items():
    for model_name, (model_class, model_params) in models.items():
        start_time = time.time()
        print(f"\n{encoder_name} + {model_name}...")

        fold_scores = []
        for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Negative/positive ratio of this fold's training labels; only XGBoost receives it.
            dynamic_scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
            current_model_params = model_params.copy()
            if model_name in ["XGBoost"]:
                current_model_params['scale_pos_weight'] = dynamic_scale_pos_weight
            current_model = model_class(**current_model_params)

            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', Pipeline([
                        ('imputer', SimpleImputer(strategy='median')),
                        ('scaler', StandardScaler())
                    ]), numerical_features),
                    ('cat_low', one_hot_encoder, low_cardinality_features),
                    ('cat_high', Pipeline([
                        ('encoder', encoder),
                        ('scaler', StandardScaler())
                    ]), high_cardinality_features),
                ],
                remainder='passthrough'
            )

            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier', current_model)
            ])

            # Fit exactly once per fold: training dominates the cost of this loop,
            # and a second identical fit only repeats the same work.
            pipeline.fit(X_train, y_train)

            y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
            precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
            # F1 at every point of the PR curve; points where precision + recall == 0 get F1 = 0.
            f1_scores = np.divide(2 * precisions * recalls, precisions + recalls, out=np.zeros_like(precisions), where=(precisions + recalls) != 0)
            best_f1_idx = np.argmax(f1_scores)

            fold_scores.append({
                'PR AUC': average_precision_score(y_val, y_pred_proba),
                'Best F1-Score': f1_scores[best_f1_idx],
                # The PR curve has one more point than thresholds; fall back to 1.0 for that last point.
                'Best Threshold': thresholds[best_f1_idx] if best_f1_idx < len(thresholds) else 1.0,
                'Precision at Best F1': precisions[best_f1_idx],
                'Recall at Best F1': recalls[best_f1_idx]
            })

        duration = time.time() - start_time
        avg_scores = pd.DataFrame(fold_scores).mean().to_dict()

        results.append({
            "Encoder": encoder_name, "Model": model_name,
            "PR AUC": avg_scores['PR AUC'],
            "F1-Score": avg_scores['Best F1-Score'],
            "Precision": avg_scores['Precision at Best F1'],
            "Recall": avg_scores['Recall at Best F1'],
            "Duration (s)": duration
        })
        print(f"time: {duration:.2f} s, F1-Score: {avg_scores['Best F1-Score']:.4f}")


# Summary table, best mean F1 first.
results_df = pd.DataFrame(results)
results_df.rename(columns={
    "F1-Score": "Best F1-Score",
    "Precision": "Precision at Best F1",
    "Recall": "Recall at Best F1"
}, inplace=True)
column_order = ['Encoder', 'Model', 'PR AUC', 'Best F1-Score', 'Precision at Best F1', 'Recall at Best F1', 'Duration (s)']
results_df_sorted = results_df[column_order].sort_values(by="Best F1-Score", ascending=False).reset_index(drop=True)

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(results_df_sorted)


# Strategy 2

In [None]:
# Strategy 2: each high-cardinality column is encoded by two encoders at once
# (a FeatureUnion concatenates both encodings). Every (encoder pair, model)
# combination is scored with 5-fold stratified cross-validation.
def _encoder_step(encoder):
    """Wrap a single encoder in a one-step pipeline named 'encoder'."""
    return Pipeline([('encoder', encoder)])


combined_encoders_to_test = {
    "WOE_plus_Frequency": FeatureUnion([
        ('woe_pipeline', _encoder_step(ce.WOEEncoder(regularization=40))),
        ('freq_pipeline', _encoder_step(ce.CountEncoder(normalize=True))),
    ]),
    "JS_plus_Frequency": FeatureUnion([
        ('js_pipeline', _encoder_step(ce.JamesSteinEncoder(model='binary'))),
        ('freq_pipeline', _encoder_step(ce.CountEncoder(normalize=True))),
    ]),
    "WOE_plus_Ordinal": FeatureUnion([
        ('woE_pipeline', _encoder_step(ce.WOEEncoder(regularization=40))),
        ('ord_pipeline', _encoder_step(ce.OrdinalEncoder())),
    ]),
    "JS_plus_Ordinal": FeatureUnion([
        ('js_pipeline', _encoder_step(ce.JamesSteinEncoder(model='binary'))),
        ('ord_pipeline', _encoder_step(ce.OrdinalEncoder())),
    ]),
}

# Each entry is (estimator class, constructor kwargs); a fresh estimator is built per fold.
models_to_test = {
    "XGBoost": (xgb.XGBClassifier, {'random_state': 335, 'eval_metric': 'aucpr', 'tree_method': 'hist', 'device': 'cuda'}),
    "CatBoost": (cb.CatBoostClassifier, {'random_state': 335, 'task_type': 'GPU', 'verbose': 0}),
}

all_results = []
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=335)
one_hot_encoder = ce.OneHotEncoder(handle_unknown='value', handle_missing='value', use_cat_names=True)

for encoder_name, combined_encoder in combined_encoders_to_test.items():
    for model_name, (model_class, model_params) in models_to_test.items():
        print(f"\n{encoder_name} + {model_name} {'='*25}")

        fold_results = []
        start_time = time.time()

        for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y)):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            # Negative/positive ratio of this fold's training labels; only XGBoost receives it.
            n_negative = (y_train == 0).sum()
            n_positive = (y_train == 1).sum()
            dynamic_scale_pos_weight = n_negative / n_positive
            current_model_params = dict(model_params)
            if model_name == "XGBoost":
                current_model_params['scale_pos_weight'] = dynamic_scale_pos_weight

            model = model_class(**current_model_params)

            numeric_branch = Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ])
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_branch, numerical_features),
                    ('cat_low', one_hot_encoder, low_cardinality_features),
                    ('cat_high_combined', combined_encoder, high_cardinality_features),
                ],
                remainder='passthrough',
            )

            # The whole preprocessed matrix is imputed and standardized once more before the classifier.
            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier', model),
            ])

            pipeline.fit(X_train, y_train)
            y_pred_proba = pipeline.predict_proba(X_val)[:, 1]

            precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
            # F1 at every point of the PR curve; points where precision + recall == 0 get F1 = 0.
            pr_sum = precisions + recalls
            f1_scores = np.divide(2 * precisions * recalls, pr_sum, out=np.zeros_like(precisions), where=pr_sum != 0)
            best_f1_idx = np.argmax(f1_scores)

            fold_results.append({
                "PR AUC": average_precision_score(y_val, y_pred_proba),
                "Best F1-Score": f1_scores[best_f1_idx],
                "Precision at Best F1": precisions[best_f1_idx],
                "Recall at Best F1": recalls[best_f1_idx],
            })

        duration = time.time() - start_time
        avg_results = pd.DataFrame(fold_results).mean().to_dict()
        avg_results.update({
            'Encoder_Combination': encoder_name,
            'Model': model_name,
            'Duration_s': duration,
        })
        all_results.append(avg_results)
        print(f"time: {duration:.2f} s, PR AUC: {avg_results['PR AUC']:.4f}, F1-Score: {avg_results['Best F1-Score']:.4f}")


# Summary table, best mean PR AUC first.
print("\nResult: ")
results_df = pd.DataFrame(all_results)
column_order = ['Encoder_Combination', 'Model', 'PR AUC', 'Best F1-Score', 'Precision at Best F1', 'Recall at Best F1', 'Duration_s']
results_df_sorted = results_df[column_order].sort_values(by="PR AUC", ascending=False)
print(results_df_sorted.to_string(index=False))

In [None]:
def split_composite_errors(error_string: str) -> pd.Series:
    """Return ``[first_error, remaining_errors]`` for an error description.

    The input is converted to ``str`` and stripped, then cut at its first
    comma; both pieces are stripped. Without a comma the second element is
    the string 'None'.
    """
    cleaned = str(error_string).strip()
    head, separator, tail = cleaned.partition(',')
    second = tail.strip() if separator else 'None'
    return pd.Series([head.strip(), second])

def create_final_feature_set_gnn(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the transaction table and derive the features used by the GNN stage.

    Works on a copy of ``df``. Each step is skipped when the columns it needs
    are absent, so partial inputs are accepted.

    Steps:
      * strip ``$`` / ``,`` from the monetary columns and coerce them to numbers;
      * take the absolute value of ``Amount``;
      * cast identifier / categorical columns to ``str``;
      * derive cyclical day-of-week and hour encodings from Year/Month/Day/Time;
      * derive account age, home-state / home-city flags and four ratio features;
      * split ``Errors?`` into ``Error1_cat`` / ``Error2_cat`` at the first comma;
      * keep only the whitelisted columns and fill remaining gaps
        (``'Unknown'`` for non-numeric columns, the column median otherwise).

    Args:
        df: Transaction-level frame. ``Amount`` and ``Year`` are used
            arithmetically as-is (``.abs()``, subtraction) and ``Time`` is
            concatenated to a string, so they must already have suitable types.

    Returns:
        A new DataFrame holding those whitelisted feature columns that exist
        after processing. Its shape is also printed.
    """
    df_processed = df.copy()

    # Remove '$' and ',' from the monetary columns and coerce to numbers;
    # values that still cannot be parsed become NaN and are filled at the end.
    cols_to_clean_numeric = ['Credit Limit', 'Yearly Income - Person', 'Total Debt', 'Per Capita Income - Zipcode']
    for col in cols_to_clean_numeric:
        if col in df_processed.columns:
            df_processed[col] = pd.to_numeric(
                df_processed[col].astype(str).str.replace(r'[$,]', '', regex=True),
                errors='coerce',
            )
    if 'Amount' in df_processed.columns:
        df_processed['Amount'] = df_processed['Amount'].abs()

    # NOTE(review): astype(str) turns missing values into the literal string
    # 'nan', so the 'Unknown' fill at the end never applies to these columns.
    id_cols_to_str = ['User', 'Merchant Name', 'Zip', 'MCC', 'Card Brand', 'Card Type', 'Use Chip', 'Has Chip', 'State', 'Merchant State', 'Merchant City', 'City', 'Gender', 'Card on Dark Web']
    for col in id_cols_to_str:
        if col in df_processed.columns:
            df_processed[col] = df_processed[col].astype(str)
    if 'Errors?' in df_processed.columns:
        df_processed['Errors?'] = df_processed['Errors?'].fillna('No Error').astype(str)

    if all(c in df_processed.columns for c in ['Year', 'Month', 'Day', 'Time']):
        # Build "Y-M-D" with vectorised string concatenation; this yields the
        # same text as a row-wise '-'.join without a Python call per row.
        date_text = (
            df_processed['Year'].astype(str) + '-'
            + df_processed['Month'].astype(str) + '-'
            + df_processed['Day'].astype(str)
        )
        df_processed['trans_datetime'] = pd.to_datetime(date_text + ' ' + df_processed['Time'], errors='coerce')

        # Cyclical encodings: sine/cosine of the position within the 7-day
        # week and the 24-hour day.
        day_of_week = df_processed['trans_datetime'].dt.dayofweek
        hour_of_day = df_processed['trans_datetime'].dt.hour
        df_processed['day_of_week_sin'] = np.sin(2 * np.pi * day_of_week / 7.0)
        df_processed['day_of_week_cos'] = np.cos(2 * np.pi * day_of_week / 7.0)
        df_processed['hour_sin'] = np.sin(2 * np.pi * hour_of_day / 24.0)
        df_processed['hour_cos'] = np.cos(2 * np.pi * hour_of_day / 24.0)
        if 'Acct Open Date' in df_processed.columns:
            acct_open_year = pd.to_datetime(df_processed['Acct Open Date'], format='%m/%Y', errors='coerce').dt.year
            df_processed['account_age_at_transaction'] = df_processed['Year'] - acct_open_year

    # 1 when the transaction happened in the cardholder's own state / city.
    if 'State' in df_processed.columns and 'Merchant State' in df_processed.columns:
        df_processed['is_in_home_state'] = (df_processed['State'] == df_processed['Merchant State']).astype(int)
    if 'City' in df_processed.columns and 'Merchant City' in df_processed.columns:
        df_processed['is_in_home_city'] = (df_processed['City'] == df_processed['Merchant City']).astype(int)

    # Ratio features: (new column, numerator, denominator). A zero denominator
    # is replaced by 1e-6 so the division never produces inf.
    ratio_specs = [
        ('amount_to_limit_ratio', 'Amount', 'Credit Limit'),
        ('amount_to_personal_income_ratio', 'Amount', 'Yearly Income - Person'),
        ('debt_to_income_ratio', 'Total Debt', 'Yearly Income - Person'),
        ('amount_to_zip_income_ratio', 'Amount', 'Per Capita Income - Zipcode'),
    ]
    for ratio_name, numerator, denominator in ratio_specs:
        if numerator in df_processed.columns and denominator in df_processed.columns:
            df_processed[ratio_name] = df_processed[numerator] / df_processed[denominator].replace(0, 1e-6)

    if 'Errors?' in df_processed.columns:
        # Same rule as split_composite_errors (split once at the first comma,
        # 'None' when there is no second part), applied in a single pass
        # instead of constructing one pd.Series per row via .apply.
        error_parts = [str(value).strip().partition(',') for value in df_processed['Errors?']]
        df_processed['Error1_cat'] = [head.strip() for head, _, _ in error_parts]
        df_processed['Error2_cat'] = [tail.strip() if sep else 'None' for _, sep, tail in error_parts]

    final_features_to_keep = [
        'Is Fraud?', 'User', 'Card', 'Amount', 'MCC', 'Card Brand', 'Card Type', 'Has Chip', 'Use Chip',
        'Cards Issued', 'Credit Limit', 'Current Age', 'Gender', 'FICO Score',
        'Num Credit Cards', 'Yearly Income - Person',
        'Merchant Name', 'Merchant State', 'Merchant City', 'State', 'City',
        'Total Debt',
        'Per Capita Income - Zipcode',
        'day_of_week_sin', 'day_of_week_cos', 'hour_sin', 'hour_cos',
        'account_age_at_transaction',
        'is_in_home_state', 'is_in_home_city',
        'amount_to_limit_ratio', 'amount_to_personal_income_ratio',
        'debt_to_income_ratio', 'amount_to_zip_income_ratio',
        'Error1_cat', 'Error2_cat'
    ]

    final_cols_exist = [col for col in final_features_to_keep if col in df_processed.columns]
    df_final = df_processed[final_cols_exist].copy()

    # Decide the fill strategy from "is this column numeric?" rather than from
    # the dtype's name, so dedicated string dtypes are never sent to .median().
    for col in df_final.columns:
        if pd.api.types.is_numeric_dtype(df_final[col]):
            df_final[col] = df_final[col].fillna(df_final[col].median())
        else:
            df_final[col] = df_final[col].fillna('Unknown')

    print(f"dim: {df_final.shape}")
    return df_final

In [None]:
# Build the GNN feature table from the full-context frame, then report its
# shape and per-column dtypes / non-null counts.
df_final = create_final_feature_set_gnn(df_full_context)

print(f"dim: {df_final.shape}")
df_final.info()

In [None]:
# ID
# pd.factorize numbers values in order of first appearance and also returns the
# unique labels, so position i of each *_uniques array is the label of id i.
df_final['user_id'], user_uniques = pd.factorize(df_final['User'])
# Cards are keyed by "<User>_<Card>" rather than by 'Card' alone.
df_final['card_id'], card_uniques = pd.factorize(
    df_final['User'].astype(str) + '_' + df_final['Card'].astype(str)
)
df_final['merchant_id'], merchant_uniques = pd.factorize(df_final['Merchant Name'])

# City, State, MCC
# The user-side and merchant-side columns share one vocabulary, so the same
# city / state label maps to the same node id on both sides.
# NOTE(review): cities are keyed by name only, so identical city names in
# different states collapse into one city node -- confirm this is intended.
all_cities = pd.concat([df_final['City'], df_final['Merchant City']]).unique()
all_states = pd.concat([df_final['State'], df_final['Merchant State']]).unique()
all_mccs = df_final['MCC'].unique()

# label -> consecutive integer id, in order of first appearance
city_map = dict(zip(all_cities, range(len(all_cities))))
state_map = dict(zip(all_states, range(len(all_states))))
mcc_map = dict(zip(all_mccs, range(len(all_mccs))))

df_final['user_city_id'] = df_final['City'].map(city_map)
df_final['merchant_city_id'] = df_final['Merchant City'].map(city_map)
df_final['user_state_id'] = df_final['State'].map(state_map)
df_final['merchant_state_id'] = df_final['Merchant State'].map(state_map)
df_final['mcc_id'] = df_final['MCC'].map(mcc_map)


print("\nHeteroData")
graph_data = HeteroData()

# Declare the size of every node type up front; only 'user' and 'card' receive
# a feature matrix below, the other types carry just a node count.
graph_data['user'].num_nodes = len(user_uniques)
graph_data['card'].num_nodes = len(card_uniques)
graph_data['merchant'].num_nodes = len(merchant_uniques)
graph_data['city'].num_nodes = len(all_cities)
graph_data['state'].num_nodes = len(all_states)
graph_data['mcc'].num_nodes = len(all_mccs)

# user
# One row per user_id (first value seen per column), standardised numerics
# plus one-hot categoricals. groupby sorts by id, so row i describes node i.
user_num_features = ['Current Age', 'FICO Score', 'Num Credit Cards', 'Yearly Income - Person', 'Total Debt']
user_cat_features = ['Gender']
user_features_df = df_final.groupby('user_id')[[*user_num_features, *user_cat_features]].first()
user_preprocessor = ColumnTransformer([
    ('num', StandardScaler(), user_num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), user_cat_features),
])
processed_user_features = user_preprocessor.fit_transform(user_features_df)
graph_data['user'].x = torch.tensor(processed_user_features, dtype=torch.float)

# card
# Same recipe as for users, keyed by card_id.
card_num_features = ['Credit Limit', 'Cards Issued']
card_cat_features = ['Card Brand', 'Card Type', 'Has Chip']
card_features_df = df_final.groupby('card_id')[[*card_num_features, *card_cat_features]].first()
card_preprocessor = ColumnTransformer([
    ('num', StandardScaler(), card_num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), card_cat_features),
])
processed_card_features = card_preprocessor.fit_transform(card_features_df)
graph_data['card'].x = torch.tensor(processed_card_features, dtype=torch.float)


# Build the edge_index tensors of the heterogeneous graph. Each relation is
# stored as a 2 x num_edges long tensor (row 0 = source ids, row 1 = target
# ids) and is added a second time with the rows swapped under a reverse
# relation name, so messages can flow in both directions.
print("\nEdge Index")

# user-card
# One edge per distinct (user, card) pair.
user_card_edges = df_final[['user_id', 'card_id']].drop_duplicates()
edge_index_user_owns_card = torch.tensor(user_card_edges.values.T, dtype=torch.long)
graph_data['user', 'owns', 'card'].edge_index = edge_index_user_owns_card
graph_data['card', 'owned_by', 'user'].edge_index = edge_index_user_owns_card[[1, 0]]

# card-merchant
# Not de-duplicated: every row of df_final contributes one edge, in row order.
card_merchant_edges = df_final[['card_id', 'merchant_id']]
edge_index_card_interacts_merchant = torch.tensor(card_merchant_edges.values.T, dtype=torch.long)
graph_data['card', 'interacts_with', 'merchant'].edge_index = edge_index_card_interacts_merchant
graph_data['merchant', 'rev_interacts_with', 'card'].edge_index = edge_index_card_interacts_merchant[[1, 0]]

# user-city
# One edge per distinct (user, home city) pair.
user_city_edges = df_final[['user_id', 'user_city_id']].drop_duplicates()
edge_index_user_lives_in_city = torch.tensor(user_city_edges.values.T, dtype=torch.long)
graph_data['user', 'lives_in', 'city'].edge_index = edge_index_user_lives_in_city
graph_data['city', 'hosts_user', 'user'].edge_index = edge_index_user_lives_in_city[[1, 0]]

# merchant-city
# One edge per distinct (merchant, merchant city) pair.
merchant_city_edges = df_final[['merchant_id', 'merchant_city_id']].drop_duplicates()
edge_index_merchant_in_city = torch.tensor(merchant_city_edges.values.T, dtype=torch.long)
graph_data['merchant', 'located_in', 'city'].edge_index = edge_index_merchant_in_city
graph_data['city', 'hosts_merchant', 'merchant'].edge_index = edge_index_merchant_in_city[[1, 0]]

# city-state
# Union of the (city, state) pairs seen on the user side and on the merchant
# side, renamed to common column names before de-duplication.
city_state_map_df = pd.concat([
    df_final[['user_city_id', 'user_state_id']].rename(columns={'user_city_id': 'city_id', 'user_state_id': 'state_id'}),
    df_final[['merchant_city_id', 'merchant_state_id']].rename(columns={'merchant_city_id': 'city_id', 'merchant_state_id': 'state_id'})
]).drop_duplicates().dropna()
edge_index_city_in_state = torch.tensor(city_state_map_df.values.T, dtype=torch.long)
graph_data['city', 'part_of', 'state'].edge_index = edge_index_city_in_state
graph_data['state', 'contains', 'city'].edge_index = edge_index_city_in_state[[1, 0]]

# merchant-MCC
# One edge per distinct (merchant, MCC) pair.
merchant_mcc_edges = df_final[['merchant_id', 'mcc_id']].drop_duplicates()
edge_index_merchant_has_mcc = torch.tensor(merchant_mcc_edges.values.T, dtype=torch.long)
graph_data['merchant', 'has_category', 'mcc'].edge_index = edge_index_merchant_has_mcc
graph_data['mcc', 'is_category_of', 'merchant'].edge_index = edge_index_merchant_has_mcc[[1, 0]]

# Transaction edge
# Per-transaction attributes: standardised numeric columns followed by one-hot
# encoded categorical columns. Any other column handed to the transformer is
# dropped.
edge_numeric_cols = ['Amount', 'day_of_week_sin', 'day_of_week_cos', 'hour_sin', 'hour_cos']
edge_categorical_cols = ['Use Chip', 'Error1_cat', 'Error2_cat']
edge_feature_source_cols = [*edge_numeric_cols, *edge_categorical_cols]

edge_preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), edge_numeric_cols),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), edge_categorical_cols),
    ],
    remainder='drop',
)

processed_edge_features = edge_preprocessor.fit_transform(df_final.loc[:, edge_feature_source_cols])

# Transaction edge card-merchant
# The feature rows follow df_final's row order, the same order in which the
# card -> merchant edge_index was built, so row i describes edge i.
graph_data['card', 'interacts_with', 'merchant'].edge_attr = torch.tensor(
    processed_edge_features, dtype=torch.float
)

print(f"dim: {processed_edge_features.shape[1]}")

print("Structure")
print(graph_data)

In [None]:
class HeteroGNN_Refactored(torch.nn.Module):
    """Two-layer heterogeneous GraphSAGE encoder.

    Node types listed in ``node_feature_dims`` are projected from their feature
    vectors with a ``Linear`` layer; every other node type gets a learnable
    ``Embedding`` table indexed by node id. Each projected input is batch
    normalised and passed through ReLU, then fed through two ``HeteroConv``
    layers (one ``SAGEConv`` per edge type, summed per destination node type)
    with ReLU + dropout in between.

    Args:
        hidden_channels: Width of the input projections and of the first conv layer.
        out_channels: Width of the embeddings returned by ``forward``.
        node_types: Iterable of node type names.
        edge_types: Iterable of edge types, one ``SAGEConv`` is created for each.
        node_feature_dims: Mapping ``node type -> input feature dimension`` for
            the node types that carry a feature matrix.
        dropout_rate: Dropout probability applied after the first conv layer.
        num_nodes_dict: Optional mapping ``node type -> number of nodes`` used
            to size the embedding tables of feature-less node types. When
            omitted, the sizes are read from the notebook-level ``graph_data``
            object (the behaviour before this parameter existed).
    """

    def __init__(self, hidden_channels, out_channels, node_types, edge_types, node_feature_dims,
                 dropout_rate=0.5, num_nodes_dict=None):
        super().__init__()

        self.embeddings = torch.nn.ModuleDict()
        self.linears = torch.nn.ModuleDict()
        self.batch_norms = torch.nn.ModuleDict()
        for node_type in node_types:
            if node_type not in node_feature_dims:
                if num_nodes_dict is not None:
                    num_nodes = num_nodes_dict[node_type]
                else:
                    # Legacy fallback: depends on the global `graph_data`.
                    num_nodes = graph_data[node_type].num_nodes
                self.embeddings[node_type] = torch.nn.Embedding(num_nodes, hidden_channels)
            self.batch_norms[node_type] = BatchNorm(hidden_channels)

        for node_type, dim in node_feature_dims.items():
            self.linears[node_type] = Linear(dim, hidden_channels)

        # (-1, -1) leaves the source/target input sizes to be inferred lazily.
        self.conv1 = HeteroConv({
            edge_type: SAGEConv((-1, -1), hidden_channels) for edge_type in edge_types
        }, aggr='sum')

        self.conv2 = HeteroConv({
            edge_type: SAGEConv((-1, -1), out_channels) for edge_type in edge_types
        }, aggr='sum')

        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, x_dict, edge_index_dict):
        """Compute node embeddings for every node type.

        Args:
            x_dict: Mapping ``node type -> tensor``. For node types with a
                ``Linear`` projection this is the feature matrix; for all
                others it is the tensor of node ids used to index the
                embedding table.
            edge_index_dict: Mapping ``edge type -> edge_index`` passed to both
                conv layers.

        Returns:
            The dictionary produced by the second ``HeteroConv`` layer.
        """
        x_dict_processed = {}
        for node_type, x in x_dict.items():
            if node_type in self.linears:
                x = self.linears[node_type](x)
            else:
                x = self.embeddings[node_type](x)
            x = self.batch_norms[node_type](x).relu()
            x_dict_processed[node_type] = x

        x_dict = self.conv1(x_dict_processed, edge_index_dict)
        x_dict = {key: self.dropout(x.relu()) for key, x in x_dict.items()}

        x_dict = self.conv2(x_dict, edge_index_dict)

        return x_dict

# GNN
# The whole graph is used for training; `train_graph` is an alias, not a copy.
train_graph = graph_data

# Input feature width of every node type that actually carries a feature
# matrix; node types missing from this dict get an embedding table instead.
node_feature_dims = {
    node_type: graph_data[node_type].x.shape[1]
    for node_type in graph_data.node_types
    if hasattr(graph_data[node_type], 'x') and graph_data[node_type].x is not None
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

gnn_model = HeteroGNN_Refactored(
    hidden_channels=256,
    out_channels=128,
    node_types=graph_data.node_types,
    edge_types=train_graph.edge_types,
    node_feature_dims=node_feature_dims,
    dropout_rate=0.5,
)
gnn_model = gnn_model.to(device)

print(gnn_model)

# train
# Link prediction on the card -> merchant relation: every transaction edge is a
# positive example and the loader adds sampled negatives per batch.
edge_label_index = train_graph['card', 'interacts_with', 'merchant'].edge_index
train_loader = LinkNeighborLoader(
    data=train_graph,
    num_neighbors=[20, 10],
    edge_label_index=(('card', 'interacts_with', 'merchant'), edge_label_index),
    neg_sampling_ratio=3.0,
    batch_size=2048,
    shuffle=True,
    num_workers=0,
    # Pinned host memory only benefits host-to-GPU copies; skip it on CPU-only runtimes.
    pin_memory=torch.cuda.is_available()
)

# optimizer
optimizer = torch.optim.Adam(gnn_model.parameters(), lr=0.001)

# Halve the learning rate after 2 epochs without improvement of the epoch loss.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=2, min_lr=0.00001)

# training
start_time = time.time()

for epoch in range(1, 11):
    total_loss = 0
    total_examples = 0
    gnn_model.train()

    for batch in tqdm(train_loader, desc=f'Epoch {epoch:02d}'):
        batch = batch.to(device)
        optimizer.zero_grad()

        # Node types with features feed their feature matrix; the others feed
        # their global node ids, which index the model's embedding tables.
        x_dict_input = {
            node_type: batch[node_type].x if hasattr(batch[node_type], 'x') else batch[node_type].n_id
            for node_type in batch.node_types
        }
        z_dict = gnn_model(x_dict_input, batch.edge_index_dict)

        # Batch-local names so the global `edge_label_index` handed to the
        # loader above is not overwritten by per-batch values.
        batch_edge_label_index = batch['card', 'interacts_with', 'merchant'].edge_label_index
        batch_edge_label = batch['card', 'interacts_with', 'merchant'].edge_label

        src_emb = z_dict['card'][batch_edge_label_index[0]]
        dst_emb = z_dict['merchant'][batch_edge_label_index[1]]

        # Dot-product score per candidate edge, trained as a binary logit.
        pred = (src_emb * dst_emb).sum(dim=-1)
        loss = F.binary_cross_entropy_with_logits(pred, batch_edge_label)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by its number of examples for an exact epoch mean.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()

    avg_loss = total_loss / total_examples
    scheduler.step(avg_loss)
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch}, Avg Loss: {avg_loss:.4f}, LR: {current_lr:.6f}")

total_duration = time.time() - start_time
print(f"\nTime: {total_duration:.2f} s")

In [None]:
# Full-graph inference on CPU: move model and graph over, switch the model to
# eval mode, and run one forward pass over the entire graph without gradients.
gnn_model = gnn_model.to('cpu')
graph_data = graph_data.to('cpu')
gnn_model.eval()

with torch.no_grad():
    # Feature matrix where a node type has one, otherwise the full id range
    # 0..num_nodes-1 for the embedding lookup.
    full_x_dict_input = {}
    for node_type in graph_data.node_types:
        if hasattr(graph_data[node_type], 'x'):
            full_x_dict_input[node_type] = graph_data[node_type].x
        else:
            full_x_dict_input[node_type] = torch.arange(graph_data[node_type].num_nodes)
    final_embeddings = gnn_model(full_x_dict_input, graph_data.edge_index_dict)

user_embeddings = final_embeddings['user'].numpy()
merchant_embeddings = final_embeddings['merchant'].numpy()

# Row i of each embedding matrix belongs to node id i, whose original label is
# position i of the matching *_uniques array.
user_emb_df = pd.DataFrame(
    user_embeddings,
    columns=[f'user_emb_{i}' for i in range(user_embeddings.shape[1])],
)
user_emb_df['User'] = user_uniques
merchant_emb_df = pd.DataFrame(
    merchant_embeddings,
    columns=[f'merchant_emb_{i}' for i in range(merchant_embeddings.shape[1])],
)
merchant_emb_df['Merchant Name'] = merchant_uniques

# Attach the user and merchant embeddings to every transaction row.
df_augmented = pd.merge(
    pd.merge(df_final, user_emb_df, on='User', how='left'),
    merchant_emb_df,
    on='Merchant Name',
    how='left',
)
print(f"Dim: {df_augmented.shape}")

# Cross-validated evaluation of tabular classifiers on the embedding-augmented
# transaction table.
TARGET = 'Is Fraud?'
# NOTE(review): the integer id columns added to df_final in the graph
# construction cell (user_id, card_id, merchant_id, *_city_id, *_state_id,
# mcc_id) are not listed here and therefore stay in X_aug as numeric
# features -- confirm this is intended.
cols_to_drop = [TARGET, 'User', 'Card', 'Merchant Name', 'MCC', 'Errors?', 'Card Brand', 'Card Type', 'Has Chip', 'Use Chip', 'Gender', 'Merchant State', 'Merchant City', 'State', 'City', 'Error1_cat', 'Error2_cat']
cols_to_drop_exist = [col for col in cols_to_drop if col in df_augmented.columns]
X_aug = df_augmented.drop(columns=cols_to_drop_exist)
y_aug = df_augmented[TARGET]

# Use the GPU back-ends only when CUDA is actually available, mirroring the
# CPU fallback used for the GNN itself; a CPU-only runtime would otherwise fail.
use_gpu = torch.cuda.is_available()
models_to_test = {
    "DecisionTree_GNN": (DecisionTreeClassifier, {'random_state': 335, 'class_weight': 'balanced'}),
    "RandomForest_GNN": (RandomForestClassifier, {'random_state': 335, 'n_jobs': -1, 'class_weight': 'balanced'}),
    "XGBoost_GNN": (xgb.XGBClassifier, {'random_state': 335, 'eval_metric': 'aucpr', 'tree_method': 'hist', 'device': 'cuda' if use_gpu else 'cpu'}),
    "CatBoost_GNN": (cb.CatBoostClassifier, {'random_state': 335, 'task_type': 'GPU' if use_gpu else 'CPU', 'verbose': 0}),
}
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_model_results = {}

for model_name, (model_class, model_params) in models_to_test.items():
    print(f"\n{model_name}")
    all_fold_results = []
    fold_iter = tqdm(cv_strategy.split(X_aug, y_aug), total=cv_strategy.get_n_splits(), desc=f"({model_name})")
    for fold, (train_idx, val_idx) in enumerate(fold_iter):
        X_train, X_val = X_aug.iloc[train_idx], X_aug.iloc[val_idx]
        y_train, y_val = y_aug.iloc[train_idx], y_aug.iloc[val_idx]

        current_model_params = model_params.copy()
        if model_name in ["XGBoost_GNN"]:
            # Negative-to-positive ratio of this fold's training labels.
            dynamic_scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
            current_model_params['scale_pos_weight'] = dynamic_scale_pos_weight

        current_model = model_class(**current_model_params)

        pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('classifier', current_model)])
        pipeline.fit(X_train, y_train)

        y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
        precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
        # F1 at every point of the PR curve; 0 where precision + recall == 0.
        fscores = np.divide(2 * precisions * recalls, precisions + recalls, out=np.zeros_like(precisions), where=(precisions + recalls) != 0)
        ix = np.argmax(fscores)
        # The PR curve has one more point than `thresholds`; the extra final
        # point has no threshold of its own, so fall back to 1.0 there.
        best_threshold = thresholds[ix] if ix < len(thresholds) else 1.0
        y_pred_class_optimal = (y_pred_proba >= best_threshold).astype(int)

        fold_scores = {
            'PR AUC': average_precision_score(y_val, y_pred_proba),
            'F1-Score': f1_score(y_val, y_pred_class_optimal, zero_division=0),
            'Precision': precision_score(y_val, y_pred_class_optimal, zero_division=0),
            'Recall': recall_score(y_val, y_pred_class_optimal, zero_division=0),
            'Optimal Threshold': best_threshold
        }
        all_fold_results.append(fold_scores)

    results_df = pd.DataFrame(all_fold_results)
    all_model_results[model_name] = results_df.agg(['mean', 'std'])


for model_name, summary_df in all_model_results.items():
    print(f"\n{model_name}")
    print(summary_df)