<a href="https://colab.research.google.com/github/Chun1225/Imperial-Research-Project/blob/main/RP2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pyg-lib using the recommended method from the error message
# This ensures compatibility with your installed PyTorch and CUDA versions
import torch

# Detect PyTorch and CUDA version
TORCH_VERSION = torch.__version__.split('+')[0]
CUDA_VERSION = torch.version.cuda

if CUDA_VERSION:
    # Format as e.g. 'cu118' or 'cu121'
    CUDA_str = f"cu{CUDA_VERSION.replace('.', '')}"
else:
    # If no CUDA, use cpu version
    CUDA_str = 'cpu'


# Construct and execute the correct installation command
# !pip is the syntax to run shell commands in Jupyter/Colab environments
!pip install pyg-lib -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+{CUDA_str}.html

In [None]:
pip install category_encoders catboost dataframe_image Selenium torch_geometric

In [None]:
import pandas as pd
import numpy as np
import time
import datetime
import warnings

import matplotlib.pyplot as plt
import seaborn as sns
import dataframe_image as dfi
from IPython.display import display, Image

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, average_precision_score, precision_recall_curve

import category_encoders as ce

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import catboost as cb

import torch
from torch_geometric.data import HeteroData
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, HeteroConv, Linear, BatchNorm
from torch_geometric.loader import LinkNeighborLoader
from tqdm.auto import tqdm
from torch_geometric.nn import GATv2Conv


In [None]:
# Load the raw transaction table from the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/fraudTrain.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Stop the cell with an accurate message. The previous `exit()` did not
    # halt execution inside a notebook, so the lines below failed with a
    # NameError on `df`, and the message named a different file.
    raise FileNotFoundError(f"can't find '{DATA_PATH}'") from err

print("info")
print(f"dim: {df.shape}")
display(df.head())
df.info()

In [None]:
# Drop the CSV's leftover index column if it is present.
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# datetime (parsed once; the original cell repeated these two conversions)
df['trans_datetime'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob_datetime'] = pd.to_datetime(df['dob'])

# Age in whole years at transaction time (truncated, not rounded).
df['age'] = (df['trans_datetime'] - df['dob_datetime']).dt.days / 365.25
df['age'] = df['age'].astype(int)

# hour: cyclical sin/cos encoding plus the raw hour
hour_in_day = 24
trans_hour = df['trans_datetime'].dt.hour
df['hour_sin'] = np.sin(2 * np.pi * trans_hour / hour_in_day)
df['hour_cos'] = np.cos(2 * np.pi * trans_hour / hour_in_day)
df['hour'] = trans_hour

# day of week: cyclical sin/cos encoding plus the raw value as a string
day_in_week = 7
trans_dayofweek = df['trans_datetime'].dt.dayofweek
df['day_of_week_sin'] = np.sin(2 * np.pi * trans_dayofweek / day_in_week)
df['day_of_week_cos'] = np.cos(2 * np.pi * trans_dayofweek / day_in_week)
df['day_of_week'] = trans_dayofweek.astype(str)

# distance

def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points using the haversine formula.

    Args:
        lat1, lon1: Latitude / longitude of the first point in degrees
            (scalars, NumPy arrays or pandas Series; broadcast together).
        lat2, lon2: Latitude / longitude of the second point in degrees.
        radius_km: Sphere radius; the default 6371 yields kilometres on Earth.

    Returns:
        Distance(s) in the unit of ``radius_km``, with the broadcast shape of
        the inputs.
    """
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN; clamp it to the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    distance = radius_km * c
    return distance

# Distance between the (lat, long) and (merch_lat, merch_long) coordinate pairs.
df['distance_km'] = haversine_distance(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

# Raw date columns are replaced by the engineered features above.
cols_to_drop = ['trans_date_trans_time', 'dob', 'trans_datetime', 'dob_datetime']
# Identifier-like and raw columns excluded from the baseline feature set;
# missing names are ignored by the second drop.
cols_to_drop_for_baseline = ['first', 'last', 'street', 'trans_num', 'hour', 'day_of_week', 'lat', 'long', 'zip', 'merch_lat', 'merch_long']

df_processed = (
    df.drop(columns=cols_to_drop)
      # Cast the card number to str so it is picked up as an object column.
      .assign(cc_num=lambda frame: frame['cc_num'].astype(str))
      .drop(columns=cols_to_drop_for_baseline, errors='ignore')
)

print(f"dum: {df_processed.shape}")
display(df_processed.head())
df_processed.info()

In [None]:
# Class balance of the target, expressed in percent.
fraud_distribution = df_processed['is_fraud'].value_counts(normalize=True) * 100
print(f"0: {fraud_distribution.loc[0]:.4f}%")
print(f"1:   {fraud_distribution.loc[1]:.4f}%")


# Cardinality: number of distinct values of every object-typed column.

categorical_features = df_processed.select_dtypes(include=['object']).columns.tolist()

cardinality_data = [
    [feature, df_processed[feature].nunique()]
    for feature in categorical_features
]

cardinality_df = pd.DataFrame(
    cardinality_data, columns=['Feature', 'Cardinality']
).sort_values(by='Cardinality', ascending=False)

print(cardinality_df.to_string(index=False))


In [None]:
# Cross-validated comparison of categorical encoders x classifiers.
# Every (encoder, model) pair is fitted on 5 stratified folds; per fold the
# PR AUC and the best achievable F1 (over all thresholds of the validation
# precision-recall curve) are recorded.
TARGET = 'is_fraud'
X = df_processed.drop(columns=[TARGET])
y = df_processed[TARGET]

numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Categoricals with at most 2 distinct values (NaN counted) are one-hot
# encoded; all others go through the encoder under test.
low_cardinality_features = [c for c in categorical_features if X[c].nunique(dropna=False) <= 2]
high_cardinality_features = [c for c in categorical_features if X[c].nunique(dropna=False) > 2]

encoders_to_test = {
    "Ordinal": ce.OrdinalEncoder(),
    "Frequency": ce.CountEncoder(normalize=True),
    "WOE": ce.WOEEncoder(regularization=60),
    "JamesStein": ce.JamesSteinEncoder(model='binary'),
    "MEstimate": ce.MEstimateEncoder(m=25),
}

one_hot_low = ce.OneHotEncoder(handle_missing='value', handle_unknown='value', use_cat_names=True)

# Use the GPU when one is visible, otherwise fall back to CPU so the cell
# also runs on a CPU-only runtime instead of failing at fit time.
USE_GPU = torch.cuda.is_available()
XGB_DEVICE = 'cuda' if USE_GPU else 'cpu'
CATBOOST_TASK_TYPE = 'GPU' if USE_GPU else 'CPU'

models = {
    "DecisionTree": (DecisionTreeClassifier, {'random_state': 335, 'class_weight': 'balanced'}),
    "RandomForest": (RandomForestClassifier, {'random_state': 335, 'n_jobs': -1, 'min_samples_leaf': 3}),
    "XGBoost": (xgb.XGBClassifier, {'random_state': 335, 'eval_metric': 'aucpr', 'tree_method': 'hist', 'device': XGB_DEVICE}),
    "CatBoost": (cb.CatBoostClassifier, {'random_state': 335, 'task_type': CATBOOST_TASK_TYPE, 'verbose': 0})
}

all_folds_list = []
all_results = []
trained_pipelines = {}

N_SPLITS = 5
cv_strategy = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=335)

for encoder_name, encoder in encoders_to_test.items():
    for model_name, (model_class, model_params) in models.items():
        print(f"\n{encoder_name} + {model_name}")

        fold_results = []
        start_time = time.time()

        for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Fresh, unfitted estimator for every fold.
            model = model_class(**model_params)

            # Pipeline: preprocessing is fitted inside the fold, so the
            # target-based encoders only ever see the training part.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', Pipeline([
                        ('imputer', SimpleImputer(strategy='median')),
                        ('scaler', StandardScaler())
                    ]), numerical_features),
                    ('cat_low', one_hot_low, low_cardinality_features),
                    ('cat_high', Pipeline([
                        ('encoder', encoder),
                        ('scaler', StandardScaler())
                    ]), high_cardinality_features),
                ],
                remainder='passthrough'
            )

            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('classifier', model)])

            pipeline.fit(X_train, y_train)

            # Keep the pipeline of the last fold for later inspection
            # (feature importances).
            if fold == N_SPLITS - 1:
                trained_pipelines[f"{encoder_name} + {model_name}"] = pipeline

            y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
            precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
            # F1 at every point of the PR curve; 0 where precision + recall == 0.
            f1_scores = np.divide(2 * precisions * recalls, precisions + recalls, out=np.zeros_like(precisions), where=(precisions + recalls) != 0)
            best_f1_idx = np.argmax(f1_scores)
            best_f1_score = f1_scores[best_f1_idx]

            all_folds_list.append({
                "Model": model_name,
                "Encoder": encoder_name,
                "Fold": fold,
                "F1-Score": best_f1_score
            })

            fold_results.append({
                "PR AUC": average_precision_score(y_val, y_pred_proba),
                "Best F1-Score": best_f1_score,
                # precision/recall have one more entry than thresholds; the
                # final point has no threshold, so report 1.0 for it.
                "Best Threshold": thresholds[best_f1_idx] if best_f1_idx < len(thresholds) else 1.0,
                "Precision at Best F1": precisions[best_f1_idx],
                "Recall at Best F1": recalls[best_f1_idx]
            })
        duration = time.time() - start_time
        fold_results_df = pd.DataFrame(fold_results)
        avg_results = fold_results_df.mean().to_dict()
        avg_results['Encoder'] = encoder_name
        avg_results['Model'] = model_name
        avg_results['Duration (s)'] = duration
        all_results.append(avg_results)
        print(f"time: {duration:.2f} s, Avg F1-Score: {avg_results['Best F1-Score']:.4f}, Avg PR AUC: {avg_results['PR AUC']:.4f}")

results_df = pd.DataFrame(all_results)
column_order = ['Encoder', 'Model', 'PR AUC', 'Best F1-Score', 'Precision at Best F1', 'Recall at Best F1', 'Best Threshold', 'Duration (s)']
results_df_sorted = results_df[column_order].sort_values(by="Best F1-Score", ascending=False)

print("Result:")
print(results_df_sorted.to_string(index=False))


In [None]:
# Second encoder comparison, restricted to the two boosting models and using
# a cardinality threshold of 15 for one-hot encoding. Per-fold metrics are
# kept in `all_folds_list`; fold averages go to `all_results`.
TARGET = 'is_fraud'
X = df_processed.drop(columns=[TARGET])
y = df_processed[TARGET]

numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# <= 15 distinct values (NaN counted) -> one-hot; otherwise encoder under test
low_cardinality_features = [c for c in categorical_features if X[c].nunique(dropna=False) <= 15]
high_cardinality_features = [c for c in categorical_features if X[c].nunique(dropna=False) > 15]

encoders_to_test = {
    "Ordinal": ce.OrdinalEncoder(),
    "Frequency": ce.CountEncoder(normalize=True),
    "WOE": ce.WOEEncoder(regularization=60),
    "JamesStein": ce.JamesSteinEncoder(model='binary'),
    "MEstimate": ce.MEstimateEncoder(m=25),
}

one_hot_low = ce.OneHotEncoder(handle_missing='value', handle_unknown='value', use_cat_names=True)

# Use the GPU when one is visible, otherwise fall back to CPU so the cell
# also runs on a CPU-only runtime instead of failing at fit time.
USE_GPU = torch.cuda.is_available()
XGB_DEVICE = 'cuda' if USE_GPU else 'cpu'
CATBOOST_TASK_TYPE = 'GPU' if USE_GPU else 'CPU'

models = {
    "XGBoost": (xgb.XGBClassifier, {
        'random_state': 335,
        'eval_metric': 'aucpr',
        'tree_method': 'hist',
        'device': XGB_DEVICE
    }),
    "CatBoost": (cb.CatBoostClassifier, {'random_state': 335, 'task_type': CATBOOST_TASK_TYPE, 'verbose': 0})
}

all_folds_list = []
all_results = []
trained_pipelines = {}

N_SPLITS = 5
cv_strategy = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=335)

for encoder_name, encoder in encoders_to_test.items():
    for model_name, (model_class, model_params) in models.items():
        print(f"\n{encoder_name} + {model_name}")

        fold_results = []
        start_time = time.time()

        for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Fresh, unfitted estimator for every fold.
            model = model_class(**model_params)

            # Preprocessing is fitted inside the fold, so the target-based
            # encoders only ever see the training part.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', Pipeline([
                        ('imputer', SimpleImputer(strategy='median')),
                        ('scaler', StandardScaler())
                    ]), numerical_features),
                    ('cat_low', one_hot_low, low_cardinality_features),
                    ('cat_high', Pipeline([
                        ('encoder', encoder),
                        ('scaler', StandardScaler())
                    ]), high_cardinality_features),
                ],
                remainder='passthrough'
            )

            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('classifier', model)
            ])

            pipeline.fit(X_train, y_train)

            # Keep the pipeline of the last fold for later inspection
            # (feature importances).
            if fold == N_SPLITS - 1:
                trained_pipelines[f"{encoder_name} + {model_name}"] = pipeline

            y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
            precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
            # F1 at every point of the PR curve; 0 where precision + recall == 0.
            f1_scores = np.divide(2 * precisions * recalls, precisions + recalls,
                                  out=np.zeros_like(precisions), where=(precisions + recalls) != 0)
            best_f1_idx = np.argmax(f1_scores)
            best_f1_score = f1_scores[best_f1_idx]
            pr_auc = average_precision_score(y_val, y_pred_proba)

            # precision/recall have one more entry than thresholds; the final
            # point has no threshold, so report 1.0 for it.
            best_thr = thresholds[best_f1_idx] if best_f1_idx < len(thresholds) else 1.0
            prec_best = precisions[best_f1_idx]
            rec_best = recalls[best_f1_idx]

            all_folds_list.append({
                "Model": model_name,
                "Encoder": encoder_name,
                "Fold": fold,
                "PR AUC": pr_auc,
                "F1-Score": best_f1_score,
                "Precision": prec_best,
                "Recall": rec_best,
                "Best Threshold": best_thr
            })

            fold_results.append({
                "PR AUC": pr_auc,
                "Best F1-Score": best_f1_score,
                "Precision at Best F1": prec_best,
                "Recall at Best F1": rec_best,
                "Best Threshold": best_thr
            })

        duration = time.time() - start_time
        fold_results_df = pd.DataFrame(fold_results)
        avg_results = fold_results_df.mean().to_dict()
        avg_results['Encoder'] = encoder_name
        avg_results['Model'] = model_name
        avg_results['Duration (s)'] = duration
        all_results.append(avg_results)

        print(f"time: {duration:.2f} s,  Avg F1-Score: {avg_results['Best F1-Score']:.4f}, Avg PR AUC: {avg_results['PR AUC']:.4f}")

results_df = pd.DataFrame(all_results)
column_order = ['Encoder', 'Model', 'PR AUC', 'Best F1-Score',
                'Precision at Best F1', 'Recall at Best F1',
                'Best Threshold', 'Duration (s)']
results_df_sorted = results_df[column_order].sort_values(by="Best F1-Score", ascending=False)

print("Result:")
print(results_df_sorted.to_string(index=False))


In [None]:
# Cross-validated comparison of *pairs* of encoders applied side by side
# (FeatureUnion) to the high-cardinality categoricals.
# Relies on X, y, numerical_features, low_cardinality_features and
# high_cardinality_features defined by the previous cross-validation cell.
combined_encoders_to_test = {
    "WOE_plus_Frequency": FeatureUnion([
        ('woe_pipeline', Pipeline([('encoder', ce.WOEEncoder(regularization=60))])),
        ('freq_pipeline', Pipeline([('encoder', ce.CountEncoder(normalize=True))]))
    ]),
    "JS_plus_Frequency": FeatureUnion([
        ('js_pipeline', Pipeline([('encoder', ce.JamesSteinEncoder(model='binary'))])),
        ('freq_pipeline', Pipeline([('encoder', ce.CountEncoder(normalize=True))]))
    ]),
    "WOE_plus_JS": FeatureUnion([
        ('woe_pipeline', Pipeline([('encoder', ce.WOEEncoder(regularization=60))])),
        ('js_pipeline', Pipeline([('encoder', ce.JamesSteinEncoder(model='binary'))]))
    ]),
    "JS_plus_Ordinal": FeatureUnion([
        ('js_pipeline', Pipeline([('encoder', ce.JamesSteinEncoder(model='binary'))])),
        ('ord_pipeline', Pipeline([('encoder', ce.OrdinalEncoder())]))
    ])
}

# Use the GPU when one is visible, otherwise fall back to CPU so the cell
# also runs on a CPU-only runtime instead of failing at fit time.
USE_GPU = torch.cuda.is_available()
XGB_DEVICE = 'cuda' if USE_GPU else 'cpu'
CATBOOST_TASK_TYPE = 'GPU' if USE_GPU else 'CPU'

# (class, constructor kwargs) pairs, as in the other cross-validation cells,
# so that a fresh estimator is built for every fold instead of re-fitting one
# shared instance across all folds and encoder combinations.
models_to_test = {
    "XGBoost": (xgb.XGBClassifier, {'random_state': 335, 'eval_metric': 'aucpr', 'tree_method': 'hist', 'device': XGB_DEVICE}),
    "CatBoost": (cb.CatBoostClassifier, {'random_state': 335, 'verbose': 0, 'task_type': CATBOOST_TASK_TYPE})
}

all_results = []
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=335)

for encoder_name, combined_encoder in combined_encoders_to_test.items():
    for model_name, (model_class, model_params) in models_to_test.items():
        print(f"\n{encoder_name} + {model_name}")

        fold_results = []
        start_time = time.time()

        for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', Pipeline([
                        ('imputer', SimpleImputer(strategy='median')),
                        ('scaler', StandardScaler())
                    ]), numerical_features),

                    ('cat_low', ce.OneHotEncoder(handle_missing='value', handle_unknown='value', use_cat_names=True), low_cardinality_features),

                    ('cat_high_combined', combined_encoder, high_cardinality_features)
                ],
                remainder='passthrough'
            )

            # The encoder outputs are not scaled inside the ColumnTransformer
            # here, so imputation and scaling are applied to the whole
            # preprocessed matrix before the classifier.
            final_pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier', model_class(**model_params))
            ])

            final_pipeline.fit(X_train, y_train)
            y_pred_proba = final_pipeline.predict_proba(X_val)[:, 1]

            pr_auc = average_precision_score(y_val, y_pred_proba)

            precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
            # F1 at every point of the PR curve; 0 where precision + recall == 0.
            f1_scores = np.divide(2 * precisions * recalls, precisions + recalls, out=np.zeros_like(precisions), where=(precisions + recalls) != 0)
            best_f1_idx = np.argmax(f1_scores)

            fold_results.append({
                "PR AUC": pr_auc,
                "Best F1-Score": f1_scores[best_f1_idx],
                "Precision at Best F1": precisions[best_f1_idx],
                "Recall at Best F1": recalls[best_f1_idx]
            })


        duration = time.time() - start_time
        fold_results_df = pd.DataFrame(fold_results)
        avg_results = fold_results_df.mean().to_dict()
        avg_results['Encoder_Combination'] = encoder_name
        avg_results['Model'] = model_name
        avg_results['Duration_s'] = duration
        all_results.append(avg_results)
        print(f"time: {duration:.2f} s, Avg PR AUC: {avg_results['PR AUC']:.4f}, Avg F1-Score: {avg_results['Best F1-Score']:.4f}")

results_df = pd.DataFrame(all_results)

column_order = [
    'Encoder_Combination', 'Model', 'PR AUC', 'Best F1-Score',
    'Precision at Best F1', 'Recall at Best F1', 'Duration_s'
]
results_df_sorted = results_df[column_order].sort_values(by="PR AUC", ascending=False)

print("Result:")
print(results_df_sorted.to_string(index=False))

In [None]:
# Print the 20 largest feature importances of every pipeline stored in
# `trained_pipelines`. Classifiers without `feature_importances_` are skipped;
# any error for one pipeline is printed and the loop moves on to the next.
if 'trained_pipelines' in locals() and trained_pipelines:
    for combination_name, pipeline in trained_pipelines.items():
        print(f"\nFeature important: {combination_name}")

        try:
            steps = pipeline.named_steps
            trained_model = steps['classifier']
            fitted_preprocessor = steps['preprocessor']

            if hasattr(trained_model, 'feature_importances_'):
                # Output column names of the fitted preprocessor, paired
                # with the classifier's importances.
                importance_df = pd.DataFrame({
                    'Feature': fitted_preprocessor.get_feature_names_out(),
                    'Importance': trained_model.feature_importances_
                })
                top_features = importance_df.sort_values(by='Importance', ascending=False).head(20)

                print(top_features.to_string(index=False))

        except Exception as e:
            print(e)

else:
    print("can't find 'trained_pipelines'")



In [None]:
# Reload the raw transactions for the graph-based part of the notebook.
DATA_PATH = '/content/drive/MyDrive/fraudTrain.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Stop the cell with an informative error. The previous `exit()` did not
    # halt execution inside a notebook, so the lines below failed on `df`.
    raise FileNotFoundError(f"can't find '{DATA_PATH}'") from err

# Drop the CSV's leftover index column if it is present.
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])
print(f"dim: {df.shape}")

df['trans_datetime'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob_datetime'] = pd.to_datetime(df['dob'])
# Age in whole years at transaction time (truncated, not rounded).
df['age'] = ((df['trans_datetime'] - df['dob_datetime']).dt.days / 365.25).astype(int)

def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points using the haversine formula.

    Args:
        lat1, lon1: Latitude / longitude of the first point in degrees
            (scalars, NumPy arrays or pandas Series; broadcast together).
        lat2, lon2: Latitude / longitude of the second point in degrees.
        radius_km: Sphere radius; the default 6371 yields kilometres on Earth.

    Returns:
        Distance(s) in the unit of ``radius_km``, with the broadcast shape of
        the inputs.
    """
    lat1_rad, lon1_rad, lat2_rad, lon2_rad = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN; clamp it to the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius_km * c

# Build the heterogeneous graph (`graph_data`) from the transaction table:
# integer ids per entity, node features for customers and merchants,
# transaction and structural edges (each with a reverse edge), and scaled
# per-transaction edge attributes.
df['distance_km'] = haversine_distance(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

# ID: factorize every entity column into integer codes (stored as *_id
# columns) and keep a value -> code dictionary for each of them.
customer_codes, customer_uniques = pd.factorize(df['cc_num'])
df['customer_id'] = customer_codes
customer_mapping = {cc_num: i for i, cc_num in enumerate(customer_uniques)}

merchant_codes, merchant_uniques = pd.factorize(df['merchant'])
df['merchant_id'] = merchant_codes
merchant_mapping = {merchant: i for i, merchant in enumerate(merchant_uniques)}

job_codes, job_uniques = pd.factorize(df['job'])
df['job_id'] = job_codes
job_mapping = {job: i for i, job in enumerate(job_uniques)}

category_codes, category_uniques = pd.factorize(df['category'])
df['category_id'] = category_codes
category_mapping = {category: i for i, category in enumerate(category_uniques)}

city_codes, city_uniques = pd.factorize(df['city'])
df['city_id'] = city_codes
city_mapping = {city: i for i, city in enumerate(city_uniques)}

state_codes, state_uniques = pd.factorize(df['state'])
df['state_id'] = state_codes
state_mapping = {state: i for i, state in enumerate(state_uniques)}

# HeteroData: one node type per entity; node count = largest id + 1.
graph_data = HeteroData()

graph_data['customer'].num_nodes = df['customer_id'].max() + 1
graph_data['merchant'].num_nodes = df['merchant_id'].max() + 1
graph_data['job'].num_nodes = df['job_id'].max() + 1
graph_data['category'].num_nodes = df['category_id'].max() + 1
graph_data['city'].num_nodes = df['city_id'].max() + 1
graph_data['state'].num_nodes = df['state_id'].max() + 1

# Customer node features: one row per customer_id (the first transaction row
# of each customer is used), sorted by id so row i belongs to node i.
# Numeric columns are standardized, gender is one-hot encoded.
customer_num_features = ['age', 'lat', 'long', 'city_pop']
customer_cat_features = ['gender']
customer_features_df = df.drop_duplicates(subset=['customer_id']).set_index('customer_id').sort_index()
customer_preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), customer_num_features),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), customer_cat_features)
])
processed_customer_features = customer_preprocessor.fit_transform(customer_features_df[customer_num_features + customer_cat_features])
graph_data['customer'].x = torch.tensor(processed_customer_features, dtype=torch.float)

# Merchant node features: standardized merch_lat / merch_long taken from the
# first transaction row of each merchant.
# NOTE(review): if merchant coordinates vary per transaction, this keeps only
# the first observed pair -- confirm that is intended.
merchant_num_features = ['merch_lat', 'merch_long']
merchant_features_df = df.drop_duplicates(subset=['merchant_id']).set_index('merchant_id').sort_index()
processed_merchant_features = StandardScaler().fit_transform(merchant_features_df[merchant_num_features])
graph_data['merchant'].x = torch.tensor(processed_merchant_features, dtype=torch.float)

# Edge

# Transaction edges: one customer -> merchant edge per row of df (not
# de-duplicated), so edge k corresponds to transaction row k.
edge_type = ('customer', 'performs_transaction', 'merchant')
edges = df[['customer_id', 'merchant_id']].values.T
edge_index = torch.tensor(edges, dtype=torch.long)
graph_data[edge_type].edge_index = edge_index
# Reverse edge type: same edges with source and destination rows swapped.
rev_edge_type = (edge_type[2], f"rev_{edge_type[1]}", edge_type[0])
graph_data[rev_edge_type].edge_index = edge_index[[1, 0]]
print(f"transaction edge: {edge_type} ({edge_index.shape[1]})")

# Structural edges: unique (source id, destination id) pairs observed in df.
# NOTE(review): 'lives_in' and 'located_in' both use the same `city` column;
# confirm that `city` really describes the merchant's location as well.
structural_edge_definitions = {
    ('merchant', 'has_category', 'category'): ['merchant_id', 'category_id'],
    ('customer', 'has_job', 'job'): ['customer_id', 'job_id'],
    ('customer', 'lives_in', 'city'): ['customer_id', 'city_id'],
    ('city', 'is_in_state', 'state'): ['city_id', 'state_id'],
    ('merchant', 'located_in', 'city'): ['merchant_id', 'city_id']
}
# This loop rebinds edge_type / edges / edge_index / rev_edge_type, so after
# it they no longer refer to the transaction edges defined above.
for edge_type, (src_col, dst_col) in structural_edge_definitions.items():
    edges = df[[src_col, dst_col]].drop_duplicates().values.T
    edge_index = torch.tensor(edges, dtype=torch.long)
    graph_data[edge_type].edge_index = edge_index
    rev_edge_type = (edge_type[2], f"rev_{edge_type[1]}", edge_type[0])
    graph_data[rev_edge_type].edge_index = edge_index[[1, 0]]


# Cyclical encodings of transaction hour and day of week.
df['hour_sin'] = np.sin(2 * np.pi * df['trans_datetime'].dt.hour / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['trans_datetime'].dt.hour / 24)
df['day_of_week_sin'] = np.sin(2 * np.pi * df['trans_datetime'].dt.dayofweek / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['trans_datetime'].dt.dayofweek / 7)

edge_feature_cols = [
    'amt',
    'distance_km', # add distance_km to the edge features
    'hour_sin', 'hour_cos',
    'day_of_week_sin', 'day_of_week_cos'
]
# Standardize the per-transaction features; rows stay in df order and thus
# line up with the transaction edges created from the same rows.
edge_preprocessor = StandardScaler()
processed_edge_features = edge_preprocessor.fit_transform(df[edge_feature_cols])

# NOTE(review): the GNN cell further down passes only edge_index_dict to the
# model, so these edge attributes appear to be unused there -- confirm.
graph_data['customer', 'performs_transaction', 'merchant'].edge_attr = torch.tensor(processed_edge_features, dtype=torch.float)

print(f"dim: {processed_edge_features.shape[1]}")

print("Structure")
print(graph_data)

In [None]:
# GNN

class HeteroGNN_Optimized(torch.nn.Module):
    """Two-layer heterogeneous GraphSAGE encoder.

    Node types listed in ``node_feature_dims`` are projected from their input
    features with a linear layer; all other node types get a learnable
    embedding table indexed by node id. Every node type is then batch
    normalized and passed through two ``HeteroConv`` layers of ``SAGEConv``
    (one convolution per edge type, summed per destination node type).

    Args:
        hidden_channels: Width of the input projections / embeddings and of
            the first convolution layer.
        out_channels: Width of the embeddings returned by the second layer.
        node_types: Iterable of node type names.
        edge_types: Iterable of (src, relation, dst) edge type triples.
        node_feature_dims: Mapping node type -> input feature dimension for
            the node types that carry features.
        dropout_rate: Dropout probability applied after the first layer.
        num_nodes_dict: Optional mapping node type -> number of nodes, used to
            size the embedding tables of feature-less node types. When omitted
            the counts are read from the module-level ``graph_data`` object,
            which must then exist (previous behaviour).
    """

    def __init__(self, hidden_channels, out_channels, node_types, edge_types, node_feature_dims, dropout_rate=0.5, num_nodes_dict=None):
        super().__init__()

        self.embeddings = torch.nn.ModuleDict()
        self.linears = torch.nn.ModuleDict()
        self.batch_norms = torch.nn.ModuleDict()
        for node_type in node_types:
            if node_type not in node_feature_dims:
                if num_nodes_dict is not None:
                    num_nodes = num_nodes_dict[node_type]
                else:
                    # Backward-compatible fallback to the notebook global.
                    num_nodes = graph_data[node_type].num_nodes
                # int(): node counts derived from pandas are NumPy integers.
                self.embeddings[node_type] = torch.nn.Embedding(int(num_nodes), hidden_channels)
            self.batch_norms[node_type] = BatchNorm(hidden_channels)

        for node_type, dim in node_feature_dims.items():
            self.linears[node_type] = Linear(dim, hidden_channels)

        # (-1, -1): input sizes of source / destination nodes are inferred
        # lazily on the first forward pass.
        self.conv1 = HeteroConv({
            edge_type: SAGEConv((-1, -1), hidden_channels) for edge_type in edge_types
        }, aggr='sum')

        self.conv2 = HeteroConv({
            edge_type: SAGEConv((-1, -1), out_channels) for edge_type in edge_types
        }, aggr='sum')

        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, x_dict, edge_index_dict):
        """Compute node embeddings for every node type.

        Args:
            x_dict: Mapping node type -> feature matrix (for types with a
                linear projection) or node id tensor (for embedded types).
            edge_index_dict: Mapping edge type -> edge index tensor.

        Returns:
            Mapping node type -> output of the second convolution layer.
        """
        x_dict_processed = {}
        for node_type, x in x_dict.items():
            if node_type in self.linears:
                x = self.linears[node_type](x)
            else:
                x = self.embeddings[node_type](x)
            x = self.batch_norms[node_type](x).relu()
            x_dict_processed[node_type] = x

        x_dict = self.conv1(x_dict_processed, edge_index_dict)
        x_dict = {key: self.dropout(x.relu()) for key, x in x_dict.items()}

        x_dict = self.conv2(x_dict, edge_index_dict)

        return x_dict


# Train

# Train the GNN on a link-prediction task over the transaction edges:
# positive examples are existing customer -> merchant edges, negatives are
# sampled by the loader, and the score of a pair is the dot product of the
# two node embeddings.

# Seed PyTorch so weight initialisation, shuffling and negative sampling are
# repeatable across runs (GPU kernels may still add small nondeterminism).
GNN_SEED = 335
torch.manual_seed(GNN_SEED)

train_graph = graph_data
# Input feature width of every node type that carries a feature matrix `x`.
node_feature_dims = {}
for node_type in graph_data.node_types:
    if hasattr(graph_data[node_type], 'x') and graph_data[node_type].x is not None:
        node_feature_dims[node_type] = graph_data[node_type].x.shape[1]

gnn_model = HeteroGNN_Optimized(
    hidden_channels=256,
    out_channels=128,
    node_types=graph_data.node_types,
    edge_types=train_graph.edge_types,
    node_feature_dims=node_feature_dims,
    dropout_rate=0.5
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gnn_model = gnn_model.to(device)


edge_label_index = train_graph['customer', 'performs_transaction', 'merchant'].edge_index
train_loader = LinkNeighborLoader(
    data=train_graph,
    num_neighbors=[20, 10],
    edge_label_index=(('customer', 'performs_transaction', 'merchant'), edge_label_index),
    neg_sampling_ratio=5.0,
    batch_size=2048,
    shuffle=True,
    num_workers=0,
    # Pinned host memory only helps when batches are copied to a GPU.
    pin_memory=torch.cuda.is_available()
)
optimizer = torch.optim.Adam(gnn_model.parameters(), lr=0.001)

# Halve the learning rate when the epoch loss stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=2, min_lr=0.00001)


for epoch in range(1, 11):
    total_loss = 0
    total_examples = 0
    gnn_model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch:02d}'):
        batch = batch.to(device)
        optimizer.zero_grad()

        # Node types with features feed `x`; the others feed their global
        # node ids (`n_id`), which index the model's embedding tables.
        x_dict_input = {
            node_type: batch[node_type].x if hasattr(batch[node_type], 'x') else batch[node_type].n_id
            for node_type in batch.node_types
        }
        z_dict = gnn_model(x_dict_input, batch.edge_index_dict)

        # Separate name so the full-graph `edge_label_index` above is not
        # overwritten by the per-batch indices.
        batch_edge_label_index = batch['customer', 'performs_transaction', 'merchant'].edge_label_index
        edge_label = batch['customer', 'performs_transaction', 'merchant'].edge_label
        src_emb = z_dict['customer'][batch_edge_label_index[0]]
        dst_emb = z_dict['merchant'][batch_edge_label_index[1]]
        pred = (src_emb * dst_emb).sum(dim=-1)
        loss = F.binary_cross_entropy_with_logits(pred, edge_label)

        loss.backward()
        optimizer.step()
        # Weight the batch loss by its size so avg_loss is a per-example mean.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()

    avg_loss = total_loss / total_examples

    scheduler.step(avg_loss)

    # Report the learning rate too, so scheduler reductions are visible
    # (it was previously read but never shown).
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch}, Avg Loss: {avg_loss:.4f}, LR: {current_lr:.6f}")


In [None]:
# Compute customer / merchant embeddings with the trained GNN on the full
# graph, join them onto the transaction table and cross-validate the
# tabular classifiers on the augmented feature set.
gnn_model.eval()
gnn_model = gnn_model.to('cpu')
graph_data = graph_data.to('cpu')

with torch.no_grad():
    # Node types with features feed `x`; the others feed all node ids, which
    # index the model's embedding tables.
    full_x_dict_input = {
        node_type: (
            graph_data[node_type].x
            if hasattr(graph_data[node_type], 'x')
            else torch.arange(graph_data[node_type].num_nodes)
        )
        for node_type in graph_data.node_types
    }
    final_embeddings = gnn_model(full_x_dict_input, graph_data.edge_index_dict)

customer_embeddings = final_embeddings['customer'].cpu().numpy()
merchant_embeddings = final_embeddings['merchant'].cpu().numpy()

# Row i of an embedding matrix belongs to node id i; map ids back to the
# original card number / merchant name to join on the raw columns.
rev_customer_mapping = {v: k for k, v in customer_mapping.items()}
rev_merchant_mapping = {v: k for k, v in merchant_mapping.items()}

customer_emb_df = pd.DataFrame(customer_embeddings, columns=[f'c_emb_{i}' for i in range(customer_embeddings.shape[1])])
customer_emb_df['cc_num'] = customer_emb_df.index.map(rev_customer_mapping)

merchant_emb_df = pd.DataFrame(merchant_embeddings, columns=[f'm_emb_{i}' for i in range(merchant_embeddings.shape[1])])
merchant_emb_df['merchant'] = merchant_emb_df.index.map(rev_merchant_mapping)

df_augmented = pd.merge(df, customer_emb_df, on='cc_num', how='left')
df_augmented = pd.merge(df_augmented, merchant_emb_df, on='merchant', how='left')
print(f"dim: {df_augmented.shape}")

TARGET = 'is_fraud'
features_to_drop = [
    TARGET, 'trans_date_trans_time', 'dob', 'trans_datetime', 'dob_datetime',
    'cc_num', 'first', 'last', 'street', 'trans_num', 'merchant',
    'category', 'gender', 'job', 'city', 'state',
]
# Also drop every column whose name contains '_id' (the factorized ids).
features_to_drop.extend([col for col in df_augmented.columns if '_id' in col])

X_aug = df_augmented.drop(columns=features_to_drop, errors='ignore')
y_aug = df_augmented[TARGET]
# Replace non-alphanumeric characters in column names with '_'.
X_aug.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X_aug.columns]


# Use the GPU when one is visible, otherwise fall back to CPU so the cell
# also runs on a CPU-only runtime instead of failing at fit time.
USE_GPU = torch.cuda.is_available()
XGB_DEVICE = 'cuda' if USE_GPU else 'cpu'
CATBOOST_TASK_TYPE = 'GPU' if USE_GPU else 'CPU'

models_to_test = {
    "DecisionTree": (DecisionTreeClassifier, {'random_state': 335, 'class_weight': 'balanced'}),
    "RandomForest": (RandomForestClassifier, {'random_state': 335, 'n_jobs': -1, 'min_samples_leaf': 3}),
    "XGBoost": (xgb.XGBClassifier, {'random_state': 335, 'eval_metric': 'aucpr', 'tree_method': 'hist', 'device': XGB_DEVICE}),
    "CatBoost": (cb.CatBoostClassifier, {'random_state': 335, 'task_type': CATBOOST_TASK_TYPE, 'verbose': 0})
}

# Same split seed as the baseline cross-validation cells (335), so the
# augmented models are scored on the same folds and results are comparable.
N_SPLITS = 5
cv_strategy = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=335)
all_model_results = {}

for model_name, (model_class, model_params) in models_to_test.items():
    print(f"\n{model_name}")

    all_fold_results = []

    for fold, (train_idx, val_idx) in enumerate(tqdm(cv_strategy.split(X_aug, y_aug), total=N_SPLITS, desc=f"{model_name}")):
        X_train, X_val = X_aug.iloc[train_idx], X_aug.iloc[val_idx]
        y_train, y_val = y_aug.iloc[train_idx], y_aug.iloc[val_idx]

        # Negative / positive ratio of the training fold, used as the
        # positive-class weight for XGBoost only.
        dynamic_scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
        current_model_params = model_params.copy()

        if model_name in ["XGBoost"]:
            current_model_params['scale_pos_weight'] = dynamic_scale_pos_weight

        current_model = model_class(**current_model_params)

        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', current_model)
        ])

        pipeline.fit(X_train, y_train)

        y_pred_proba = pipeline.predict_proba(X_val)[:, 1]

        precisions, recalls, thresholds = precision_recall_curve(y_val, y_pred_proba)
        # F1 at every point of the PR curve; 0 where precision + recall == 0.
        fscores = np.divide(2 * precisions * recalls, precisions + recalls,
                            out=np.zeros_like(precisions), where=(precisions + recalls) != 0)

        ix = np.argmax(fscores)
        # precision/recall have one more entry than thresholds; the final
        # point has no threshold, so fall back to 1.0 for it.
        best_threshold = thresholds[ix] if ix < len(thresholds) else 1.0
        y_pred_class_optimal = (y_pred_proba >= best_threshold).astype(int)

        fold_scores = {
            'PR AUC': average_precision_score(y_val, y_pred_proba),
            'F1-Score': f1_score(y_val, y_pred_class_optimal, zero_division=0),
            'Precision': precision_score(y_val, y_pred_class_optimal, zero_division=0),
            'Recall': recall_score(y_val, y_pred_class_optimal, zero_division=0),
            'Optimal Threshold': best_threshold
        }
        all_fold_results.append(fold_scores)

    results_df = pd.DataFrame(all_fold_results)
    mean_scores = results_df.mean()
    all_model_results[model_name] = mean_scores

for model_name, mean_scores in all_model_results.items():
    print(f"\n{model_name}")
    print(mean_scores.to_string())
