# CATCHM pipeline [demo]

This notebook is in many ways identical to the *CATCHM_demo.ipynb* notebook.
However, in this notebook CATCHM is implemented as a [scikit-learn compatible pipeline object](https://scikit-learn.org/stable/modules/compose.html). 
This allows you to experiment with different classifiers and replace the default [XGBoost model](https://xgboost.readthedocs.io/en/stable/python/index.html).

In [1]:
!pip install --upgrade numpy pandas nodevectors xgboost fucc optuna scikit-learn cupy-cuda12x numba scipy==1.11.4 networkx==2.5.1

Collecting numpy
  Downloading numpy-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nodevectors
  Downloading nodevectors-0.1.23.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting fucc
  Downloading fucc-0.0.8-py3-none-any.whl.metadata (650 bytes)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting cupy-cuda12x
  Downloading cupy_cuda12x-13.4.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting numba
  Downloading numba-0.61.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting scipy==1.11.4
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manyl

In [2]:
from networkx.readwrite import edgelist
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
import torch
import networkx as nx
from nodevectors import Node2Vec
import pandas as pd
from multiprocessing import Pool
from functools import partial
from tqdm import tqdm

In [3]:
def inductive_pooling(edgelist, embeddings, G, workers, gamma=1000, dict_node=None, 
                     average_embedding=True, use_gpu=False, device=None):
    """
    Compute embeddings for unseen edges by pooling over known neighbours.

    Every entry of `edgelist` is turned into a row ``[row_id, v[0], v[1]]``,
    the rows are split into `workers` chunks, and each chunk is handled either
    by `gpu_inductive_pooling_batch` (CUDA device and more than 1000 rows) or
    by `inductive_pooling_chunk` in a multiprocessing pool.

    Parameters
    ----------
    edgelist : iterable
        Items indexable as ``v[0]`` (sender) and ``v[1]`` (receiver).
    embeddings : np.ndarray
        2-D array of trained embeddings, indexed by integer node id.
    G : graph
        Object offering ``has_node`` and ``neighbors`` (used by the workers).
    workers : int
        Number of chunks and, on the CPU path, pool processes.
    gamma : int, default=1000
        Maximum number of neighbours pooled; forwarded to the workers.
    dict_node : unused
        Kept for interface compatibility; never read.
    average_embedding : bool, default=True
        When True, the column-wise mean of `embeddings` is the fallback for
        edges whose endpoints are both unknown; when False the fallback is
        ``None``.
    use_gpu : bool, default=False
    device : torch.device or None

    Returns
    -------
    np.ndarray
        Array of shape ``(len(edgelist), embeddings.shape[1])``.
    """
    # NumPy coerces every field to a string here; row ids are parsed back below.
    edgearray = np.array([[str(row_id), edge[0], edge[1]] for row_id, edge in enumerate(edgelist)])

    on_cuda = use_gpu and device and device.type == 'cuda'

    # Fallback embedding for edges whose endpoints are both unknown.
    if average_embedding:
        if on_cuda:
            emb_tensor = torch.tensor(embeddings, device=device)
            avg_emb = emb_tensor.mean(dim=0).cpu().numpy()
        else:
            avg_emb = embeddings.mean(axis=0)
    else:
        avg_emb = None

    # Split once and reuse for both the iteration and the progress-bar total.
    chunks = np.array_split(edgearray, workers)

    if on_cuda and len(edgearray) > 1000:
        # Large input on GPU: chunks are processed sequentially in this process.
        if workers > 1:
            print("Note: Using GPU with multiple workers. This may not be optimal for all systems.")

        result_list = [
            gpu_inductive_pooling_batch(batch, embeddings, G, gamma=gamma,
                                        average_embedding=avg_emb, device=device)
            for batch in tqdm(chunks, total=len(chunks))
        ]
    else:
        # Small input or no GPU: fan the chunks out over a process pool.
        # gamma is forwarded explicitly; it used to be dropped, which pinned
        # the workers to their own default of 1000.
        chunk_worker = partial(inductive_pooling_chunk,
                               embeddings=embeddings,
                               G=G,
                               gamma=gamma,
                               average_embedding=avg_emb)
        with Pool(workers) as p:
            result_list = list(tqdm(p.imap(chunk_worker, chunks), total=len(chunks)))

    # Scatter the per-chunk dictionaries back into input order.
    new_embeddings = np.zeros((len(edgelist), embeddings.shape[1]))
    for result_dict in result_list:
        for row_id, emb in result_dict.items():
            new_embeddings[int(row_id), :] = emb

    return new_embeddings

def gpu_inductive_pooling_batch(edgearray, embeddings, G, gamma=1000, average_embedding=None, device=None):
    """
    Torch-backed inductive pooling for one batch of edges.

    Each row of `edgearray` is ``(transfer, sender, receiver)``. The result
    maps every `transfer` key to an embedding chosen as follows:

    1. sender and receiver both in `G` and sharing neighbours: the embedding
       of the shared neighbour with the largest integer id;
    2. otherwise, sender in `G`: pooled embedding of the sender's neighbours;
    3. otherwise, receiver in `G`: pooled embedding of the receiver's neighbours;
    4. otherwise: `average_embedding` (moved to NumPy if it is a tensor).

    Neighbour ids are converted with ``int(...)`` and used as row indices
    into `embeddings`.
    """
    # Reuse an existing tensor as-is; otherwise build one on the requested device.
    emb_t = embeddings if torch.is_tensor(embeddings) else torch.tensor(embeddings, device=device)

    pooled = {}

    for transfer, sender, receiver in edgearray:
        sender_known = G.has_node(sender)
        receiver_known = G.has_node(receiver)

        if sender_known and receiver_known:
            shared = set(G.neighbors(sender)) & set(G.neighbors(receiver))
            shared_ids = sorted(int(node) for node in shared)
            if shared_ids:
                # The largest id counts as the most recent mutual neighbour.
                pooled[transfer] = emb_t[shared_ids[-1]].cpu().numpy()
                continue

        if sender_known:
            neighbour_ids = [int(node) for node in G.neighbors(sender)]
            pooled[transfer] = get_pooled_embedding_gpu(neighbour_ids, emb_t, gamma, device)
        elif receiver_known:
            neighbour_ids = [int(node) for node in G.neighbors(receiver)]
            pooled[transfer] = get_pooled_embedding_gpu(neighbour_ids, emb_t, gamma, device)
        elif torch.is_tensor(average_embedding):
            pooled[transfer] = average_embedding.cpu().numpy()
        else:
            pooled[transfer] = average_embedding

    return pooled

def get_pooled_embedding_gpu(neighbors, embeddings_tensor, gamma, device):
    """
    Mean embedding of the last `gamma` entries of `neighbors`, computed with torch.

    `neighbors` holds integer row indices into `embeddings_tensor`. An empty
    list yields a zero vector with one entry per embedding column. The result
    is always returned as a NumPy array on the host.
    """
    n_neighbors = len(neighbors)
    if n_neighbors == 0:
        # Nothing to pool: fall back to an all-zero vector.
        zero_vec = torch.zeros(embeddings_tensor.shape[1], device=device)
        return zero_vec.cpu().numpy()

    # Keep only the trailing `gamma` indices (the whole list when it is shorter).
    first_kept = max(0, n_neighbors - gamma)
    recent_idx = torch.tensor(neighbors, device=device)[first_kept:]

    recent_rows = embeddings_tensor.index_select(0, recent_idx)
    return recent_rows.mean(dim=0).cpu().numpy()

def inductive_pooling_chunk(edgearray, embeddings, G, gamma=1000, average_embedding=None):
    """
    NumPy inductive pooling for one chunk of ``(transfer, sender, receiver)`` rows.

    Returns a dict keyed by `transfer`. The value is, in order of preference:
    the embedding of the highest-id neighbour shared by sender and receiver;
    the pooled embedding of the sender's neighbours; the pooled embedding of
    the receiver's neighbours; or `average_embedding` when neither endpoint
    is a node of `G`.
    """
    result = {}
    for transfer, sender, receiver in edgearray:
        has_sender = G.has_node(sender)
        has_receiver = G.has_node(receiver)

        if has_sender and has_receiver:
            common = set(G.neighbors(sender)).intersection(G.neighbors(receiver))
            # Node ids are strings; compare them numerically.
            common_ids = sorted(map(int, common))
            if len(common_ids) > 0:
                # Highest id == most recent mutual neighbour.
                result[transfer] = embeddings[common_ids[-1], :]
                continue

        if has_sender:
            neighbour_ids = [int(node) for node in G.neighbors(sender)]
            result[transfer] = get_pooled_embedding(neighbour_ids, embeddings, gamma)
        elif has_receiver:
            neighbour_ids = [int(node) for node in G.neighbors(receiver)]
            result[transfer] = get_pooled_embedding(neighbour_ids, embeddings, gamma)
        else:
            result[transfer] = average_embedding

    return result
                            
def get_pooled_embedding(neighbors, embeddings, gamma):
    """
    Mean embedding of the last `gamma` entries of `neighbors` (NumPy version).

    `neighbors` holds integer row indices into the 2-D `embeddings` array.
    An empty list yields a zero vector with one entry per embedding column.
    """
    if not len(neighbors):
        # Nothing to pool: fall back to an all-zero vector.
        return np.zeros(embeddings.shape[1])

    candidate_rows = embeddings[neighbors, :]
    # Number of trailing rows to keep, capped at what is available.
    keep = min(gamma, candidate_rows.shape[0])
    return candidate_rows[-keep:, :].mean(axis=0)

# Assume EpochLogger implementation is needed
class EpochLogger:
    """
    Minimal epoch counter that reports each finished epoch on stdout.

    NOTE: a later cell of this notebook defines another class with the same
    name (a gensim ``CallbackAny2Vec`` subclass); once that cell has run, it
    replaces this one.
    """

    def __init__(self):
        # Number of epochs completed so far.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Advance the counter, then print it; `model` is not used."""
        self.epoch = self.epoch + 1
        print(f"Completed epoch {self.epoch}")

In [4]:
import networkx as nx
import pandas as pd
import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def create_network(X_train, y_train, use_gpu=True, batch_size=10000, verbose=True):
    """
    Build the transaction graph used for embedding-based fraud detection.

    The graph contains one node per row of `X_train` (``txn_<position>``),
    one node per distinct customer (``cust_<customerId>``), one node per
    distinct merchant (``merch_<merchantName>_<merchantCountryCode>``) and a
    single ``ARTIFICIAL_FRAUD_NODE``. Edges connect:

    * every transaction to its customer and its merchant;
    * merchants whose (lat, lon) pairs lie within 0.01 of each other
      (Euclidean distance in coordinate units), weighted ``1 - dist * 100``;
    * pairs of transactions of the same customer whose ``timeDelta`` values
      differ by less than 86400;
    * every transaction labelled 1 to the artificial fraud node.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must provide the columns ``customerId``, ``merchantName``,
        ``merchantCountryCode``, ``amount_zscore``, ``amount_to_avg_ratio``,
        ``is_foreign_transaction``, ``cvv_match``, ``exp_date_match``,
        ``merchant_lat``, ``merchant_lon`` and ``timeDelta``.
    y_train : pandas.Series, array-like or None
        Fraud labels (1 for fraud), positionally aligned with `X_train`.
        When None, no fraud edges are added.
    use_gpu : bool, default=True
        Use CUDA for the merchant proximity search when available.
    batch_size : int, default=10000
        Number of merchants per distance batch on the GPU path.
    verbose : bool, default=True
        Print progress messages and show progress bars.

    Returns
    -------
    G : networkx.Graph
    """
    # Resolve the compute device, falling back to CPU when CUDA is missing.
    if use_gpu:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if verbose and device.type == 'cuda':
            print(f"Using GPU for network creation: {torch.cuda.get_device_name(0)}")
        elif verbose:
            print("GPU requested but not available. Using CPU instead.")
    else:
        device = torch.device('cpu')
        if verbose:
            print("Using CPU as requested.")

    def _progress(items):
        # Wrap in a progress bar only when the caller asked for output.
        return tqdm(items) if verbose else items

    G = nx.Graph()

    # Node identifiers, one list entry per row of X_train.
    transaction_ids = [f"txn_{i}" for i in range(len(X_train))]
    customer_ids = [f"cust_{str(cid)}" for cid in X_train['customerId']]
    merchant_ids = [f"merch_{name}_{country}" for name, country in
                    zip(X_train['merchantName'], X_train['merchantCountryCode'])]

    if verbose:
        print("Adding transaction nodes...")

    # Extract each attribute column once. Building a row Series with
    # X_train.iloc[i] for every attribute of every transaction is far slower
    # and yields the same values.
    amount_zscore = X_train['amount_zscore'].to_numpy()
    amount_to_avg_ratio = X_train['amount_to_avg_ratio'].to_numpy()
    is_foreign = X_train['is_foreign_transaction'].to_numpy()
    cvv_match = X_train['cvv_match'].to_numpy()
    exp_date_match = X_train['exp_date_match'].to_numpy()

    for i, txn_id in enumerate(_progress(transaction_ids)):
        G.add_node(txn_id,
                   type='transaction',
                   amount_zscore=amount_zscore[i],
                   amount_to_avg_ratio=amount_to_avg_ratio[i],
                   is_foreign=is_foreign[i],
                   cvv_match=cvv_match[i],
                   exp_date_match=exp_date_match[i])

    if verbose:
        print("Adding customer and merchant nodes...")

    G.add_nodes_from(set(customer_ids), type='customer')
    G.add_nodes_from(set(merchant_ids), type='merchant')

    if verbose:
        print("Creating transaction-entity edges...")

    for i, txn_id in enumerate(_progress(transaction_ids)):
        G.add_edge(txn_id, customer_ids[i], edge_type='customer_transaction')
        G.add_edge(txn_id, merchant_ids[i], edge_type='merchant_transaction')

    if verbose:
        print("Processing merchant proximity...")

    # One coordinate pair per merchant (first occurrence wins); merchants
    # with a missing coordinate are dropped.
    merchant_df = pd.DataFrame({
        'merchant_id': merchant_ids,
        'lat': X_train['merchant_lat'].values,
        'lon': X_train['merchant_lon'].values
    }).drop_duplicates('merchant_id')
    valid_merchant_df = merchant_df.dropna()

    if len(valid_merchant_df) > 0:
        valid_merchant_ids = valid_merchant_df['merchant_id'].tolist()
        merchant_edges = []

        if use_gpu and device.type == 'cuda':
            # float64 plus a direct (non-matmul) distance: the expansion
            # ||a||^2 + ||b||^2 - 2ab in float32 cancels catastrophically for
            # lat/lon magnitudes and cannot resolve a 0.01 threshold.
            coords = torch.tensor(valid_merchant_df[['lat', 'lon']].values,
                                  device=device, dtype=torch.float64)
            n_merchants = len(valid_merchant_ids)

            for start in range(0, n_merchants, batch_size):
                stop = min(start + batch_size, n_merchants)
                dist_matrix = torch.cdist(coords[start:stop], coords,
                                          compute_mode='donot_use_mm_for_euclid_dist')

                close_pairs = torch.nonzero(dist_matrix < 0.01, as_tuple=False)
                if close_pairs.numel() == 0:
                    continue

                # One device-to-host transfer per batch instead of one per pair.
                pair_dists = dist_matrix[close_pairs[:, 0], close_pairs[:, 1]].cpu().tolist()
                for (row, col), dist in zip(close_pairs.cpu().tolist(), pair_dists):
                    idx1 = row + start
                    if idx1 != col:  # Avoid self-loops
                        merchant_edges.append((valid_merchant_ids[idx1], valid_merchant_ids[col],
                                               {'edge_type': 'location_proximity', 'weight': 1-dist*100}))
        else:
            # CPU path: radius search with scikit-learn's NearestNeighbors.
            coords = valid_merchant_df[['lat', 'lon']].values
            nbrs = NearestNeighbors(radius=0.01, algorithm='ball_tree').fit(coords)
            distances, indices = nbrs.radius_neighbors(coords)

            for i, (idx_list, dist_list) in enumerate(zip(indices, distances)):
                for j, dist in zip(idx_list, dist_list):
                    if i != j:  # Avoid self-loops
                        merchant_edges.append((valid_merchant_ids[i], valid_merchant_ids[j],
                                               {'edge_type': 'location_proximity', 'weight': 1-dist*100}))

        G.add_edges_from(merchant_edges)

    if verbose:
        print("Processing sequential transactions...")

    # Group (transaction id, timeDelta) pairs by customer.
    # NOTE(review): the threshold below is described as 24 hours in seconds,
    # but this notebook's feature-engineering cell derives `timeDelta` as hours
    # since the customer's previous transaction, not as an absolute time in
    # seconds -- confirm which quantity is intended here.
    time_values = X_train['timeDelta'].to_numpy()
    customer_txn_map = {}
    for txn_id, cust_id, time_value in zip(transaction_ids, customer_ids, time_values):
        customer_txn_map.setdefault(cust_id, []).append((txn_id, time_value))

    sequential_edges = []
    for cust_id, txns in customer_txn_map.items():
        if len(txns) > 1:
            txns.sort(key=lambda x: x[1])

            for i in range(len(txns) - 1):
                for j in range(i + 1, len(txns)):
                    time_delta = abs(txns[j][1] - txns[i][1])
                    if time_delta < 86400:  # Within 24 hours (in seconds)
                        sequential_edges.append((txns[i][0], txns[j][0], {
                            'edge_type': 'sequential_transactions',
                            'time_diff': time_delta
                        }))

    G.add_edges_from(sequential_edges)

    if verbose:
        print("Adding fraud node connections...")

    # The artificial fraud node is always present, even without labels.
    fraud_node = "ARTIFICIAL_FRAUD_NODE"
    G.add_node(fraud_node, type='artificial_fraud')

    if y_train is not None:
        # Positional access that works for Series, arrays and lists alike.
        labels = np.asarray(y_train)
        fraud_edges = [(txn_id, fraud_node, {'edge_type': 'is_fraud'})
                       for i, txn_id in enumerate(transaction_ids) if labels[i] == 1]
        G.add_edges_from(fraud_edges)

    if verbose:
        print(f"Network creation complete. Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

    return G

# Optional: A utility function to estimate memory requirements
def estimate_network_memory(X_train, y_train):
    """
    Roughly estimate the size of the graph that would be built from the data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Needs the columns ``customerId``, ``merchantName`` and
        ``merchantCountryCode``.
    y_train : pandas.Series
        Fraud labels; their sum is used as the number of fraud edges.

    Returns
    -------
    dict
        ``n_nodes``, ``n_edges``, ``estimated_memory_mb`` and
        ``recommended_gpu_memory_gb`` (at least 2). All figures are
        heuristics built on the fixed assumptions noted inline.
    """
    n_transactions = len(X_train)
    n_customers = X_train['customerId'].nunique()
    n_merchants = X_train[['merchantName', 'merchantCountryCode']].drop_duplicates().shape[0]

    # Nodes: transactions, customers, merchants, plus the fraud node.
    n_nodes = n_transactions + n_customers + n_merchants + 1

    # Each transaction connects to one customer and one merchant.
    n_edges_base = n_transactions * 2
    # Plain int so the returned dict holds no NumPy scalars.
    n_fraud_edges = int(y_train.sum())

    # Assumes 5% of all merchant pairs are close to each other.
    n_merchant_proximity = int(n_merchants * n_merchants * 0.05)

    # Assumes 10% of same-customer transaction pairs are sequential.
    # Guard the average so an empty frame does not divide by zero.
    avg_txn_per_customer = n_transactions / n_customers if n_customers else 0.0
    n_sequential = int(n_customers * (avg_txn_per_customer * (avg_txn_per_customer - 1) / 2) * 0.1)

    total_edges = n_edges_base + n_fraud_edges + n_merchant_proximity + n_sequential

    # Assumed per-element footprint, in bytes.
    memory_per_node = 100
    memory_per_edge = 60

    estimated_memory = (n_nodes * memory_per_node + total_edges * memory_per_edge) / (1024 * 1024)  # in MB

    return {
        'n_nodes': n_nodes,
        'n_edges': total_edges,
        'estimated_memory_mb': estimated_memory,
        'recommended_gpu_memory_gb': max(2, int(estimated_memory / 1024 * 3))  # 3x buffer
    }

In [5]:
from gensim.models.callbacks import CallbackAny2Vec

def check_edgelist(edgelist):
    """
    Return `edgelist` as a list.

    A list is passed through unchanged (same object); any other iterable is
    materialised with ``list(...)``. The value is returned to the caller --
    without the return, the conversion would be discarded.
    """
    if not isinstance(edgelist, list):
        edgelist = list(edgelist)
    return edgelist
    

class EpochLogger(CallbackAny2Vec):
    '''Training callback that prints a line at the start and end of every epoch.'''

    def __init__(self):
        # Zero-based index of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        '''Announce the start of the current epoch; `model` is not used.'''
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        '''Announce the end of the current epoch, then move to the next index.'''
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        self.epoch = self.epoch + 1

In [6]:
class InductiveDeepwalk(BaseEstimator, TransformerMixin):
    """
    DeepWalk-style transaction embeddings with an inductive extension.

    `fit` builds the transaction graph with `create_network`, trains a
    `Node2Vec` model on it and stores one embedding per transaction node.
    `transform` returns those stored embeddings on its first call after
    `fit`, and runs `inductive_pooling` on every later call.
    
    Parameters
    ----------
    dimensions : int
        Number of dimensions in the embeddings
    walk_len : int
        Length of each random walk
    walk_num : int
        Number of random walks per node (passed to Node2Vec as ``epochs``)
    epochs : int, default=5
        Number of training epochs. Stored on the instance but not currently
        forwarded to the embedding model.
    workers : int, default=1
        Number of parallel workers
    window_size : int, default=5
        Context window size for Word2Vec
    verbose : int, default=0
        Verbosity level
    use_gpu : bool, default=True
        Whether to use GPU acceleration
    """
    def __init__(self, dimensions, walk_len, walk_num, epochs=5, workers=1, window_size=5, verbose=0, use_gpu=True):
        self.dimensions = dimensions
        self.walk_len = walk_len
        self.walk_num = walk_num
        self.epochs = epochs
        self.workers = workers
        self.window_size = window_size
        # True until `transform` has been called once after `fit`.
        self.first_fit = True
        self.verbose = verbose
        self.use_gpu = use_gpu
        
        # Resolve the compute device, falling back to CPU when CUDA is missing.
        if self.use_gpu:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            if self.verbose > 0:
                print(f"Using device: {self.device}")
            if self.device.type == 'cpu':
                print("Warning: GPU requested but not available. Using CPU instead.")
        else:
            self.device = torch.device('cpu')
            if self.verbose > 0:
                print("Using CPU as requested.")
        
    def fit(self, X, y=None):
        """
        Fit the model with X.
        
        Parameters
        ----------
        X : pandas.DataFrame
            Training data
        y : pandas.Series
            Target values (fraud labels)
            
        Returns
        -------
        self : object
            Returns self
        """
        if self.verbose > 0:
            print("Parsing input into network format.")
        
        # Honour this estimator's own GPU and verbosity settings instead of
        # create_network's defaults (use_gpu=True, verbose=True).
        self.G = create_network(X, y, use_gpu=self.use_gpu, verbose=self.verbose > 0)
        
        # Transaction nodes, in graph insertion order.
        transaction_nodes = [n for n, d in self.G.nodes(data=True) if d.get('type') == 'transaction']
        
        # Numeric suffix of each "txn_<i>" node name, kept as a string.
        self.transaction_ids = [n.split('_')[1] for n in transaction_nodes]
        
        callbacks = []
        if self.verbose > 0:
            print("Running network representation algorithm.")
            epochlogger = EpochLogger()
            callbacks = [epochlogger]
        
        # Keyword arguments handed to the underlying Word2Vec model.
        w2v_params = {
            'workers': self.workers, 
            'window': self.window_size, 
            'callbacks': callbacks,
            'compute_loss': True
        }
        
        # Train Node2Vec model
        g2v = Node2Vec(
            n_components=self.dimensions,
            walklen=self.walk_len,
            epochs=self.walk_num,
            verbose=self.verbose,
            w2vparams=w2v_params
        )
        g2v.fit(self.G)
        self.model = g2v.model
        
        # Embedding per graph node; nodes missing from the model's vocabulary
        # get a zero vector.
        self.node_embeddings = {}
        for node in self.G.nodes():
            try:
                self.node_embeddings[node] = self.model.wv[node]
            except KeyError:
                self.node_embeddings[node] = np.zeros(self.dimensions)
        
        # Row i holds the embedding of the i-th transaction node.
        self.embeddings = np.zeros((len(transaction_nodes), self.dimensions))
        for i, txn_id in enumerate(transaction_nodes):
            self.embeddings[i] = self.node_embeddings[txn_id]
            
        # Keep a device-side copy when running on CUDA.
        if self.use_gpu and self.device.type == 'cuda':
            self.embeddings_tensor = torch.tensor(self.embeddings, device=self.device)
        
        self.is_fitted_ = True
        self.first_fit = True
        return self
    
    def transform(self, X):
        """
        Transform X.

        The first call after `fit` ignores `X` and returns the embeddings
        learned for the training transactions. Every later call passes `X`
        to `inductive_pooling`, which iterates over it and reads ``v[0]`` and
        ``v[1]`` of each item.
        
        Parameters
        ----------
        X : iterable
            Data to embed.
            NOTE(review): the original docstring describes this as a
            DataFrame of transaction features, while `inductive_pooling`
            indexes each item as a (sender, receiver) pair -- confirm the
            expected input format.
            
        Returns
        -------
        np.ndarray
            Transaction embeddings
        """
        check_is_fitted(self, 'is_fitted_')
        
        if self.first_fit:
            if self.verbose > 0:
                print("Retrieving embeddings for training data.")
            results = self.embeddings
            self.first_fit = False
        else:
            if self.verbose > 0:
                print("Running inductive pooling extension.")
            results = inductive_pooling(X, self.embeddings, self.G, workers=self.workers, 
                                       use_gpu=self.use_gpu, device=self.device)
        
        return results

In [7]:
from sklearn.pipeline import Pipeline
import xgboost as xgb
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Parameters
# The names mirror the InductiveDeepwalk constructor arguments defined above;
# their use is presumably in a later cell (not shown here) -- TODO confirm.
dimensions = 32   # number of embedding dimensions
walk_len = 80     # length of each random walk
walk_num = 10     # number of random walks per node
window_size = 5   # Word2Vec context window size
# the 'workers' parameter is used for multi-processing.
workers = 4

## Load Data

In [9]:
import pandas as pd
### PATH TO DEMO DATA ###
demo_data_path = '/kaggle/input/latlondata/output.csv'

# Read the raw transaction table.
df = pd.read_csv(demo_data_path)

# Parse the date-like columns; values that cannot be parsed become NaT.
date_columns = ['transactionDateTime', 'currentExpDate', 'accountOpenDate', 'dateOfLastAddressChange']
df[date_columns] = df[date_columns].apply(pd.to_datetime, errors='coerce')

In [10]:
# Drop transactions without a merchant latitude. This mutates `df` in place,
# so the surviving rows keep their original (now non-contiguous) index labels.
df.dropna(subset=['merchant_lat'], inplace=True)

In [11]:
# (rows, columns) remaining after the rows without a merchant latitude were dropped.
df.shape

(303005, 33)

In [12]:
import cudf
import cupy as cp
import pandas as pd
from cuml.metrics import pairwise_distances
import numpy as np
from datetime import datetime

# Convert pandas DataFrame to cuDF DataFrame
print("Converting DataFrame to GPU...")
try:
    # If df is already in memory as pandas DataFrame
    df_gpu = cudf.DataFrame.from_pandas(df)
except NameError:
    # If you're loading from file directly
    # df_gpu = cudf.read_csv('your_transaction_data.csv', parse_dates=['transactionDateTime', 'accountOpenDate', 'dateOfLastAddressChange'])
    print("Error: DataFrame 'df' not found")
    # Stop here: every later cell needs `df_gpu`, so continuing would only
    # move the failure further away from its cause.
    raise

Converting DataFrame to GPU...


In [None]:
# -------------- GPU-Accelerated Feature Engineering --------------
import cudf
import cupy as cp
import pandas as pd
from cuml.metrics import pairwise_distances
import numpy as np
from datetime import datetime


print("Engineering transaction velocity features...")
# Sort transactions by customer and datetime
# (rows keep their original index labels; only their order changes).
df_gpu = df_gpu.sort_values(['customerId', 'transactionDateTime'])

# Calculate time difference between consecutive transactions per customer.
# The first transaction of each customer has no predecessor, so both columns
# are null there.
# NOTE(review): `timeDelta` is expressed in HOURS, whereas create_network()
# compares differences of `timeDelta` against 86400 ("24 hours in seconds") --
# confirm which unit the graph construction expects.
df_gpu['prevTransactionTime'] = df_gpu.groupby('customerId')['transactionDateTime'].shift(1)
df_gpu['timeDelta'] = (df_gpu['transactionDateTime'] - df_gpu['prevTransactionTime']).dt.total_seconds() / 3600  # in hours

# Count transactions in timeframes using GPU-accelerated approach
def count_transactions_in_timeframe_gpu(df, hours):
    """
    Count, for every row, the same customer's earlier transactions in a window.

    For a row with customer ``c`` and time ``t`` the result is the number of
    rows with customer ``c`` whose ``transactionDateTime`` lies strictly
    between ``t - hours`` and ``t``. Rows with a missing customer id or
    timestamp get 0.

    Parameters
    ----------
    df : cudf.DataFrame
        Needs the columns ``customerId`` and ``transactionDateTime``.
    hours : int or float
        Window length in hours.

    Returns
    -------
    cudf.Series
        Integer counts carrying ``df.index``, so the values line up with the
        rows of `df` even when its index is not 0..n-1 (it is not after the
        dropna / sort_values steps of this notebook).
    """
    window = pd.Timedelta(hours=hours).to_timedelta64()

    # One transfer to host memory, then a sorted search per customer. This
    # replaces a full-frame boolean filter per row (quadratic, with a device
    # round trip for each scalar).
    host = df[['customerId', 'transactionDateTime']].to_pandas().reset_index(drop=True)
    counts = np.zeros(len(host), dtype=np.int64)

    # After reset_index the remaining index labels are row positions.
    host = host.dropna(subset=['customerId', 'transactionDateTime'])

    for _, times in host.groupby('customerId', sort=False)['transactionDateTime']:
        positions = times.index.to_numpy()
        stamps = times.to_numpy()

        order = np.argsort(stamps, kind='stable')
        stamps = stamps[order]

        # first index with stamp >  t - window  (strict lower bound)
        lower = np.searchsorted(stamps, stamps - window, side='right')
        # first index with stamp >= t           (strict upper bound)
        upper = np.searchsorted(stamps, stamps, side='left')

        # A non-positive window would give upper < lower; the window is empty then.
        counts[positions[order]] = np.maximum(upper - lower, 0)

    return cudf.Series(counts, index=df.index)

# Apply counting functions - note: for large datasets, this is still computationally intensive
# For production, consider a more optimized windowing approach
print("Counting transactions in timeframes (this may take some time)...")
# Number of the same customer's transactions in the 24 hours before each transaction.
df_gpu['txn_count_24h'] = count_transactions_in_timeframe_gpu(df_gpu, 24)
# Same count over the preceding 7 days.
df_gpu['txn_count_7d'] = count_transactions_in_timeframe_gpu(df_gpu, 168)  # 7*24=168

Engineering transaction velocity features...
Counting transactions in timeframes (this may take some time)...


In [None]:
# 2. Unusual Spending Spikes
print("Engineering spending pattern features...")
# Calculate average transaction amount per customer
# (transform broadcasts the per-customer statistic back onto every row)
customer_avg_amount = df_gpu.groupby('customerId')['transactionAmount'].transform('mean')
customer_std_amount = df_gpu.groupby('customerId')['transactionAmount'].transform('std')

# Calculate z-score of transaction amount
# A missing std is replaced by 1 so the z-score reduces to the raw deviation.
# NOTE(review): a std of exactly 0 (all amounts identical) is not covered by
# fillna and would divide by zero -- confirm whether that case needs handling.
df_gpu['amount_zscore'] = (df_gpu['transactionAmount'] - customer_avg_amount) / customer_std_amount.fillna(1)
# Calculate ratio of current transaction to average
# NOTE(review): likewise, a customer mean of exactly 0 divides by zero here.
df_gpu['amount_to_avg_ratio'] = df_gpu['transactionAmount'] / customer_avg_amount.fillna(1)

# Calculate cumulative amount spent in last 24 hours - similar approach as transaction count
def sum_amount_in_timeframe_gpu(df, hours):
    """
    Sum, for every row, the same customer's earlier spending in a window.

    For a row with customer ``c`` and time ``t`` the result is the sum of
    ``transactionAmount`` over rows with customer ``c`` whose
    ``transactionDateTime`` lies strictly between ``t - hours`` and ``t``.
    Missing amounts count as 0; rows with a missing customer id or timestamp
    get 0.0.

    Parameters
    ----------
    df : cudf.DataFrame
        Needs the columns ``customerId``, ``transactionDateTime`` and
        ``transactionAmount``.
    hours : int or float
        Window length in hours.

    Returns
    -------
    cudf.Series
        Float sums carrying ``df.index``, so the values line up with the rows
        of `df` even when its index is not 0..n-1.
    """
    window = pd.Timedelta(hours=hours).to_timedelta64()

    # One transfer to host memory, then prefix sums per customer. This
    # replaces a full-frame boolean filter per row (quadratic, with a device
    # round trip for each scalar).
    columns = ['customerId', 'transactionDateTime', 'transactionAmount']
    host = df[columns].to_pandas().reset_index(drop=True)
    totals = np.zeros(len(host), dtype=float)

    # After reset_index the remaining index labels are row positions.
    host = host.dropna(subset=['customerId', 'transactionDateTime'])

    for _, group in host.groupby('customerId', sort=False):
        positions = group.index.to_numpy()
        stamps = group['transactionDateTime'].to_numpy()
        amounts = group['transactionAmount'].to_numpy(dtype=float, na_value=np.nan)

        order = np.argsort(stamps, kind='stable')
        stamps = stamps[order]

        # running[k] is the sum of the first k amounts in time order (NaN as 0).
        running = np.concatenate(([0.0], np.nancumsum(amounts[order])))

        # first index with stamp >  t - window  (strict lower bound)
        lower = np.searchsorted(stamps, stamps - window, side='right')
        # first index with stamp >= t           (strict upper bound)
        upper = np.maximum(np.searchsorted(stamps, stamps, side='left'), lower)

        # Difference of prefix sums; may differ from a direct sum by
        # floating-point rounding only.
        totals[positions[order]] = running[upper] - running[lower]

    return cudf.Series(totals, index=df.index)

print("Calculating spending amounts in timeframes...")
# Total the same customer spent in the 24 hours before each transaction.
df_gpu['amount_24h'] = sum_amount_in_timeframe_gpu(df_gpu, 24)

In [None]:
# Geographic features: last known merchant location of the SAME customer
# before each transaction.
# Step 1 forward-fills missing coordinates within each customer; step 2
# shifts by one row within each customer. Both steps are grouped -- a plain
# .shift(1) on the whole column would hand each customer's first transaction
# the coordinates of the previous customer's last one.
df_gpu['prev_lat'] = df_gpu.groupby('customerId')['merchant_lat'].ffill()
df_gpu['prev_lat'] = df_gpu.groupby('customerId')['prev_lat'].shift(1)
df_gpu['prev_lon'] = df_gpu.groupby('customerId')['merchant_lon'].ffill()
df_gpu['prev_lon'] = df_gpu.groupby('customerId')['prev_lon'].shift(1)

In [None]:
def haversine_distance_gpu(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth specified in decimal degrees using GPU acceleration

    Parameters
    ----------
    lat1, lon1 : array-like or scalar
        Latitude / longitude of the first point(s), in decimal degrees.
    lat2, lon2 : array-like or scalar
        Latitude / longitude of the second point(s), in decimal degrees.

    Returns
    -------
    Distance(s) in kilometres, computed with an earth radius of 6371 km.
    """
    # Convert decimal degrees to radians 
    lat1, lon1, lat2, lon2 = map(cp.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = cp.sin(dlat/2)**2 + cp.cos(lat1) * cp.cos(lat2) * cp.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = cp.clip(a, 0.0, 1.0)
    c = 2 * cp.arcsin(cp.sqrt(a)) 
    r = 6371  # Radius of earth in kilometers
    return c * r

# Apply GPU-accelerated distance calculation.
# Rows without a complete (previous, current) coordinate pair keep distance 0.
distances = cp.zeros(len(df_gpu))

# Only rows where all four coordinates are present can be measured
mask = (~df_gpu['prev_lat'].isna() & ~df_gpu['prev_lon'].isna() & 
        ~df_gpu['merchant_lat'].isna() & ~df_gpu['merchant_lon'].isna())

if mask.any():
    # Convert the mask to a cupy array
    mask_array = mask.to_cupy()
    valid_indices = cp.where(mask_array)[0]
    
    # Convert Series to cupy arrays
    prev_lat = df_gpu['prev_lat'].iloc[valid_indices].to_cupy()
    prev_lon = df_gpu['prev_lon'].iloc[valid_indices].to_cupy()
    merchant_lat = df_gpu['merchant_lat'].iloc[valid_indices].to_cupy()
    merchant_lon = df_gpu['merchant_lon'].iloc[valid_indices].to_cupy()
    
    distances[valid_indices] = haversine_distance_gpu(
        prev_lat, prev_lon, merchant_lat, merchant_lon
    )

df_gpu['distance_from_prev_txn'] = distances

# Calculate speed - distance divided by time difference.
# NOTE(review): the column is named speed_kmph, which is only accurate if
# timeDelta is expressed in hours - confirm against where timeDelta is built.
df_gpu['timeDelta'] = df_gpu['timeDelta'].fillna(0)
time_delta_array = df_gpu['timeDelta'].to_cupy()
# Allocate from `distances` (float) rather than from the time deltas, so an
# integer timeDelta column cannot truncate the speeds to whole numbers.
speeds = cp.zeros_like(distances)
valid_time = time_delta_array > 0  # rows with a zero/negative gap keep speed 0
speeds[valid_time] = distances[valid_time] / time_delta_array[valid_time]
df_gpu['speed_kmph'] = speeds

# Calculate if transaction is in a different country from previous
df_gpu['prev_country'] = df_gpu.groupby('customerId')['merchantCountryCode'].shift(1)
# A customer's first transaction has no previous country; the comparison is
# then missing rather than True/False, so treat it explicitly as "no change".
df_gpu['different_country'] = (
    (df_gpu['merchantCountryCode'] != df_gpu['prev_country'])
    .fillna(False)
    .astype('int32')
)

In [None]:
# 4. Additional Features
print("Engineering additional features...")
# Binary flags
df_gpu['cvv_match'] = (df_gpu['cardCVV'] == df_gpu['enteredCVV']).astype('int32')
df_gpu['exp_date_match'] = df_gpu['expirationDateKeyInMatch'].astype('int32')
df_gpu['is_foreign_transaction'] = (df_gpu['acqCountry'] != df_gpu['merchantCountryCode']).astype('int32')

# Calculate the ratio of transaction amount to credit limit
# (a missing limit is replaced by 1, i.e. the ratio falls back to the raw amount)
df_gpu['amount_to_limit_ratio'] = df_gpu['transactionAmount'] / df_gpu['creditLimit'].fillna(1)

# Calculate the ratio of transaction amount to available money
# NOTE(review): a denominator of exactly 0 yields inf here - confirm whether
# availableMoney can be 0 and how downstream steps should treat that.
df_gpu['amount_to_available_ratio'] = df_gpu['transactionAmount'] / df_gpu['availableMoney'].fillna(1)

# Calculate days since account opening
df_gpu['account_age_days'] = (df_gpu['transactionDateTime'] - df_gpu['accountOpenDate']).dt.days

# Calculate days since last address change
df_gpu['days_since_address_change'] = (df_gpu['transactionDateTime'] - df_gpu['dateOfLastAddressChange']).dt.days

# Is online transaction
df_gpu['isOnline'] = (df_gpu['merchantCategoryCode'] == 'online_retail').astype('int32')

# One-hot encode categorical variables using GPU
print("One-hot encoding categorical variables...")
categorical_cols = ['posEntryMode', 'posConditionCode', 'merchantCategoryCode', 'transactionType']
for col in categorical_cols:
    # cudf.get_dummies does not implement drop_first; encode every level and
    # drop the first indicator column by hand to keep the same reference coding.
    dummies = cudf.get_dummies(df_gpu[col], prefix=col).iloc[:, 1:]
    # Only join indicators that are not present yet, so re-running this cell
    # does not try to add the same columns a second time.
    new_cols = [name for name in dummies.columns if name not in df_gpu.columns]
    if new_cols:
        df_gpu = df_gpu.join(dummies[new_cols])



In [None]:
# Inspect the column index after feature engineering (includes the joined one-hot columns)
df_gpu.columns

In [None]:
# Echo the list of columns that were one-hot encoded
categorical_cols

In [None]:

# Status message marking the end of the feature-engineering cells
print("GPU-accelerated feature engineering complete!")

In [None]:
# Reorder columns so the label `isFraud` is the last one; later cells split
# features and label purely by position.
df_gpu = df_gpu[
    [*(name for name in df_gpu.columns if name != 'isFraud'), 'isFraud']
]

In [None]:
# (rows, columns) of the engineered frame
df_gpu.shape

In [None]:
from sklearn.model_selection import train_test_split

# Positional hold-out: rows [0, 300000) form the training set, the rest the test set.
# NOTE(review): train_test_split is imported here but not used by this cell.
df_train, df_test = df_gpu.iloc[:300000], df_gpu.iloc[300000:]

In [None]:
# Features are every column but the last; the last column is the label
# (isFraud was moved to the end of the frame in an earlier cell).
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

In [None]:
# Status message only - this cell itself performs no conversion
print('Converted Training Data')

In [None]:
# %load /usr/local/lib/python3.10/dist-packages/nodevectors/node2vec.py
import numba
import numpy as np
import pandas as pd
import time
import warnings

# Gensim triggers automatic useless warnings for windows users...
warnings.simplefilter("ignore", category=UserWarning)
import gensim
warnings.simplefilter("default", category=UserWarning)


import csrgraph as cg
from nodevectors.embedders import BaseNodeEmbedder

class Node2Vec(BaseNodeEmbedder):
    """
    Node2Vec node embedder: random walks generated by csrgraph are fed as
    "sentences" to gensim's Word2Vec.

    This notebook-local definition passes the embedding size to gensim as
    ``vector_size`` (the gensim >= 4 keyword name).
    """
    def __init__(
        self, 
        n_components=32,
        walklen=30, 
        epochs=20,
        return_weight=1.,
        neighbor_weight=1.,
        threads=0, 
        keep_walks=False,
        verbose=True,
        w2vparams={"window":10, "negative":5, "iter":10,
                   "batch_words":128}):
        """
        Parameters
        ----------
        walklen : int
            length of the random walks
        epochs : int
            number of times to start a walk from each nodes
        threads : int
            number of threads to use. 0 is full use
        n_components : int
            number of resulting dimensions for the embedding
            This should be set here rather than in the w2vparams arguments
        return_weight : float in (0, inf]
            Weight on the probability of returning to node coming from
            Having this higher tends the walks to be 
            more like a Breadth-First Search.
            Having this very high  (> 2) makes search very local.
            Equal to the inverse of p in the Node2Vec paper.
        neighbor_weight : float in (0, inf]
            Weight on the probability of visiting a neighbor node
            to the one we're coming from in the random walk
            Having this higher tends the walks to be 
            more like a Depth-First Search.
            Having this very high makes search more outward.
            Having this very low makes search very local.
            Equal to the inverse of q in the Node2Vec paper.
        keep_walks : bool
            Whether to save the random walks in the model object after training
        verbose : bool
            Whether to print progress and timing information during fit
        w2vparams : dict
            dictionary of parameters to pass to gensim's word2vec
            Don't set the embedding dimensions through arguments here.
            The dict is stored as given and never modified. When fitting,
            a legacy ``iter`` key is forwarded to gensim as ``epochs`` and
            ``workers`` is always set from ``threads``.

        Raises
        ------
        ValueError
            If ``threads`` is not an int, or ``walklen`` / ``epochs`` is < 1.
        AttributeError
            If ``w2vparams`` tries to set the embedding size.
        """
        if type(threads) is not int:
            raise ValueError("Threads argument must be an int!")
        if walklen < 1 or epochs < 1:
            raise ValueError("Walklen and epochs arguments must be >= 1")
        self.n_components = n_components
        self.walklen = walklen
        self.epochs = epochs
        self.keep_walks = keep_walks
        # `size` is the gensim 3 name and `vector_size` the gensim 4 name;
        # either would clash with the n_components passed in fit().
        if 'size' in w2vparams or 'vector_size' in w2vparams:
            raise AttributeError("Embedding dimensions should not be set "
                + "through w2v parameters, but through n_components")
        # Kept untouched: writing `workers` into it here would also modify the
        # shared default dict (and any dict owned by the caller).
        self.w2vparams = w2vparams
        self.return_weight = return_weight
        self.neighbor_weight = neighbor_weight
        if threads == 0:
            threads = numba.config.NUMBA_DEFAULT_NUM_THREADS
        self.threads = threads
        self.verbose = verbose

    def _word2vec_kwargs(self):
        """Build the keyword arguments handed to gensim's Word2Vec in fit()."""
        kwargs = dict(self.w2vparams)
        # The default w2vparams still carries the gensim 3 key `iter`, which a
        # Word2Vec accepting `vector_size` knows as `epochs`.
        if 'iter' in kwargs:
            legacy_epochs = kwargs.pop('iter')
            kwargs.setdefault('epochs', legacy_epochs)
        kwargs['workers'] = self.threads
        return kwargs

    def _keyed_vectors(self):
        """
        Return the object that maps node names to vectors.

        After fit() ``self.model`` is a Word2Vec model exposing ``.wv``; after
        load_vectors() it already is the KeyedVectors object itself.
        """
        return getattr(self.model, 'wv', self.model)

    def fit(self, G):
        """
        Generate random walks on G and train Word2Vec on them.

        NOTE: Currently only support str or int as node name for graph
        Parameters
        ----------
        G : graph data
            Graph to embed
            Can be any graph type that's supported by csrgraph library
            (NetworkX, numpy 2d array, scipy CSR matrix, CSR matrix components)

        Raises
        ------
        ValueError
            If the first node name is neither an int nor a str.
        """
        if not isinstance(G, cg.csrgraph):
            G = cg.csrgraph(G, threads=self.threads)
        if G.threads != self.threads:
            G.set_threads(self.threads)
        node_names = G.names
        if type(node_names[0]) not in [int, str, np.int32, np.uint32, 
                                       np.int64, np.uint64]:
            raise ValueError("Graph node names must be int or str!")
        # Random walks
        walks_t = time.time()
        if self.verbose:
            print("Making walks...", end=" ")
        self.walks = G.random_walks(walklen=self.walklen, 
                                    epochs=self.epochs,
                                    return_weight=self.return_weight,
                                    neighbor_weight=self.neighbor_weight)
        if self.verbose:
            print(f"Done, T={time.time() - walks_t:.2f}")
            print("Mapping Walk Names...", end=" ")
        map_t = time.time()
        self.walks = pd.DataFrame(self.walks)
        # Map nodeId -> node name
        node_dict = dict(zip(np.arange(len(node_names)), node_names))
        for col in self.walks.columns:
            self.walks[col] = self.walks[col].map(node_dict).astype(str)
        # Somehow gensim only trains on this list iterator
        # it silently mistrains on array input
        self.walks = [list(x) for x in self.walks.itertuples(False, None)]
        if self.verbose:
            print(f"Done, T={time.time() - map_t:.2f}")
            print("Training W2V...", end=" ")
            if gensim.models.word2vec.FAST_VERSION < 1:
                print("WARNING: gensim word2vec version is unoptimized. "
                    "Try version 3.6 if on windows, versions 3.7 "
                    "and 3.8 have had issues")
        w2v_t = time.time()
        # Train gensim word2vec model on random walks
        self.model = gensim.models.Word2Vec(
            sentences=self.walks,
            vector_size=self.n_components,
            **self._word2vec_kwargs())
        if not self.keep_walks:
            del self.walks
        if self.verbose:
            print(f"Done, T={time.time() - w2v_t:.2f}")

    def fit_transform(self, G):
        """
        Fit on G and return one embedding row per node, in the order given
        by ``G.nodes()``.

        NOTE: Currently only support str or int as node name for graph
        Parameters
        ----------
        G : graph data
            Graph to embed
            Can be any graph type that's supported by csrgraph library
            (NetworkX, numpy 2d array, scipy CSR matrix, CSR matrix components)
        """
        if not isinstance(G, cg.csrgraph):
            G = cg.csrgraph(G, threads=self.threads)
        self.fit(G)
        # The walks were mapped to node *names* before training, so vectors
        # must be looked up by name; positional ids only coincide with the
        # names when the nodes happen to be labelled 0..n-1 in order.
        return np.vstack([self.predict(name) for name in G.nodes()])
    
    def predict(self, node_name):
        """
        Return vector associated with node
        node_name : str or int
            either the node ID or node name depending on graph format
        """
        # current hack to work around word2vec problem
        # ints need to be str -_-
        if type(node_name) is not str:
            node_name = str(node_name)
        return self._keyed_vectors()[node_name]

    def save_vectors(self, out_file):
        """
        Save as embeddings in gensim.models.KeyedVectors format
        """
        self._keyed_vectors().save_word2vec_format(out_file)

    def load_vectors(self, out_file):
        """
        Load embeddings from gensim.models.KeyedVectors format
        """
        self.model = gensim.models.KeyedVectors.load_word2vec_format(out_file)

In [None]:
import multiprocessing

# One worker per CPU core reported by the OS
workers = multiprocessing.cpu_count()
embedder = InductiveDeepwalk(
    dimensions=dimensions,
    walk_len=walk_len,
    walk_num=walk_num,
    workers=workers,
    verbose=0,
)

In [None]:
# Convert the training features with to_pandas(). The guard keeps the cell
# re-runnable: once converted, the object no longer offers to_pandas().
if hasattr(X_train, 'to_pandas'):
    X_train = X_train.to_pandas()

In [None]:
# Convert the training labels with to_pandas(). The guard keeps the cell
# re-runnable: once converted, the object no longer offers to_pandas().
if hasattr(y_train, 'to_pandas'):
    y_train = y_train.to_pandas()

In [None]:
# Fit the embedder on the training data. The value returned by fit() is kept
# as `embedobj`; later cells read its `.embeddings` attribute.
print('Starting embedding')
embedobj=embedder.fit(X_train, y_train)
print('ending embedding')

In [None]:
# Keyword arguments for the XGBoost classifier built in a later cell
params = dict(
    eval_metric=['auc', 'aucpr', 'logloss'],
    n_estimators=300,
    n_jobs=8,
    learning_rate=0.1,
    seed=42,
    colsample_bytree=0.6,
    colsample_bylevel=0.9,
    subsample=0.9,
)

In [None]:
# Persist the fitted embeddings to embeddings.npy in the working directory
np.save('embeddings.npy',embedobj.embeddings)

In [None]:
# Shape of the embedding matrix; worth comparing its row count with the 300000 used for slicing below
embedobj.embeddings.shape

In [None]:
# Build the XGBoost classifier from the `params` dict defined in an earlier cell
classifier = xgb.XGBClassifier(**params)

In [None]:
# Train on the first 300000 embedding rows against the first 300000 labels.
# NOTE(review): this assumes embedding row i belongs to training row i - confirm against InductiveDeepwalk.
classifier.fit(embedobj.embeddings[:300000],y_train[:300000])

In [None]:
# Score every embedding row from position 300000 onwards.
# NOTE(review): the embedder was fitted on X_train, which holds only the first 300000 rows of df_gpu,
# so it is unclear what these remaining rows represent - confirm they are the held-out transactions (df_test).
y_pred_proba = classifier.predict_proba(embedobj.embeddings[300000:])

In [None]:
# Keep the second probability column (class index 1)
preds=y_pred_proba[:, 1]

In [None]:
# Number of scored rows
preds.shape

## Evaluation

In [None]:
from sklearn.metrics import confusion_matrix

# `preds` holds the class-1 probabilities; threshold at 0.5 for hard 0/1 labels
y_pred = np.where(preds >= 0.5, 1, 0)

# Confusion matrix of true labels against thresholded predictions.
# NOTE(review): y_train was taken from the first 300000 rows of df_gpu, so
# y_train[300000:] looks empty here - confirm which labels (presumably the
# df_test ones) the predictions are meant to be scored against.
cm = confusion_matrix(y_true=y_train[300000:], y_pred=y_pred)

print(cm)