In [1]:
!pip install pandas kagglehub seaborn matplotlib scikit-learn torch lightly

Collecting kagglehub
  Using cached kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Collecting lightly
  Using cached lightly-1.5.20-py3-none-any.whl.metadata (37 kB)
Collecting hydra-core>=1.0.0 (from lightly)
  Using cached hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting lightly_utils~=0.0.0 (from lightly)
  Using cached lightly_utils-0.0.2-py3-none-any.whl.metadata (1.4 kB)
Collecting pytorch_lightning>=1.0.4 (from lightly)
  Using cached pytorch_lightning-2.5.1.post0-py3-none-any.whl.metadata (20 kB)
Collecting aenum>=3.1.11 (from lightly)
  Using cached aenum-3.1.16-py3-none-any.whl.metadata (3.8 kB)
Collecting omegaconf<2.4,>=2.2 (from hydra-core>=1.0.0->lightly)
  Using cached omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from hydra-core>=1.0.0->lightly)
  Using cached antlr4_python3_runtime-4.9.3-py3-none-any.whl
Collecting torchmetrics>=0.7.0 (from pytorch_lightning>=1.0.4->lightly)
  Using cached torchmetrics-1.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


from lightly.loss import NTXentLoss


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import time
import gc 

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}" )

Using device: cuda


In [4]:
N_FEATURES = 35 #166  # Time step + 165 features
EMBEDDING_DIM = 64
ENCODER_EMBEDDING_DIM = 64
PROJECTION_DIM = 16
BATCH_SIZE = 128
EPOCHS = 1
LEARNING_RATE = 1e-3 
TEMPERATURE = 0.1

AUG_NOISE_LEVEL = 0.03
AUG_MASK_FRACTION = 0.15

In [5]:
dataset = pd.read_csv("LI-Small_Trans.csv")

In [6]:
print(dataset.head)

<bound method NDFrame.head of                 Timestamp  From Bank    Account  To Bank  Account.1  \
0        2022/09/01 00:08         11  8000ECA90       11  8000ECA90   
1        2022/09/01 00:21       3402  80021DAD0     3402  80021DAD0   
2        2022/09/01 00:00         11  8000ECA90     1120  8006AA910   
3        2022/09/01 00:16       3814  8006AD080     3814  8006AD080   
4        2022/09/01 00:00         20  8006AD530       20  8006AD530   
...                   ...        ...        ...      ...        ...   
6924044  2022/09/10 23:39      71696  81B2518F1    71528  81C0482E1   
6924045  2022/09/10 23:48     271241  81B567481   173457  81C0DA751   
6924046  2022/09/10 23:50     271241  81B567481   173457  81C0DA751   
6924047  2022/09/10 23:57     170558  81A2206B1   275798  81C1D5CA1   
6924048  2022/09/10 23:31     170558  81A2206B1   275798  81C1D5CA1   

         Amount Received Receiving Currency   Amount Paid Payment Currency  \
0           3.195403e+06          US Do

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_for_ssl(df_input):
    """
    Prepares the IBM AML transaction data for self-supervised learning.
    Converts all features to numerical format and scales them.
    The 'Is Laundering' column is dropped from the features.
    """
    print("Starting preprocessing...")
    df = df_input.copy()

    if 'Is Laundering' in df.columns:
        print("Dropping 'Is Laundering' column for SSL feature preparation.")
        df = df.drop('Is Laundering', axis=1)

    # 1. Timestamp Processing
    print("Processing Timestamp...")
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce') # Coerce errors
    df.dropna(subset=['Timestamp'], inplace=True) # Drop rows where timestamp couldn't be parsed

    df['Time_Hour'] = df['Timestamp'].dt.hour
    df['Time_Minute'] = df['Timestamp'].dt.minute
    df['Time_DayOfWeek'] = df['Timestamp'].dt.dayofweek
    df['Time_DayOfMonth'] = df['Timestamp'].dt.day
    df['Time_Month'] = df['Timestamp'].dt.month

    df['Hour_sin'] = np.sin(2 * np.pi * df['Time_Hour'] / 24.0)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Time_Hour'] / 24.0)
    df['Minute_sin'] = np.sin(2 * np.pi * df['Time_Minute'] / 60.0)
    df['Minute_cos'] = np.cos(2 * np.pi * df['Time_Minute'] / 60.0)
    df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['Time_DayOfWeek'] / 7.0)
    df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['Time_DayOfWeek'] / 7.0)
    df['Month_sin'] = np.sin(2 * np.pi * df['Time_Month'] / 12.0)
    df['Month_cos'] = np.cos(2 * np.pi * df['Time_Month'] / 12.0)
    df = df.drop(['Timestamp', 'Time_Hour', 'Time_Minute', 'Time_DayOfWeek', 'Time_DayOfMonth', 'Time_Month'], axis=1)

    # 2. Account Number Processing
    print("Processing Account Numbers...")
    def hex_to_int_safe(hex_str):
        try: return int(str(hex_str), 16)
        except: return -1 # Use a numeric placeholder
    df['Account_Num'] = df['Account'].apply(hex_to_int_safe)
    df['Account.1_Num'] = df['Account.1'].apply(hex_to_int_safe)
    df = df.drop(['Account', 'Account.1'], axis=1)

    # 3. Amount Processing
    print("Processing Amounts...")
    amount_col_to_use = 'Amount Received'
    currency_col_to_use = 'Receiving Currency'

    # Ensure amount columns are numeric, coercing errors
    df['Amount Received'] = pd.to_numeric(df['Amount Received'], errors='coerce')
    df['Amount Paid'] = pd.to_numeric(df['Amount Paid'], errors='coerce')
    df.dropna(subset=['Amount Received', 'Amount Paid'], inplace=True) # Drop rows with non-numeric amounts

    if (df['Amount Received'] == df['Amount Paid']).all() and \
       (df['Receiving Currency'] == df['Payment Currency']).all():
        df = df.drop(['Amount Paid', 'Payment Currency'], axis=1)
    else:
        print("Warning: Amount/Currency pairs are not always identical. Dropping 'Amount Paid' & 'Payment Currency'.")
        if 'Amount Paid' in df.columns: df = df.drop('Amount Paid', axis=1)
        if 'Payment Currency' in df.columns: df = df.drop('Payment Currency', axis=1)

    df['Amount_Log'] = np.log1p(df[amount_col_to_use])
    df = df.drop([amount_col_to_use], axis=1)

    # 4. Categorical String Features: Currency and Payment Format
    print("Processing Categorical String Features (Currency, Payment Format)...")
    categorical_to_encode = [currency_col_to_use, 'Payment Format']
    # Ensure these columns are strings before get_dummies
    for col in categorical_to_encode:
        if col in df.columns:
            df[col] = df[col].astype(str) # Convert to string to handle mixed types or NaNs gracefully

    df = pd.get_dummies(df, columns=categorical_to_encode, prefix=['Currency', 'Format'], dummy_na=False, dtype=float) # Use float for dummies

    # --- Intermediate Check: Ensure all columns are numeric before scaling ---
    print("\nData types before final scaling:")
    print(df.dtypes)

    # Identify any remaining object columns
    object_cols = df.select_dtypes(include='object').columns
    if len(object_cols) > 0:
        print(f"Warning: Found object columns after get_dummies: {object_cols.tolist()}")
        print("Attempting to convert them to numeric, or dropping them if conversion fails.")
        for col in object_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce') # Try to convert
        df.dropna(inplace=True) # Drop rows where conversion created NaNs if critical

    # Re-check after attempted conversion
    final_object_cols = df.select_dtypes(include='object').columns
    if len(final_object_cols) > 0:
        print(f"ERROR: Still have object columns: {final_object_cols.tolist()}. These will cause issues.")
        print("Problematic columns' unique values (first few):")
        for col in final_object_cols:
            print(f"Column '{col}': {df[col].unique()[:5]}")
        # Decide how to handle: drop them, or fix their conversion
        print(f"Dropping problematic object columns: {final_object_cols.tolist()}")
        df = df.drop(columns=final_object_cols)


    # 5. Scaling all numerical features
    print("\nScaling all numerical features...")
    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

    if not numerical_cols:
        print("Error: No numerical columns found to scale. Check preprocessing steps.")
        return np.array([]) # Or raise an error

    print(f"Columns to be scaled: {numerical_cols}")
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    print(f"\nPreprocessing finished. Final feature shape: {df.shape}")
    print("Final data types after all processing:")
    print(df.dtypes)
    print("Sample of processed data (head):")
    print(df.head())

    return df.values.astype(np.float32) # Explicitly cast to float32

In [8]:
class YourAugmentedDataset(Dataset):
    def __init__(self, features_np_array, noise_level=0.05, mask_fraction=0.15):
        if features_np_array.dtype != np.float32: features_np_array = features_np_array.astype(np.float32)
        self.features = torch.tensor(features_np_array, dtype=torch.float32)
        self.noise_level = noise_level; self.mask_fraction = mask_fraction
        self.n_features_dim = features_np_array.shape[1]
    def __len__(self): return len(self.features)
    def _augment(self, x_original):
        x = x_original.clone()
        if self.noise_level > 0: x += torch.randn_like(x) * self.noise_level
        if self.mask_fraction > 0: x *= (torch.rand(self.n_features_dim, device=x.device) > self.mask_fraction).float()
        return x
    def __getitem__(self, idx):
        original_x = self.features[idx]; view1 = self._augment(original_x); view2 = self._augment(original_x)
        return view1, view2

In [9]:
class SimpleDataset(Dataset):
    def __init__(self, features_np_array):
        if features_np_array.dtype != np.float32: features_np_array = features_np_array.astype(np.float32)
        self.features = torch.tensor(features_np_array, dtype=torch.float32)
    def __len__(self): return len(self.features)
    def __getitem__(self, idx): return self.features[idx]

In [None]:
df_raw = pd.read_csv("LI-Small_Trans.csv")

X_processed_features_np = preprocess_for_ssl(df_raw)

# Check the dtype of the resulting NumPy array
if X_processed_features_np.size > 0: 
    print(f"\nDtype of X_processed_features_np: {X_processed_features_np.dtype}")
    if X_processed_features_np.dtype == np.object_:
        print("ERROR: X_processed_features_np still has object dtype after preprocessing!")
    else:
        print("SUCCESS: X_processed_features_np is now a numerical type.")
else:
    print("Warning: Preprocessing resulted in an empty array. Check for excessive NaN dropping.")

# Now try creating your dataset again
if X_processed_features_np.size > 0 and X_processed_features_np.dtype != np.object_:
    # Make sure YourAugmentedDataset and torch are defined
    import torch
    from torch.utils.data import Dataset # Assuming YourAugmentedDataset inherits from this

    # (Paste your YourAugmentedDataset class definition here if not already defined)
    class YourAugmentedDataset(Dataset):
        def __init__(self, features_np_array, noise_level=0.05, mask_fraction=0.15):
            # Ensure features_np_array is float32 before converting to tensor
            if features_np_array.dtype != np.float32:
                print(f"Warning: features_np_array dtype is {features_np_array.dtype}, converting to np.float32.")
                features_np_array = features_np_array.astype(np.float32)
            
            self.features = torch.tensor(features_np_array, dtype=torch.float32)
            self.noise_level = noise_level
            self.mask_fraction = mask_fraction
            self.n_samples, self.n_features_dim = features_np_array.shape
            print(f"Augmented Dataset: {self.n_samples} samples, {self.n_features_dim} features.")
            print(f"Augmentation Config: Noise Level={self.noise_level}, Mask Fraction={self.mask_fraction}")

        def __len__(self):
            return len(self.features)

        def _augment(self, x_original):
            x = x_original.clone()
            if self.noise_level > 0:
                noise = torch.randn_like(x) * self.noise_level
                x = x + noise
            if self.mask_fraction > 0:
                mask = (torch.rand(self.n_features_dim, device=x.device) > self.mask_fraction).float()
                x = x * mask
            return x

        def __getitem__(self, idx):
            original_x = self.features[idx]
            view1 = self._augment(original_x)
            view2 = self._augment(original_x)
            return view1, view2

    augmented_dataset = YourAugmentedDataset(
        features_np_array=X_processed_features_np,
        noise_level=0.05,
        mask_fraction=0.15
    )
    print("Dataset created successfully.")
else:
    print("Skipping dataset creation due to preprocessing issues or empty array.")

Starting preprocessing...
Dropping 'Is Laundering' column for SSL feature preparation.
Processing Timestamp...
Processing Account Numbers...
Processing Amounts...
Processing Categorical String Features (Currency, Payment Format)...

Data types before final scaling:
From Bank                       int64
To Bank                         int64
Hour_sin                      float64
Hour_cos                      float64
Minute_sin                    float64
Minute_cos                    float64
DayOfWeek_sin                 float64
DayOfWeek_cos                 float64
Month_sin                     float64
Month_cos                     float64
Account_Num                     int64
Account.1_Num                   int64
Amount_Log                    float64
Currency_Australian Dollar    float64
Currency_Bitcoin              float64
Currency_Brazil Real          float64
Currency_Canadian Dollar      float64
Currency_Euro                 float64
Currency_Mexican Peso         float64
Currency_Rub

In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import math # Keep for now, though not strictly needed for length calculation without pooling

# class CNNGRUEncoder(nn.Module):
#     def __init__(self, input_dim, cnn_channels1=32, cnn_channels2=64, cnn_channels3=128, cnn_channels4=256,
#                  kernel_size=9, # pool_kernel is no longer used
#                  gru_hidden_size=128, gru_layers=4, bidirectional=True,
#                  embedding_dim=64):
#         super(CNNGRUEncoder, self).__init__()
#         self.input_dim = input_dim # This is N_FEATURES_PROC

#         # CNN Layers (No Pooling)
#         self.conv1 = nn.Conv1d(1, cnn_channels1, kernel_size, padding='same')
#         self.relu1 = nn.ReLU()
#         # self.pool1 = nn.MaxPool1d(pool_kernel) # Commented out

#         self.conv2 = nn.Conv1d(cnn_channels1, cnn_channels2, kernel_size, padding='same')
#         self.relu2 = nn.ReLU()
#         # self.pool2 = nn.MaxPool1d(pool_kernel) # Commented out

#         self.conv3 = nn.Conv1d(cnn_channels2, cnn_channels3, kernel_size, padding='same')
#         self.relu3 = nn.ReLU()
#         # self.pool3 = nn.MaxPool1d(pool_kernel) # Commented out

#         self.conv4 = nn.Conv1d(cnn_channels3, cnn_channels4, kernel_size, padding='same')
#         self.relu4 = nn.ReLU()
#         # self.pool4 = nn.MaxPool1d(pool_kernel) # Assuming you might have had a 4th pool, also commented

#         # Since padding='same' and stride=1 (default for Conv1d) and no pooling,
#         # the sequence length remains self.input_dim throughout the CNNs.
#         self.cnn_output_length = self.input_dim
#         self.cnn_output_channels = cnn_channels4 # Channels from the last conv layer

#         print(f"CNN Configuration: Input Seq Length = {self.input_dim}")
#         print(f"CNN Output (to GRU): Sequence Length = {self.cnn_output_length}, Features per step (Channels) = {self.cnn_output_channels}")

#         # GRU Layer
#         # Input to GRU: (batch_size, sequence_length=self.cnn_output_length, features_per_step=self.cnn_output_channels)
#         self.gru = nn.GRU(
#             input_size=self.cnn_output_channels, # Features per time step from CNN
#             hidden_size=gru_hidden_size,
#             num_layers=gru_layers,
#             batch_first=True,
#             bidirectional=bidirectional
#         )
        
#         # Output FC Layer
#         gru_output_dim_for_fc = gru_hidden_size * (2 if bidirectional else 1)
#         self.fc_out = nn.Linear(gru_output_dim_for_fc, embedding_dim)
        
#     def forward(self, x):
#         # Input x: (batch_size, N_FEATURES_PROC)
#         # Reshape for Conv1d: (batch_size, in_channels=1, sequence_length=N_FEATURES_PROC)
#         x = x.unsqueeze(1) # x is now (batch_size, 1, self.input_dim)

#         # Pass through CNN layers
#         x = self.relu1(self.conv1(x))
#         x = self.relu2(self.conv2(x))
#         x = self.relu3(self.conv3(x))
#         x = self.relu4(self.conv4(x))
#         # x is now (batch_size, self.cnn_output_channels, self.input_dim)

#         # Prepare for GRU: (batch_size, sequence_length, features_per_step)
#         # Current x: (batch_size, channels, sequence_length_after_cnn)
#         # Need to permute to: (batch_size, sequence_length_after_cnn, channels)
#         x = x.permute(0, 2, 1) # x is now (batch_size, self.input_dim, self.cnn_output_channels)

#         # Pass through GRU
#         # self.gru.flatten_parameters() # Good practice if using DataParallel/DDP, or if you see warnings
#         _, h_n = self.gru(x) # out_gru shape: (batch, seq_len, num_directions * hidden_size)
#                              # h_n shape: (num_layers * num_directions, batch, hidden_size)
        
#         # Extract the last hidden state (or combined last hidden states for bidirectional)
#         if self.gru.bidirectional:
#             # h_n is (num_layers*2, batch, hidden_size)
#             # Last forward is h_n[-2,:,:] and last backward is h_n[-1,:,:]
#             # These are from the *last layer* of the GRU stack.
#             gru_out_for_fc = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
#         else:
#             # h_n is (num_layers, batch, hidden_size)
#             # Last hidden state from the last layer.
#             gru_out_for_fc = h_n[-1,:,:]
        
#         # Pass through output FC layer
#         embedding = self.fc_out(gru_out_for_fc) # gru_out_for_fc is (batch_size, fc_in_features)
        
#         # Normalize the final embedding (L2 normalization)
#         embedding = F.normalize(embedding, p=2, dim=1)
        
#         return embedding

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransactionFeatureCNN(nn.Module):
    """
    CNN to process features of a SINGLE transaction and produce an embedding for it.
    This is the "inner" CNN.
    """
    def __init__(self, num_input_features_per_transaction, # N_FEATURES_PROC
                 cnn_channels=[32, 64, 128],       # Channels for CNN layers
                 kernel_sizes=[5, 5, 3],           # Kernel sizes
                 output_embedding_dim=128):        # Embedding size for one transaction
        super().__init__()
        
        self.num_input_features = num_input_features_per_transaction
        cnn_layers = []
        in_channels = 1 # We process features of one transaction as a 1-channel sequence

        current_seq_len = num_input_features_per_transaction
        for i, out_c in enumerate(cnn_channels):
            cnn_layers.append(nn.Conv1d(in_channels, out_c, kernel_sizes[i], padding='same'))
            cnn_layers.append(nn.ReLU())
            # Optional: Pooling if num_input_features_per_transaction is large
            # For example, if current_seq_len > kernel_sizes[i] * 2:
            #     pool_k = 2
            #     cnn_layers.append(nn.MaxPool1d(pool_k))
            #     current_seq_len //= pool_k
            in_channels = out_c
            
        self.cnn_block = nn.Sequential(*cnn_layers)
        
        # Calculate the flattened size after CNNs (if no pooling, it's cnn_channels[-1] * num_input_features)
        # If pooling is added, this needs to be calculated based on pooling effects
        # For simplicity, assume global average pooling to get fixed size
        self.adaptive_pool = nn.AdaptiveAvgPool1d(1) # Output (batch, cnn_channels[-1], 1)
        self.fc_out_cnn = nn.Linear(cnn_channels[-1], output_embedding_dim)

        print(f"TransactionFeatureCNN: InputFeat={num_input_features_per_transaction}, OutputEmb={output_embedding_dim}")

    def forward(self, x_single_transaction_features):
        # Input x_single_transaction_features: (batch_size_of_transactions, num_input_features_per_transaction)
        
        # Reshape for Conv1d: (batch_size_of_transactions, 1, num_input_features_per_transaction)
        x = x_single_transaction_features.unsqueeze(1) 
        
        x = self.cnn_block(x) # (batch_size_of_transactions, cnn_channels[-1], num_input_features_or_reduced)
        x = self.adaptive_pool(x) # (batch_size_of_transactions, cnn_channels[-1], 1)
        x = x.squeeze(-1) # (batch_size_of_transactions, cnn_channels[-1])
        
        transaction_embedding = self.fc_out_cnn(x)
        return transaction_embedding


class TransactionSequenceEncoder_CNNthenGRU(nn.Module):
    def __init__(self, 
                 num_features_per_transaction, # N_FEATURES_PROC
                 # Inner CNN params
                 cnn_internal_channels=[32, 64],
                 cnn_internal_kernel_sizes=[5, 3],
                 transaction_embedding_dim=128, # Output of inner CNN, input to GRU
                 # GRU params
                 gru_hidden_size=256, 
                 gru_layers=2, 
                 gru_bidirectional=True,
                 # Final embedding for the whole sequence
                 final_sequence_embedding_dim=64):
        super().__init__()

        self.transaction_cnn_embedder = TransactionFeatureCNN(
            num_input_features_per_transaction=num_features_per_transaction,
            cnn_channels=cnn_internal_channels,
            kernel_sizes=cnn_internal_kernel_sizes,
            output_embedding_dim=transaction_embedding_dim
        )

        self.gru = nn.GRU(
            input_size=transaction_embedding_dim, # Takes embeddings of individual transactions
            hidden_size=gru_hidden_size,
            num_layers=gru_layers,
            batch_first=True, # Expects (batch, seq_len, features)
            bidirectional=gru_bidirectional
        )
        
        gru_output_dim_for_fc = gru_hidden_size * (2 if gru_bidirectional else 1)
        self.fc_out_sequence = nn.Linear(gru_output_dim_for_fc, final_sequence_embedding_dim)

        print(f"TransactionSequenceEncoder_CNNthenGRU: FinalEmb={final_sequence_embedding_dim}")
        
    def forward(self, x_batch_of_sequences):
        # Input x_batch_of_sequences: (batch_size, sequence_length=10, num_features_per_transaction)
        
        batch_size, seq_len, num_features = x_batch_of_sequences.shape
        
        # To process each transaction in the sequence with the CNN:
        # 1. Reshape to treat all transactions across all sequences in the batch as one big batch for the CNN
        #    (batch_size * seq_len, num_features_per_transaction)
        x_flat_transactions = x_batch_of_sequences.reshape(batch_size * seq_len, num_features)
        
        # 2. Get embeddings for all transactions
        #    Output shape: (batch_size * seq_len, transaction_embedding_dim)
        transaction_embeddings_flat = self.transaction_cnn_embedder(x_flat_transactions)
        
        # 3. Reshape back to sequence format for the GRU
        #    Output shape: (batch_size, seq_len, transaction_embedding_dim)
        transaction_embeddings_sequence = transaction_embeddings_flat.reshape(batch_size, seq_len, -1)
        
        # 4. Pass sequence of transaction embeddings through GRU
        # self.gru.flatten_parameters()
        _, h_n = self.gru(transaction_embeddings_sequence)
        # h_n shape: (num_gru_layers * num_directions, batch_size, gru_hidden_size)
        
        if self.gru.bidirectional:
            # Concatenate the last hidden states of the last GRU layer
            gru_out_for_fc = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        else:
            gru_out_for_fc = h_n[-1,:,:]
        # gru_out_for_fc shape: (batch_size, gru_hidden_size * num_directions)
            
        sequence_embedding = self.fc_out_sequence(gru_out_for_fc)
        
        # Optional: Normalize final sequence embedding
        sequence_embedding = F.normalize(sequence_embedding, p=2, dim=1)
        
        return sequence_embedding

In [None]:
class ProjectionHead(nn.Module):
    def __init__(self, input_dim=ENCODER_EMBEDDING_DIM, hidden_dim=ENCODER_EMBEDDING_DIM, output_dim=PROJECTION_DIM):
        super(ProjectionHead, self).__init__()
        # Simple 2-layer MLP as projection head
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        print(f"Projection Head: Input={input_dim}, Hidden={hidden_dim}, Output={output_dim}")

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        # Note: NTXentLoss often includes normalization internally, or you can add it here
        # x = F.normalize(x, p=2, dim=1) # Optional normalization here
        return x

In [None]:
# Instantiate encoder
encoder = TransactionSequenceEncoder_CNNthenGRU(
    num_features_per_transaction=N_FEATURES_PROC,
    cnn_internal_channels=[32, 64],       # Example
    cnn_internal_kernel_sizes=[5, 3],     # Example
    transaction_embedding_dim=128,        # Example: output of CNN per transaction
    gru_hidden_size=256,                  # Example
    gru_layers=2,                         # Example
    final_sequence_embedding_dim=ENCODER_EMBEDDING_DIM # Your desired final SSL embedding dim
).to(DEVICE)
# encoder = CNNGRUEncoder(input_dim=N_FEATURES, embedding_dim=EMBEDDING_DIM).to(DEVICE)
encoder = torch.compile(encoder) # Compile the model after creating it
optimizer = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
print("\nCNN-GRU Encoder Architecture:")
print(encoder)

In [None]:
def contrastive_loss(z, temperature):
    """
    Optimized implementation of NT-Xent loss using logsumexp.
    Assumes z = torch.cat([view1, view2], dim=0)
    where view1 and view2 have shape (B, D) and B is the batch size.
    """
    n = z.shape[0] # Shape is (2*B, D)
    if n < 2:
        # Handle edge case where batch size is too small after drop_last
        return torch.tensor(0.0, device=z.device, requires_grad=True)
    batch_size = n // 2 # The original batch size B

    # Calculate cosine similarity matrix (2B x 2B)
    # Normalize features first is equivalent to cosine similarity for matrix mult
    z_norm = F.normalize(z, p=2, dim=1)
    sim_matrix = torch.mm(z_norm, z_norm.t()) # (2B, D) @ (D, 2B) -> (2B, 2B)
    # Or using the function directly:
    #sim_matrix = F.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0), dim=2)


    # Scale similarities by temperature
    logits = sim_matrix / temperature

    # --- Identify positive pairs ---
    # Create labels identifying samples across the two views
    labels = torch.arange(batch_size).to(z.device) # 0 to B-1
    # Create a mask for positive pairs: (i, i+B) and (i+B, i)
    mask_pos = torch.zeros_like(logits, dtype=torch.bool)
    mask_pos[torch.arange(batch_size), torch.arange(batch_size) + batch_size] = True
    mask_pos[torch.arange(batch_size) + batch_size, torch.arange(batch_size)] = True

    # Extract the logits corresponding to positive pairs
    # These are the sim(z1_k, z2_k)/T and sim(z2_k, z1_k)/T terms
    positives = logits[mask_pos].view(n, 1) # Shape: (2B, 1)

    # --- Calculate LogSumExp over negatives ---
    # Mask out self-similarity (diagonal) for the logsumexp calculation
    mask_self = torch.eye(n, dtype=torch.bool).to(z.device)
    logits_masked = logits.masked_fill(mask_self, -float('inf')) # Exclude sim(i,i)

    # Calculate logsumexp across all other samples (negatives + the other positive)
    logsumexp_all = torch.logsumexp(logits_masked, dim=1, keepdim=True) # Shape: (2B, 1)

    # --- Calculate final loss ---
    # loss = log(sum(exp(negatives))) - positive_similarity
    loss_per_sample = logsumexp_all - positives

    # Average loss over all 2B samples (both views)
    loss = loss_per_sample.mean()

    return loss

In [None]:
print(f"Creating augmented dataset with {X_processed_features_np.shape[0]} samples and {N_FEATURES} features...")
augmented_dataset = YourAugmentedDataset(
    features_np_array=X_processed_features_np,
    # Add your augmentation parameters here if they are not defaults
    # noise_level=0.05,
    # mask_fraction=0.15
)

# Create the DataLoader for training
# num_workers > 0 can speed up data loading but might cause issues on some systems (e.g. Windows, Jupyter)
# Start with num_workers=0 if you encounter problems.
# pin_memory=True can speed up CPU to GPU data transfer if using CUDA.
dataloader = DataLoader(
    augmented_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2, # Adjust based on your system and if you see speedups/issues
    pin_memory=(DEVICE.type == 'cuda'),
    drop_last=True # Important for some contrastive losses if batch size consistency matters
                  # Set to False if you want to process all samples even if the last batch is smaller
)
print(f"Training DataLoader created: Batch Size={BATCH_SIZE}, Shuffle=True, Drop Last=True")


# Create the Dataset and DataLoader for evaluation/final embedding generation
# This uses a SimpleDataset that does not apply augmentations.
eval_dataset = SimpleDataset(X_processed_features_np)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=BATCH_SIZE, # Often can use a larger batch size for inference
    shuffle=False,
    num_workers=2,
    pin_memory=(DEVICE.type == 'cuda')
)
print(f"Evaluation DataLoader created: Batch Size={BATCH_SIZE * 2}, Shuffle=False")


In [None]:
# --- Model Initialization, Optimizer, Loss ---
print(f"Initializing models on {DEVICE}...")
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE.type == 'cuda'))

# Instantiate BOTH models
# Ensure N_FEATURES here is the number of features your preprocessed data has
encoder = TransactionSequenceEncoder_CNNthenGRU(
    num_features_per_transaction=N_FEATURES_PROC,
    cnn_internal_channels=[32, 64],       # Example
    cnn_internal_kernel_sizes=[5, 3],     # Example
    transaction_embedding_dim=128,        # Example: output of CNN per transaction
    gru_hidden_size=256,                  # Example
    gru_layers=2,                         # Example
    final_sequence_embedding_dim=ENCODER_EMBEDDING_DIM # Your desired final SSL embedding dim
).to(DEVICE)

# encoder = CNNGRUEncoder(input_dim=N_FEATURES, embedding_dim=ENCODER_EMBEDDING_DIM).to(DEVICE)
projection_head = ProjectionHead(input_dim=ENCODER_EMBEDDING_DIM, output_dim=PROJECTION_DIM).to(DEVICE)

# # Compile if desired (PyTorch 2.0+)
if DEVICE.type == 'cuda' and hasattr(torch, 'compile'):
    print("Attempting to compile models with torch.compile()...")
    try:
        encoder = torch.compile(encoder)
        projection_head = torch.compile(projection_head)
        print("Models compiled successfully.")
    except Exception as e:
        print(f"torch.compile failed: {e}. Proceeding without compilation.")
else:
    print("torch.compile not used (either not CUDA or torch version < 2.0).")


optimizer = optim.Adam(
    list(encoder.parameters()) + list(projection_head.parameters()), # Combine parameters
    lr=LEARNING_RATE,
    weight_decay=1e-6 # Example weight decay
)

# Use the library loss function (operates on projection head output)
criterion = NTXentLoss(temperature=TEMPERATURE, memory_bank_size=0).to(DEVICE) # Ensure loss is on device if it has params

print("\n--- CNN-GRU Encoder Architecture ---")
print(encoder)
print("\n--- Projection Head Architecture ---")
print(projection_head)
print(f"\nOptimizer: Adam, LR: {LEARNING_RATE}, Weight Decay: {1e-6}")
print(f"Loss Criterion: NTXentLoss, Temperature: {TEMPERATURE}")
print(f"Training for {EPOCHS} epochs. Batch size: {dataloader.batch_size if dataloader else 'N/A'}")
print(f"Using Mixed Precision (AMP): {scaler.is_enabled()}")


# --- Training Loop ---
print("\n--- Starting Unsupervised Training ---")
training_losses = [] # Renamed from 'losses' to avoid conflict if 'loss' is used later
gradient_check_interval = 600 # Batches between grad checks, set to 0 or large to disable
print_interval = 5000 # Batches between progress prints

for epoch in range(EPOCHS):
    encoder.train()
    projection_head.train()

    epoch_loss_sum = 0.0
    batches_processed_in_epoch = 0
    
    epoch_data_time = 0.0
    epoch_forward_time = 0.0
    epoch_loss_calc_time = 0.0
    epoch_backward_time = 0.0
    epoch_optimizer_step_time = 0.0
    epoch_grad_check_time = 0.0 # For the optional gradient check

    epoch_overall_start_time = time.time()
    batch_loop_start_time = time.time() # For data loading time of the first batch

    for batch_idx, (view1, view2) in enumerate(dataloader):
        batch_data_end_time = time.time()
        epoch_data_time += (batch_data_end_time - batch_loop_start_time)

        # Ensure batch is large enough for contrastive loss (NT-Xent often needs at least 2 distinct samples)
        if view1.shape[0] < 2:
            print(f"Skipping batch {batch_idx+1} in epoch {epoch+1} due to insufficient size: {view1.shape[0]}")
            batch_loop_start_time = time.time() # Reset for next data load time
            continue

        view1, view2 = view1.to(DEVICE), view2.to(DEVICE)

        # --- Forward Pass & Loss Calculation within Autocast ---
        forward_pass_start_time = time.time()
        with torch.amp.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=(DEVICE.type == 'cuda')):
            z1_enc = encoder(view1)
            z2_enc = encoder(view2)
            p1 = projection_head(z1_enc)
            p2 = projection_head(z2_enc)
            
            loss_calc_start_time = time.time()
            current_batch_loss = criterion(p1, p2)
            loss_calc_end_time = time.time()
        forward_pass_end_time = loss_calc_start_time # Forward pass ends before loss calc starts
        
        epoch_forward_time += (forward_pass_end_time - forward_pass_start_time)
        epoch_loss_calc_time += (loss_calc_end_time - loss_calc_start_time)

        # --- Backpropagation & Optimizer Step ---
        optimizer.zero_grad(set_to_none=True) # More memory efficient

        if torch.isnan(current_batch_loss) or torch.isinf(current_batch_loss):
            print(f"WARNING: NaN/Inf loss detected at Epoch {epoch+1}, Batch {batch_idx+1}. Skipping update.")
            batch_loop_start_time = time.time() # Reset for next data load time
            continue
        
        backward_pass_start_time = time.time()
        scaler.scale(current_batch_loss).backward()
        backward_pass_end_time = time.time()
        epoch_backward_time += (backward_pass_end_time - backward_pass_start_time)

        # Optional Gradient Checking
        if gradient_check_interval > 0 and (batch_idx + 1) % gradient_check_interval == 0:
            gc_start = time.time()
            #print(f"\n--- Gradient Stats for Epoch {epoch+1}, Batch {batch_idx+1} ---")
            all_params = list(encoder.named_parameters()) + list(projection_head.named_parameters())
            for name, param in all_params:
                if param.grad is not None:
                    grad_norm = param.grad.norm().item()
                    grad_abs_mean = param.grad.abs().mean().item()
                    # print(f"{name:<50}: Grad Norm={grad_norm:.4e}, Grad Abs Mean={grad_abs_mean:.4e}")
                # else:
                    # print(f"{name:<50}: Grad is None")
            epoch_grad_check_time += (time.time() - gc_start)

        optimizer_step_start_time = time.time()
        # Optional: Gradient Clipping (Unscale first)
        # if scaler.is_enabled(): scaler.unscale_(optimizer)
        # torch.nn.utils.clip_grad_norm_(list(encoder.parameters()) + list(projection_head.parameters()), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        optimizer_step_end_time = time.time()
        epoch_optimizer_step_time += (optimizer_step_end_time - optimizer_step_start_time)

        epoch_loss_sum += current_batch_loss.item()
        batches_processed_in_epoch += 1

        if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(dataloader):
            if batches_processed_in_epoch > 0:
                avg_loss_so_far = epoch_loss_sum / batches_processed_in_epoch
                print(f"Epoch [{epoch+1}/{EPOCHS}], Batch [{batch_idx+1}/{len(dataloader)}], Loss: {current_batch_loss.item():.4f}, Avg Epoch Loss: {avg_loss_so_far:.4f}")
        
        batch_loop_start_time = time.time() # Reset for next data load time calculation

    # --- End of Epoch Summary ---
    epoch_overall_end_time = time.time()
    epoch_duration = epoch_overall_end_time - epoch_overall_start_time

    if batches_processed_in_epoch > 0:
        avg_epoch_loss = epoch_loss_sum / batches_processed_in_epoch
        training_losses.append(avg_epoch_loss)
        print(f"\n--- Epoch [{epoch+1}/{EPOCHS}] Summary ---")
        print(f"  Average Epoch Loss: {avg_epoch_loss:.4f}")
        print(f"  Epoch Duration: {epoch_duration:.2f}s")
        print(f"  Avg Time per Batch (Overall): {epoch_duration / batches_processed_in_epoch:.3f}s")
        # Print detailed timings (average per batch)
        print(f"  Avg Data Loading: {epoch_data_time / batches_processed_in_epoch:.4f}s")
        print(f"  Avg Forward Pass: {epoch_forward_time / batches_processed_in_epoch:.4f}s")
        print(f"  Avg Loss Calc:    {epoch_loss_calc_time / batches_processed_in_epoch:.4f}s")
        print(f"  Avg Backward Pass: {epoch_backward_time / batches_processed_in_epoch:.4f}s")
        print(f"  Avg Optimizer Step: {epoch_optimizer_step_time / batches_processed_in_epoch:.4f}s")
        if epoch_grad_check_time > 0:
             num_grad_checks = batches_processed_in_epoch // gradient_check_interval if gradient_check_interval > 0 else 0
             if num_grad_checks > 0:
                 print(f"  Avg Grad Check Time (per check): {epoch_grad_check_time / num_grad_checks:.4f}s")
    else:
        print(f"Epoch [{epoch+1}/{EPOCHS}] completed with no batches processed.")
        training_losses.append(float('nan')) # Or handle appropriately

print("\n--- Unsupervised Training Script Finished ---")

In [None]:
# --- At the beginning of embedding generation ---
print("\n--- Generating Final Embeddings (with pre-allocation attempt) ---")
encoder.eval()

if 'eval_dataset' not in locals() or eval_dataset is None: # Make sure eval_dataset is defined
    print("ERROR: eval_dataset is not defined!")
    # exit()
else:
    num_total_samples = len(eval_dataset)
    # ENCODER_EMBEDDING_DIM needs to be correctly defined
    # Example: ENCODER_EMBEDDING_DIM = encoder.fc_out.out_features (if fc_out is the last layer)
    # Or pass it as a known constant
    if 'ENCODER_EMBEDDING_DIM' not in locals():
        print("ERROR: ENCODER_EMBEDDING_DIM not defined!")
        # exit()
    else:
        print(f"DEBUG: Pre-allocating NumPy array for {num_total_samples} samples, {ENCODER_EMBEDDING_DIM} embedding dim.")
        # Ensure ENCODER_EMBEDDING_DIM is correct
        final_embeddings_np = np.zeros((num_total_samples, ENCODER_EMBEDDING_DIM), dtype=np.float32)
        current_idx = 0

        if DEVICE.type == 'cuda': torch.cuda.empty_cache()

        with torch.no_grad():
            for batch_idx, data_batch in enumerate(eval_dataloader): # Use your existing eval_dataloader
                # ... (data loading and feature extraction from data_batch as before) ...
                if isinstance(data_batch, (list, tuple)):
                    batch_features = data_batch[0].to(DEVICE)
                else:
                    batch_features = data_batch.to(DEVICE)
                
                if (batch_idx + 1) % 1000 == 0: print(f"  Processing batch {batch_idx + 1}/{len(eval_dataloader)}...")

                if batch_features.shape[0] == 0: continue

                try:
                    with torch.amp.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=(DEVICE.type == 'cuda')):
                        current_embeddings_tensor = encoder(batch_features)
                    
                    batch_actual_size = current_embeddings_tensor.shape[0]
                    # Place directly into the pre-allocated NumPy array
                    final_embeddings_np[current_idx : current_idx + batch_actual_size] = current_embeddings_tensor.cpu().numpy()
                    current_idx += batch_actual_size

                except RuntimeError as e:
                    print(f"  RUNTIME ERROR on batch {batch_idx + 1}: {e}")
                    if "out of memory" in str(e).lower(): print("    CUDA OOM error.")
                    break 
                except Exception as e_other:
                    print(f"  UNEXPECTED PYTHON ERROR on batch {batch_idx + 1}: {e_other}")
                    break
        
        if current_idx < num_total_samples:
            print(f"Warning: Only {current_idx} embeddings were filled out of {num_total_samples} expected.")
            final_embeddings_np = final_embeddings_np[:current_idx] # Trim if processing stopped early

        print(f"Generated final embeddings shape: {final_embeddings_np.shape}")
        # Now `final_embeddings_np` is your result, no need for all_embeddings_list or concatenate

In [None]:
# --- Stage 2: Prepare Labels for Evaluation ---
from sklearn.metrics import average_precision_score
y_known = None
# This logic assumes X_processed_features_np exists and its row count matches what eval_dataset was based on.
# And df_raw has the original labels.
# **Crucially, y_known must align with the samples in final_embeddings_np**
if 'X_processed_features_np' in locals() and 'df_raw' in locals() and 'Is Laundering' in df_raw.columns:
    # If your preprocessing function `preprocess_for_ssl` returned y_target that corresponds
    # to X_processed_features_np, it's better to use that.
    # y_known = y_target_from_preprocessing 
    
    # Fallback: using df_raw and assuming X_processed_features_np corresponds to the head of df_raw
    # or that their lengths are made to match.
    num_samples_for_labels = len(X_processed_features_np) if 'X_processed_features_np' in locals() else (len(final_embeddings_np) if final_embeddings_np is not None else 0)

    if num_samples_for_labels > 0 and num_samples_for_labels <= len(df_raw):
        y_known = df_raw['Is Laundering'].values[:num_samples_for_labels]
        # known_indices = np.arange(len(y_known)) # Not strictly needed if y_known aligns directly
        print(f"Extracted {len(y_known)} labels for evaluation, matching {num_samples_for_labels} processed samples.")
    else:
        print(f"Warning: Cannot reliably extract y_known. num_samples_for_labels={num_samples_for_labels}, len(df_raw)={len(df_raw) if 'df_raw' in locals() else 'N/A'}")
elif 'y_target_from_preprocessing' in locals() and y_target_from_preprocessing is not None:
    # Ideal case if your preprocessing function returns the aligned y_target
    y_known = y_target_from_preprocessing
    print(f"Using y_target_from_preprocessing with {len(y_known)} labels.")
else:
    print("Warning: Could not determine source for 'y_known'. Labels not available for evaluation.")


# --- Stage 3: Evaluate Embeddings with RandomForest Probe ---
print("\n--- Evaluating Learned Embeddings with RandomForest Probe ---")

# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# Key Change: Use final_embeddings_np instead of all_embeddings
# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
if final_embeddings_np is not None and y_known is not None and len(y_known) > 0:
    if len(final_embeddings_np) != len(y_known):
        print(f"ERROR: Mismatch between number of embeddings ({len(final_embeddings_np)}) and labels ({len(y_known)}). Probe cannot proceed.")
        print("       Ensure y_known correctly corresponds to the samples for which embeddings were generated.")
    else:
        embeddings_for_probe = final_embeddings_np # Use the correctly named variable
        print(f"Using {len(y_known)} labeled samples for RF probe.")
        print(f"Embedding shape for probe: {embeddings_for_probe.shape}")

        try:
            X_train_emb, X_test_emb, y_train_emb, y_test_emb = train_test_split(
                embeddings_for_probe,
                y_known,
                test_size=0.3,
                random_state=42,
                stratify=y_known
            )
            print(f"RF Probe: Train size={len(y_train_emb)}, Test size={len(y_test_emb)}")
            
            # Check class distribution
            if len(y_train_emb) > 0: print(f"Class distribution in y_train_emb: {np.bincount(y_train_emb)}")
            if len(y_test_emb) > 0: print(f"Class distribution in y_test_emb: {np.bincount(y_test_emb)}")

            if len(np.unique(y_train_emb)) < 2 or len(np.unique(y_test_emb)) < 2 :
                print("Warning: One of the splits (train or test) for RF probe has only one class. Metrics might be misleading or fail.")
                # Proceeding, but be aware of potential issues with metrics.

            rf_probe = RandomForestClassifier(
                n_estimators=100,
                max_depth=None,
                random_state=42,
                n_jobs=-1,
                class_weight='balanced_subsample',
                min_samples_leaf=5,
                min_samples_split=10
            )

            print("\nTraining RandomForest Probe...")
            fit_start_time = time.time()
            rf_probe.fit(X_train_emb, y_train_emb)
            fit_end_time = time.time()
            print(f"RF probe training completed in {fit_end_time - fit_start_time:.2f} seconds.")

            print("\nEvaluating RF Probe on the test set...")
            y_pred_probe_rf = rf_probe.predict(X_test_emb)
            y_pred_proba_probe_rf = rf_probe.predict_proba(X_test_emb)[:, 1]

            accuracy_probe_rf = accuracy_score(y_test_emb, y_pred_probe_rf)
            precision_probe_rf, recall_probe_rf, f1_probe_rf, _ = precision_recall_fscore_support(
                y_test_emb, y_pred_probe_rf, average='binary', pos_label=1, zero_division=0
            )
                        
            auprc_probe_rf = average_precision_score(y_test_emb, y_pred_proba_probe_rf, pos_label=1)
            
            auroc_probe_rf = float('nan')
            if len(np.unique(y_test_emb)) > 1:
                try: auroc_probe_rf = roc_auc_score(y_test_emb, y_pred_proba_probe_rf)
                except ValueError as e_auroc: print(f"Could not calculate AUROC for RF probe: {e_auroc}")
            else: print("AUROC cannot be calculated for RF probe: only one class in y_test_emb.")

            print("\n--- RandomForest Probe Evaluation Results ---")
            print(f"Accuracy (RF Probe): {accuracy_probe_rf:.4f}")
            print(f"AUROC (RF Probe):    {auroc_probe_rf:.4f}")
            print(f"AUPRC (Average Precision) (RF Probe): {auprc_probe_rf:.4f}")
            print(f"Precision (Illicit): {precision_probe_rf:.4f}")
            print(f"Recall (Illicit):    {recall_probe_rf:.4f}")
            print(f"F1-Score (Illicit):  {f1_probe_rf:.4f}")
            print("\nClassification Report (RF Probe - Test Set):")
            print(classification_report(y_test_emb, y_pred_probe_rf, target_names=["Licit (0)", "Illicit (1)"], zero_division=0))

            if hasattr(rf_probe, 'feature_importances_'):
                importances_rf = rf_probe.feature_importances_
                emb_indices = np.argsort(importances_rf)[::-1]
                print("\nTop 10 Embedding Dimension Importances (RF Probe):")
                for i in range(min(10, len(emb_indices))):
                    print(f"  Dim {emb_indices[i]}: {importances_rf[emb_indices[i]]:.4f}")

        except ValueError as e_split:
            print(f"Error during train/test split or RF evaluation: {e_split}")
        except Exception as e_general:
            print(f"An unexpected error occurred during RF probe evaluation: {e_general}")
else:
    if final_embeddings_np is None:
        print("\nSkipping RF Probe evaluation: 'final_embeddings_np' was not generated successfully.")
    elif y_known is None or len(y_known) == 0:
        print("\nSkipping RF Probe evaluation: 'y_known' labels are not available or empty.")

gc.collect() # Clean up at the end