In [29]:
from torch import nn
import torch

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

In [30]:
## Downloading the database
import kagglehub


# Download latest version
path = kagglehub.dataset_download("ellipticco/elliptic-data-set")

print("Path to dataset files:", path)

Path to dataset files: /Users/anatol/.cache/kagglehub/datasets/ellipticco/elliptic-data-set/versions/1


In [31]:
classes_df = pd.read_csv(path + "/elliptic_bitcoin_dataset/elliptic_txs_classes.csv")
features_df = pd.read_csv(path + "/elliptic_bitcoin_dataset/elliptic_txs_features.csv")
edge_df = pd.read_csv(path + "/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv")

print(classes_df.head(), end="\n\n")
print(features_df.head(), end="\n\n")
print(edge_df.head())

        txId    class
0  230425980  unknown
1    5530458  unknown
2  232022460  unknown
3  232438397        2
4  230460314  unknown

   230425980  1  -0.1714692896288031  -0.18466755143291433  \
0    5530458  1            -0.171484             -0.184668   
1  232022460  1            -0.172107             -0.184668   
2  232438397  1             0.163054              1.963790   
3  230460314  1             1.011523             -0.081127   
4  230459870  1             0.961040             -0.081127   

   -1.2013688016765636  -0.12196959975910057  -0.04387454791734898  \
0            -1.201369             -0.121970             -0.043875   
1            -1.201369             -0.121970             -0.043875   
2            -0.646376             12.409294             -0.063725   
3            -1.201369              1.153668              0.333276   
4            -1.201369              1.303743              0.333276   

   -0.11300200928476244  -0.06158379407303222  -0.16209679981659642  ... 

In [32]:
feature_headers = ['txId', 'Time step'] + [f'f{i}' for i in range(1, 166)]
features_df.columns = feature_headers
print(features_df.columns)
df = pd.merge(features_df, classes_df, on='txId', how='left')

# Map classes for evaluation: 1 (illicit) -> 1, 2 (licit) -> 0, unknown -> NaN
df['class'] = df['class'].map({'1': 1, '2': 0})
known_indices = df.index[df['class'].notna()].tolist()
y_known = df.loc[known_indices, 'class'].values.astype(int)
# Map classes for evaluation: 1 (illicit) -> 1, 2 (licit) -> 0, unknown -> NaN
df['class'] = df['class'].map({'1': 1, '2': 0})

# Separate features (X) and labels (y)
# Features: Time step + f1 to f165
X_cols = ['Time step'] + [f'f{i}' for i in range(1, 166)]
X_all = df[X_cols].values
y_all = df['class'].values # Contains NaN for unknowns

Index(['txId', 'Time step', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       ...
       'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164',
       'f165'],
      dtype='object', length=167)


In [33]:
print(features_df.columns)
print(classes_df.shape)
print(features_df.shape)

Index(['txId', 'Time step', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       ...
       'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164',
       'f165'],
      dtype='object', length=167)
(203769, 2)
(203768, 167)


In [34]:
X = features_df.iloc[:, 0:95]
y = classes_df[0:-1]["class"]
print(y.shape)
print(X.shape)
print(features_df.shape)

(203768,)
(203768, 95)
(203768, 167)


In [35]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
print("Using device:", device)

Using device: mps


In [None]:
# Scale features (fit only on training data, but here we apply to all for unsupervised)
print("Scaling features...")
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(X_all)

In [None]:
# --- Assume previous sections have run successfully ---
# 1. Data Loading (features_df, classes_df merged into df)
# 2. Feature Scaling (X_all_scaled created)
# 3. Label Preparation (known_indices, y_known created)
# --- Required variables from previous steps: ---
# X_all_scaled: Numpy array of all scaled features (N_samples, N_features)
# known_indices: List or array of indices corresponding to labeled data
# y_known: Numpy array of true labels (0 or 1) for the known data

# --- Supervised Baseline using RandomForest ---

print("\n--- Starting Supervised Baseline Training (RandomForest) ---")

if 'X_all_scaled' not in locals() or 'known_indices' not in locals() or 'y_known' not in locals():
    print("Error: Prerequisite data (X_all_scaled, known_indices, y_known) not found. Ensure data loading and prep ran.")
    # Handle error appropriately, maybe exit()
    exit()
elif len(known_indices) == 0:
    print("Error: No known labels available for supervised training.")
    exit()
else:
    # Select the features and labels for the known data
    X_known_scaled = X_all_scaled[known_indices]
    # y_known is already defined

    print(f"Using {len(y_known)} labeled samples for training and testing.")
    print(f"Feature shape: {X_known_scaled.shape}")

    # Split the labeled data into training and testing sets
    # stratify=y_known ensures similar class proportions in train/test sets
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X_known_scaled,
            y_known,
            test_size=0.3,       # Use 30% for testing
            random_state=42,   # For reproducibility
            stratify=y_known   # Important for imbalanced data
        )
        print(f"Train set size: {len(y_train)}, Test set size: {len(y_test)}")
        print(f"Illicit in train: {np.sum(y_train==1)}, Illicit in test: {np.sum(y_test==1)}")

    except ValueError as e:
         print(f"Error during train_test_split (potentially too few samples of one class for stratification): {e}")
         exit()


    # Initialize RandomForestClassifier
    # Hyperparameters can be tuned, these are reasonable defaults
    # class_weight='balanced' helps with imbalanced data
    rf_classifier = RandomForestClassifier(
        n_estimators=150,      # Number of trees
        max_depth=20,          # Limit tree depth to prevent overfitting (tune this)
        random_state=42,
        n_jobs=-1,             # Use all CPU cores
        class_weight='balanced' # Important for imbalanced classes
        # min_samples_leaf=5   # Can also help prevent overfitting
    )

    # Train the model
    print("\nTraining RandomForest model...")
    start_time = time.time()
    rf_classifier.fit(X_train, y_train)
    end_time = time.time()
    print(f"RandomForest training completed in {end_time - start_time:.2f} seconds.")

    # Make predictions on the test set
    print("\nMaking predictions on the test set...")
    y_pred_rf = rf_classifier.predict(X_test)
    # Get probability predictions for AUROC calculation
    # Predict probability of the positive class (illicit=1)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)[:, 1]

    # Evaluate the results
    print("\n--- Supervised RandomForest Evaluation Results ---")
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    precision_rf, recall_rf, f1_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average='binary', pos_label=1, zero_division=0)

    try:
        # Check if both classes are present in y_test for AUROC
        if len(np.unique(y_test)) > 1:
             auroc_rf = roc_auc_score(y_test, y_pred_proba_rf) # Use probabilities for AUROC
        else:
             print("AUROC requires multiple classes in y_test. Skipping AUROC calculation.")
             auroc_rf = float('nan')
    except ValueError as e:
        print(f"Could not calculate AUROC: {e}")
        auroc_rf = float('nan')

    print(f"Accuracy (RF): {accuracy_rf:.4f}")
    print(f"AUROC (RF):    {auroc_rf:.4f}") # <<<--- THIS IS THE KEY BASELINE METRIC
    print(f"Precision (for illicit=1): {precision_rf:.4f}")
    print(f"Recall (for illicit=1):    {recall_rf:.4f}")
    print(f"F1-Score (for illicit=1):  {f1_rf:.4f}")

    print("\nClassification Report (RF - Test Set):")
    print(classification_report(y_test, y_pred_rf, target_names=["Licit (0)", "Illicit (1)"], zero_division=0))

    # Optional: Feature Importance
    # importances = rf_classifier.feature_importances_
    # feature_names = X_cols # From data loading section
    # importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
    # importance_df = importance_df.sort_values('importance', ascending=False)
    # print("\nTop 10 Feature Importances:")
    # print(importance_df.head(10))

# --- Rest of your script (if any) ---