In [1]:
from preprocessing import *
from models import *
from main import *
from data import *
from joblib import load

## 1. Load Data

In [2]:
nativetx_df = pd.read_parquet('dataset/raw_data/transaction_native_seeder.parquet')
tokentx_df = pd.read_parquet('dataset/raw_data/transaction_token_seeder.parquet')
seed_df = pd.read_csv('dataset/raw_data/seed_labels.csv')

## 2. Generating training data

### We'll begin by establishing the training data for our models by determining the total number of addresses that are present in seed_df which are actually active in the test sets, nativetx_df and tokentx_df.


In [3]:
seed_addr = set(seed_df.eoa.unique())
seed_native_addr = set(seed_df.loc[seed_df["eoa"].isin(nativetx_df["FROM_ADDRESS"]) | seed_df["eoa"].isin(nativetx_df["TO_ADDRESS"]) | seed_df["eoa"].isin(nativetx_df["ORIGIN_TO_ADDRESS"]) | seed_df["eoa"].isin(nativetx_df["ORIGIN_FROM_ADDRESS"]), "eoa"].unique())
seed_token_addr = set(seed_df.loc[seed_df["eoa"].isin(tokentx_df["FROM_ADDRESS"]) | seed_df["eoa"].isin(tokentx_df["TO_ADDRESS"]) | seed_df["eoa"].isin(tokentx_df["ORIGIN_TO_ADDRESS"]) | seed_df["eoa"].isin(tokentx_df["ORIGIN_FROM_ADDRESS"]) | seed_df["eoa"].isin(tokentx_df["CONTRACT_ADDRESS"]), "eoa"].unique())
cex_addresses = seed_df.loc[seed_df["prediction"] == 1, "eoa"].tolist()
print(f'Total active addresses from seed_df in tokentx_df: {len(seed_token_addr)} out of {len(seed_addr)}')
print(f'Total active addresses from seed_df in nativetx_df: {len(seed_native_addr)} out of {len(seed_addr)}')
print(f'Total active CEX addresses in seed_df: {len(cex_addresses)} out of {len(seed_addr)}')

Total active addresses from seed_df in tokentx_df: 6708 out of 8619
Total active addresses from seed_df in nativetx_df: 8101 out of 8619
Total active CEX addresses in seed_df: 302 out of 8619


In [4]:
columns_to_check_tokentx = tokentx_df[["FROM_ADDRESS", "TO_ADDRESS", "ORIGIN_TO_ADDRESS", "ORIGIN_FROM_ADDRESS", "CONTRACT_ADDRESS"]]
columns_to_check_nativetx = nativetx_df[["FROM_ADDRESS", "TO_ADDRESS", "ORIGIN_TO_ADDRESS", "ORIGIN_FROM_ADDRESS"]]

native_addr = find_addresses_in_df(nativetx_df, columns_to_check_nativetx, cross_reference=False)
token_addr = find_addresses_in_df(tokentx_df, columns_to_check_tokentx, cross_reference=False)
active_tokentx_addr = find_addresses_in_df(tokentx_df, columns_to_check_tokentx, seed_addr)
active_nativetx_addr = find_addresses_in_df(nativetx_df, columns_to_check_nativetx, seed_addr)
active_addr = list(active_tokentx_addr.union(active_nativetx_addr))
print(f'Unique addresses in nativetx_df: {len(native_addr)}')
print(f'Unique addresses in tokentx_df: {len(token_addr)}')
print(f"Total addresses from seed_df['eoa'] found in tokentx_df and nativetx_df: {len(active_addr)}.")
print(f"{len(active_nativetx_addr & active_tokentx_addr)} unique addresses in common on both sets.")

Unique addresses in nativetx_df: 60604
Unique addresses in tokentx_df: 52719
Total addresses from seed_df['eoa'] found in tokentx_df and nativetx_df: 8116.
6693 unique addresses in common on both sets.


In [5]:
cex_in_tokentx = find_addresses_in_df(tokentx_df, columns_to_check_tokentx, set(cex_addresses))
cex_in_nativetx = find_addresses_in_df(nativetx_df, columns_to_check_nativetx, set(cex_addresses))
print(f"{len(cex_in_tokentx)} CEX addresses found in tokentx_df.")
print(f"{len(cex_in_nativetx)} CEX addresses found in nativetx_df.")
print(f"{len(set(cex_in_nativetx) & set(cex_in_tokentx))} unique CEX addresses in common on both sets.")

78 CEX addresses found in tokentx_df.
204 CEX addresses found in nativetx_df.
78 unique CEX addresses in common on both sets.


In [6]:
seed_active_df = seed_df[seed_df["eoa"].isin(active_nativetx_addr)]
seed_active_df = seed_active_df.rename(columns={"eoa": 'address'})
seed_active_df = seed_active_df.reset_index(drop=True)

### Using the cluster_sizes function, we'll establish a treshold to determine, in respect to a specific dataset, what's considered to be a "large cluster". We'll then adjust this number later to the pipeline preprocessing function.

In [7]:
# nativetx_cluster_size = cluster_sizes(nativetx_df, active_nativetx_addr, resolution=1.5)
# tokentx_cluster_size = cluster_sizes(tokentx_df, active_nativetx_addr, resolution=1.5)
# print(f'nativetx_df cluster sizes: {nativetx_cluster_size.describe()}')
# print('='*50)
# print(f'tokentx_df cluster sizes: {tokentx_cluster_size.describe()}')

## 3. Training our models

### We've obtained a new DataFrame with only the active addresses and their predictions, which consists of 8,101 unique addresses. Since we previously established that nativetx_df contains the bulk of the activity of these addresses, we'll train our model based on the behavior they exhibit there. We'll process the data through our pipeline function, stored in preprocessing.py, to obtain a clean DataFrame with our engineered features, ready for training.

In [8]:
#train_df = pipeline_processing(nativetx_df, active_nativetx_addr, timestamp_col="BLOCK_TIMESTAMP", resolution= 1.5, min_cluster_size=1000)

### Before training our model, we'll add the 'prediction' column and double check that the preprocessing didn't shuffle the data associated with each address, hence altering the prediction tags.

In [9]:
# train_df = train_df.merge(
#         seed_active_df[['address', 'prediction']],
#         left_on='address',
#         right_on='address',
#         how='left')

In [10]:
# merged_df = seed_active_df[['address', 'prediction']].merge(
#     train_df[['address', 'prediction']],
#     on='address',
#     how='inner',
#     suffixes=('_df1', '_df2')
# )
# merged_df['predictions_match'] = merged_df['prediction_df1'] == merged_df['prediction_df2']

# mismatches = merged_df[~merged_df['predictions_match']]

# print(f"Total addresses: {len(merged_df)}")
# print(f"Addresses with matching predictions: {len(merged_df[merged_df['predictions_match']])}")
# print(f"Addresses with mismatched predictions: {len(mismatches)}")

# if not mismatches.empty:
#     print("\nMismatched predictions:")
#     print(mismatches)
# else:
#     print("\nAwesome, all predictions match!")

### For our task of detecting CEX/Bridge addresses we'll train and compare three different models: a Random Forest Classifier, a Logistic Regression, and an Isolation Forest. We'll save our train_df to be able to call our main function and initialize, compile, train, evaluate and compare all three models, in order to see which works best.

In [11]:
#train_df.to_csv("dataset/processed_data/train_data.csv", index=False)

In [30]:
train_df = pd.read_csv("dataset/processed_data/train_data.csv")
X = ['outgoing_volume_USD', 'incoming_volume_USD', 'total_volume_USD', 'unique_interactions', 'tx_count', 'tx_per_hour', 'active_days', 'is_large_cluster', 'interaction_volume_USD']
y = 'prediction'

In [31]:
train_df.describe()

Unnamed: 0,outgoing_volume_USD,incoming_volume_USD,total_volume_USD,unique_interactions,tx_count,tx_per_hour,active_days,is_large_cluster,interaction_volume_USD,prediction
count,8101.0,8101.0,8101.0,8101.0,8101.0,8101.0,8101.0,8101.0,8101.0,8101.0
mean,157.140641,193.224182,350.364823,20.36835,55.363659,1.872795,251.85061,0.413776,10224.28,0.025182
std,1098.738813,1082.746526,2082.866882,32.505507,126.366408,1.38828,186.641647,0.49254,140384.3,0.156687
min,0.0,0.0,0.03,0.0,1.0,1.0,0.000174,0.0,0.0,0.0
25%,5.444286,9.056667,16.18,4.0,6.0,1.4,72.517072,0.0,97.88,0.0
50%,21.481,30.82,55.420606,11.0,22.0,1.684932,242.789769,0.0,731.9931,0.0
75%,88.202,120.053333,212.061667,27.0,63.0,2.035714,395.354803,1.0,3927.859,0.0
max,77217.274304,71435.731667,148653.00597,952.0,5382.0,100.8,595.514769,1.0,8158001.0,1.0


In [21]:
sup_models = main_sup(train_df, X, y)

Class distribution before SMOTETomek: Counter({0: 5060, 1: 124})
Class distribution after SMOTETomek: Counter({0: 5006, 1: 2476})

Training random_forest ✅
Best Parameters for RandomForestClassifier: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}

Evaluation metrics for Validation set:
Accuracy: 0.9737654320987654
F1-Score (Weighted): 0.9759854231741782
F1-Score (Macro): 0.8265060240963855
ROC-AUC Score: 0.9781746031746031
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      1254
           1       0.57      0.81      0.67        42

    accuracy                           0.97      1296
   macro avg       0.78      0.89      0.83      1296
weighted avg       0.98      0.97      0.98      1296

Confusion Matrix:
 [[1228   26]
 [   8   34]]

Evaluation metrics for Test set:
Accuracy: 0.9710055521283159
F1-Score (Weighted): 0.9744785868846179
F1-S

In [32]:
if_model = iso_forest(train_df, X, y)

Anomaly-based CEX Flags: 810
Best Parameters: {'contamination': 0.1, 'max_features': 0.5, 'max_samples': 200, 'n_estimators': 200}
ROC-AUC Score: 0.5191149778893449
Confusion Matrix:
 [[7115  782]
 [ 176   28]]
Precision: 0.0345679012345679
Recall: 0.13725490196078433
F1-Score: 0.055226824457593686
Model saved to results/isolation_forest_model.pkl
Results saved to results/isolation_forest_predictions.csv


In [14]:
if_model[0]

Unnamed: 0,address,is_cex
0,0x50b0aabf36b21e72add83b8904cb52bfe0171f66,0
1,0x4e29fa717fb61753e26885421b84ff7e06df585e,0
2,0x5507dbd48a5a5bace8a6030e878cc4e0af147c33,0
3,0xd9185e233575f4e0d0e83159fdc6dfe9107bbf4d,0
4,0xd2578c95c2daf87e7542d4c305c95cef01295877,0
...,...,...
8096,0xb642d13f3bf889a4c789df387c35caa3e007207d,0
8097,0xe7064cc9eb9a44a67162db2b6275dfedb3c490c2,0
8098,0x1765585f0177b378b6819118a0f1a7822b141913,0
8099,0xbef2d9718f241c874b1ca09e4f757888ce95d57b,0


In [33]:
# Merge the two DataFrames on the address column (not the index)
merged_df = pd.merge(if_model[0], train_df[['address', 'prediction']], on='address', how='left')

# Identify incorrectly flagged addresses (is_cex = 1 but prediction = 0)
incorrectly_flagged = merged_df[(merged_df['is_cex'] == 1) & (merged_df['prediction'] == 0)]

# Create a DataFrame with the incorrectly flagged addresses
incorrectly_flagged_df = incorrectly_flagged[['address', 'is_cex', 'prediction']]

# Output the results
print(f"Number of incorrectly flagged addresses: {len(incorrectly_flagged_df)}")
print("\nIncorrectly Flagged Addresses:")
print(incorrectly_flagged_df)

Number of incorrectly flagged addresses: 782

Incorrectly Flagged Addresses:
                                         address  is_cex  prediction
2     0x5507dbd48a5a5bace8a6030e878cc4e0af147c33       1           0
4     0xd2578c95c2daf87e7542d4c305c95cef01295877       1           0
5     0x81e877dd467f65b79aff559a8fafed6e95f01ad8       1           0
9     0xf1e7dbe363dcb884b2d860e7dd7b3a675486e5ee       1           0
14    0xd6ca66a00997a2038a6dc7304131a40f48268f77       1           0
...                                          ...     ...         ...
7895  0x5e76ff23db38fab6746d1026daa53ef9dde15614       1           0
7921  0x0bc4b9b8e6551c848beae8fe1375b470f8a3b206       1           0
7954  0xc96d84ec51362b7cf37c155dd842502a0b904dc6       1           0
7956  0xde7a3869b325e8e609e8ccbee818cc27eedbf91a       1           0
7958  0x1b107a7ff0be3c77f93543047d1e09437dbc7133       1           0

[782 rows x 3 columns]


Preparing test data from nativetx_df and tokentx_df

In [54]:
nativetx_large_clusters = cluster_sizes(nativetx_df, native_addr, resolution= 1.5)
nativetx_large_clusters.cluster_size.describe()

count    60604.00000
mean       830.73444
std       1129.62028
min          0.00000
25%         54.00000
50%        487.00000
75%        902.00000
max       4444.00000
Name: cluster_size, dtype: float64

In [55]:
tokentx_large_clusters = cluster_sizes(tokentx_df, token_addr, resolution= 1.5)
tokentx_large_clusters.cluster_size.describe()

count    52719.000000
mean       625.719627
std        809.488410
min          0.000000
25%          0.000000
50%        100.000000
75%       1091.000000
max       2393.000000
Name: cluster_size, dtype: float64

In [None]:
# native_test = pipeline_processing(nativetx_df, native_addr, origin_from_col='ORIGIN_FROM_ADDRESS',
#                                   origin_to_col='ORIGIN_TO_ADDRESS', timestamp_col="BLOCK_TIMESTAMP",
#                                   resolution= 1.5, min_cluster_size=900)

Volume processing done ✅
Unique interactions processing done ✅
Activity processing done ✅
Cluster processing done ✅
All done ✅


In [None]:
# token_test = pipeline_processing(tokentx_df, token_addr, origin_from_col='ORIGIN_FROM_ADDRESS',
#                                   origin_to_col='ORIGIN_TO_ADDRESS', contract_address_col="CONTRACT_ADDRESS",
#                                   resolution= 1.5, min_cluster_size=1000)

Volume processing done ✅
Unique interactions processing done ✅
Activity processing done ✅
Cluster processing done ✅
All done ✅


In [None]:
# native_test_data = native_test[~native_test['address'].isin(seed_df['eoa'])]
# token_test_data = token_test[~token_test['address'].isin(seed_df['eoa'])]
# print("Filtered shape:", native_test_data.shape)
# print("Original shape:", native_test.shape)
# print("Filtered shape:", token_test_data.shape)
# print("Original shape:", token_test.shape)

Filtered shape: (46011, 10)
Original shape: (52719, 10)


In [22]:
# native_test_data.to_csv("dataset/processed_data/native_test_data.csv", index=False)
# token_test_data.to_csv("dataset/processed_data/token_test_data.csv", index=False)

In [5]:
native_test = pd.read_csv("dataset/processed_data/native_test_data.csv")
X_native_test = native_test.drop(columns=['address'])
native_test_addr = native_test.address

In [44]:
token_test = pd.read_csv("dataset/processed_data/token_test_data.csv")
X_token_test = native_test.drop(columns=['address'])
token_test_addr = token_test.address

In [45]:
common_addresses = set(native_test["address"]).intersection(set(token_test["address"]))
print(f"{len(common_addresses)} addresses in common out of {len(native_test_addr) + len(token_test_addr)} unique addresses in both sets")

9783 addresses in common out of 98514 unique addresses in both sets


## 4. Testing our best model on test set. 
### We'll procede to test our best model, the already downloaded Random Forest, and use it to classify our test data.

In [7]:
rf_best = load('results/random_forest_model.pkl')

In [8]:
native_y_proba = rf_best.predict_proba(X_native_test)

In [10]:
native_results = prediction_df(native_y_proba, native_test_addr)

In [42]:
native_results.to_csv("predictions/native_dataset_predictions.csv")