<a href="https://colab.research.google.com/github/BayramovaNazrin/test1/blob/main/graph_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Load Data**

In [38]:
!git clone https://github.com/BayramovaNazrin/test1.git
%cd /content/test1

fatal: destination path 'test1' already exists and is not an empty directory.
/content/test1


In [39]:
import sys

# Make the cloned repository importable. The membership check keeps sys.path
# free of duplicate entries when this cell is executed more than once.
REPO_DIR = '/content/test1'
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

from load_data import load_data

# load_data() comes from the cloned repository; its four return values are
# unpacked here and used by the preprocessing cells further down.
features, edges, classes, merged_df = load_data()

# **Imports**

In [40]:
try:
    import torch_geometric
except ImportError:
    import torch
    !pip install -q torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-{torch.__version__}.html


In [41]:
import pandas as pd
import numpy as np
import torch
import random
from torch_geometric.nn import Node2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.utils.multiclass import unique_labels
from load_data import load_data

# **Reproducibility Control**
Fix every random seed so that each run gives the same results.

In [42]:
import random

# One seed shared by every random number generator used in this notebook.
SEED = 42

# Seed Python's `random`, NumPy and PyTorch, in that order.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# GPU-specific settings are applied only when a CUDA device is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    # Request deterministic cuDNN kernels and switch off cuDNN benchmarking.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# **Preprocess graph**

In [43]:
#for updates, do not forget delete them
!git pull
import importlib, load_data
importlib.reload(load_data)

Already up to date.


<module 'load_data' from '/content/test1/load_data.py'>

In [44]:
# Keep only labelled transactions and store the label as an integer.
classes = classes[classes['class'] != 'unknown'].copy()
classes['class'] = classes['class'].astype(int)

# --- Ensure consistent ID types ---
# IDs are matched across the three frames below, so cast all of them to str.
features['txId'] = features['txId'].astype(str)
classes['txId'] = classes['txId'].astype(str)
edges['txId1'] = edges['txId1'].astype(str)
edges['txId2'] = edges['txId2'].astype(str)

# --- Align features with labeled nodes ---
valid_nodes = classes['txId'].values
# .copy() makes the filtered frame independent, so the column assignments above
# do not raise SettingWithCopy warnings when this cell is re-run.
features = features[features['txId'].isin(valid_nodes)].copy()
node_ids = features['txId'].values
# Position of each transaction in `features` == its node index in the graph.
node_id_map = {id_: i for i, id_ in enumerate(node_ids)}

# Duplicate IDs would collapse entries in node_id_map or duplicate rows in the
# label lookup below, silently misaligning x and y.
assert len(node_id_map) == len(node_ids), "duplicate txId values in features"
assert classes['txId'].is_unique, "duplicate txId values in classes"

# --- Filter edges ---
# Keep an edge only when both of its endpoints are labelled nodes.
edges = edges[edges['txId1'].isin(node_id_map) & edges['txId2'].isin(node_id_map)].copy()
print("Remaining edges:", edges.shape)

# --- Build graph tensor ---
# Vectorised ID -> index lookup; stacking source and target rows always gives a
# (2, num_edges) tensor, including the case where no edge survives the filter.
src = edges['txId1'].map(node_id_map).to_numpy(dtype=np.int64)
dst = edges['txId2'].map(node_id_map).to_numpy(dtype=np.int64)
edge_index = torch.from_numpy(np.stack([src, dst])).contiguous()
print("edge_index shape:", edge_index.shape)

# Node features: every column except the first one (txId).
x = torch.tensor(features.iloc[:, 1:].values, dtype=torch.float)
# Labels, looked up in node order so that y[i] belongs to x[i].
y = torch.tensor(classes.set_index('txId').loc[node_ids, 'class'].values, dtype=torch.long)
assert x.size(0) == y.size(0) == len(node_ids), "features and labels are misaligned"


Remaining edges: (36624, 2)
edge_index shape: torch.Size([2, 36624])


# **Split Dataset**

In [45]:
# Stratified 60 / 20 / 20 split over node positions: first hold out 40 % of the
# nodes, then cut that hold-out in half for validation and test.
node_indices = np.arange(len(y))
train_idx, temp_idx = train_test_split(
    node_indices,
    test_size=0.4,
    stratify=y,
    random_state=SEED,
)
val_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.5,
    stratify=y[temp_idx],
    random_state=SEED,
)

# Labels of each subset as NumPy arrays for scikit-learn.
y_train = y[train_idx].numpy()
y_val = y[val_idx].numpy()
y_test = y[test_idx].numpy()

# **Train Node2Vec**

In [46]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# num_nodes is passed explicitly. When it is omitted the model infers the node
# count from the largest index in edge_index, so a trailing node without any
# edge would be missing from the embedding table and the embeddings could no
# longer be concatenated row-by-row with the feature matrix.
node2vec = Node2Vec(
    edge_index=edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1, q=1, sparse=True,
    num_nodes=len(node_ids),
).to(device)

# The loader yields batches of positive and negative random walks.
loader = node2vec.loader(batch_size=128, shuffle=True, num_workers=2)
# sparse=True above produces sparse gradients, hence SparseAdam.
optimizer = torch.optim.SparseAdam(list(node2vec.parameters()), lr=0.01)

print("Training Node2Vec...")
node2vec.train()
for epoch in range(1, 6):
    # Sum of the batch losses over the epoch (not an average).
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = node2vec.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

Training Node2Vec...
Epoch 1, Loss: 1581.2072
Epoch 2, Loss: 757.7323
Epoch 3, Loss: 450.1743
Epoch 4, Loss: 343.3462
Epoch 5, Loss: 308.9344


# **Combine embeddings + features**

In [47]:
# Learned Node2Vec vectors, moved to the CPU as a NumPy array (one row per node).
embeddings = node2vec.embedding.weight.detach().cpu().numpy()
# Original tabular features: every column except the first one (txId).
original_features = features.iloc[:, 1:].to_numpy()
# Place the embedding columns first, followed by the original feature columns.
X_combined = np.hstack([embeddings, original_features])

# Slice the combined matrix with the index arrays produced by the split cell.
X_train, X_val, X_test = (
    X_combined[subset_idx] for subset_idx in (train_idx, val_idx, test_idx)
)


# **Train RandomForest classifier**

In [48]:
# Hyper-parameters of the forest, collected in one place.
rf_params = dict(
    n_estimators=500,
    max_depth=15,
    # Class 1 is weighted five times as heavily as class 2.
    class_weight={1: 5, 2: 1},
    random_state=SEED,
    n_jobs=1,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

# **Evaluate**

In [49]:
def safe_report(y_true, y_pred, y_prob=None, classes=None):
    """Print a classification report and, optionally, ROC-AUC / PR-AUC.

    Label 2 is reported as 'Licit'; every other label is reported as 'Illicit'.
    The AUC metrics are computed for label 1 against the rest.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like of shape (n_samples, n_classes), optional
        Predicted class probabilities. When omitted, only the classification
        report is printed.
    classes : array-like, optional
        Label belonging to each column of ``y_prob`` (for a scikit-learn
        classifier, its ``classes_`` attribute). When omitted, the columns are
        assumed to follow the sorted labels found in ``y_true`` / ``y_pred``,
        which is only correct if every class occurs in the evaluated data.

    Raises
    ------
    ValueError
        If ``classes`` is given and its length does not match the number of
        columns of ``y_prob``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    labels = unique_labels(y_true, y_pred)
    names = ['Licit' if l == 2 else 'Illicit' for l in labels]
    print(classification_report(y_true, y_pred, labels=labels,
                                target_names=names, digits=4, zero_division=0))
    if y_prob is None:
        return

    y_prob = np.asarray(y_prob)
    if classes is None:
        # Legacy behaviour: derive the column order from the observed labels.
        column_labels = np.asarray(labels)
    else:
        column_labels = np.asarray(classes)
        if y_prob.ndim != 2 or y_prob.shape[1] != len(column_labels):
            raise ValueError(
                f"y_prob has shape {y_prob.shape} but {len(column_labels)} classes were given"
            )

    illicit_index = np.where(column_labels == 1)[0]
    if illicit_index.size == 0:
        # No probability column for label 1: nothing to score.
        return

    is_illicit = y_true == 1
    if is_illicit.all() or not is_illicit.any():
        # Ranking metrics are undefined when only one class is present.
        print("ROC-AUC / PR-AUC skipped: y_true contains a single class.")
        return

    prob_illicit = y_prob[:, illicit_index[0]]
    roc_auc = roc_auc_score(is_illicit, prob_illicit)
    pr_auc = average_precision_score(is_illicit, prob_illicit)
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"PR-AUC : {pr_auc:.4f}")

val_pred = clf.predict(X_val)
test_pred = clf.predict(X_test)
val_prob = clf.predict_proba(X_val)
test_prob = clf.predict_proba(X_test)

# clf.classes_ gives the label of each predict_proba column, so the illicit
# column is selected by label rather than by position in the evaluated data.
print("\n=== VALIDATION ===")
safe_report(y_val, val_pred, val_prob, classes=clf.classes_)
print("\n=== TEST ===")
safe_report(y_test, test_pred, test_prob, classes=clf.classes_)


=== VALIDATION ===
              precision    recall  f1-score   support

     Illicit     0.9937    0.8636    0.9241       909
       Licit     0.9855    0.9994    0.9924      8404

    accuracy                         0.9861      9313
   macro avg     0.9896    0.9315    0.9582      9313
weighted avg     0.9863    0.9861    0.9857      9313

ROC-AUC: 0.9948
PR-AUC : 0.9735

=== TEST ===
              precision    recall  f1-score   support

     Illicit     0.9962    0.8713    0.9296       909
       Licit     0.9863    0.9996    0.9929      8404

    accuracy                         0.9871      9313
   macro avg     0.9912    0.9355    0.9612      9313
weighted avg     0.9872    0.9871    0.9867      9313

ROC-AUC: 0.9950
PR-AUC : 0.9781
