# Generate Graph Embeddings (Node2Vec)

This notebook generates Node2Vec embeddings for the Elliptic++ dataset.

**Steps:**
1. Load Elliptic++ dataset with temporal splits
2. Generate Node2Vec embeddings per split (no leakage)
3. Save embeddings with txId mapping

**Output:** `data/embeddings.parquet`

In [None]:
import sys
sys.path.append('..')

import torch
import pandas as pd
import yaml
from pathlib import Path

from src.data.elliptic_loader import EllipticDataset
from src.data.verify_dataset import verify_dataset
from src.embeddings.node2vec import generate_node2vec_embeddings
from src.utils.seed import set_all_seeds

In [None]:
# Read the experiment settings for this notebook from the YAML config file
with open('../configs/embed_node2vec.yaml', mode='r') as config_file:
    config = yaml.safe_load(config_file)

# Apply the configured seed before any embedding work starts
set_all_seeds(config['seed'])

print(f"Config loaded: {config['experiment']}")

In [None]:
# Run the dataset verifier against the configured data root
data_root = Path(config['data']['root'])
success, messages = verify_dataset(data_root)

# Echo every message the verifier reported, one per line
for line in messages:
    print(line)

# Abort the notebook here when verification did not succeed
if not success:
    raise FileNotFoundError("Dataset incomplete - please provide all required files")

In [None]:
# Build the dataset object from the data root that was just verified
dataset = EllipticDataset(
    data_root,
    use_local_only=True,
)

# Report how many rows the loaded feature table holds
print(f"Dataset loaded with {len(dataset.features_df)} nodes")

In [None]:
# Generate embeddings (split-aware to prevent leakage)
#
# generate_node2vec_embeddings is called once per split with the edge index
# returned by dataset.get_split_data for that split, and the per-split results
# are stacked row-wise in train -> val -> test order.
n2v_config = config['node2vec']
emb_columns = [f'emb_{i}' for i in range(n2v_config['embedding_dim'])]

# One DataFrame of embeddings per split, kept in loop order
all_embeddings = []

for split_name in ['train', 'val', 'test']:
    print(f"\n=== Generating embeddings for {split_name} ===")

    features, labels, edge_index_split = dataset.get_split_data(split_name)

    # Convert to torch (int64 indices)
    # NOTE(review): assumes edge_index_split is a NumPy array whose node ids
    # are local to this split (0 .. len(features) - 1) - confirm in the loader.
    edge_index_t = torch.from_numpy(edge_index_split).long()

    # Generate embeddings
    embeddings = generate_node2vec_embeddings(
        edge_index=edge_index_t,
        num_nodes=len(features),
        embedding_dim=n2v_config['embedding_dim'],
        walk_length=n2v_config['walk_length'],
        context_size=n2v_config['context_size'],
        walks_per_node=n2v_config['walks_per_node'],
        p=n2v_config['p'],
        q=n2v_config['q'],
        epochs=config['training']['epochs'],
        device=config['device']
    )

    # pd.concat only accepts Series/DataFrame objects, so tensor or array
    # output has to be wrapped first; DataFrame output is passed through as-is.
    # NOTE(review): the return type of generate_node2vec_embeddings is not
    # visible in this notebook - presumably a (num_nodes, embedding_dim)
    # tensor or array; confirm.
    if not isinstance(embeddings, pd.DataFrame):
        if torch.is_tensor(embeddings):
            # Drop autograd state and move to host memory before converting
            embeddings = embeddings.detach().cpu().numpy()
        embeddings = pd.DataFrame(embeddings, columns=emb_columns)

    all_embeddings.append(embeddings)

# Concatenate all splits (rows are renumbered 0..N-1)
full_embeddings = pd.concat(all_embeddings, ignore_index=True)

In [None]:
# Save embeddings with txId
embeddings_df = pd.DataFrame(
    full_embeddings,
    columns=[f'emb_{i}' for i in range(n2v_config['embedding_dim'])]
)

# NOTE(review): the embedding rows were stacked in train -> val -> test order,
# while txId is taken here in dataset.features_df row order. The mapping is
# only correct if those two orders agree - confirm against the loader.
tx_ids = dataset.features_df['txId']
if len(tx_ids) != len(embeddings_df):
    raise ValueError(
        f"Cannot attach txId: {len(embeddings_df)} embedding rows "
        f"but {len(tx_ids)} rows in dataset.features_df"
    )

# Attach by position. Passing the Series itself would align on index labels
# and silently fill NaN wherever the two indexes differ.
embeddings_df.insert(0, 'txId', tx_ids.to_numpy())

# Create the target directory if needed, then write the table without the index
output_path = Path(config['output']['save_path'])
output_path.parent.mkdir(parents=True, exist_ok=True)
embeddings_df.to_parquet(output_path, index=False)

print(f"\n✅ Embeddings saved to {output_path}")
print(f"   Shape: {embeddings_df.shape}")