<a href="https://colab.research.google.com/github/BayramovaNazrin/Elliptic2-ML/blob/main/02_node_embeddings_to_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from google.colab import drive
import os
import ast # Used for safely converting string representation of a list to an actual list

# --- Configuration ---
# Input CSVs: all three live in the same project folder under the Drive mount.
EMBEDDINGS_FILE: str = '/content/drive/MyDrive/elliptic2/node_embeddings.csv'
NODES_MAPPING_FILE: str = '/content/drive/MyDrive/elliptic2/nodes.csv'
COMPONENTS_LABELS_FILE: str = '/content/drive/MyDrive/elliptic2/connected_components.csv'

# Destination of the flattened dataset (note: outside the Drive mount).
OUTPUT_FILE: str = '/content/node_dataset.csv'

# Name of the label column in the output, and the number of emb_<i> feature
# columns each embedding vector is expanded into (assumed, not read from data).
LABEL_COLUMN_NAME: str = 'label'
EMBEDDING_DIM: int = 64

# --- 1. Load the Node2Vec embeddings ---
print("1. Loading Node2Vec embeddings from temporary storage...")

# Every input path points below the Drive mount, so Drive has to be mounted
# *before* the first read. Mounting only after a failed read and then exiting
# (as was done previously) could never make the file available.
_DRIVE_MOUNT_POINT = '/content/drive'
if not os.path.isdir(os.path.join(_DRIVE_MOUNT_POINT, 'MyDrive')):
    drive.mount(_DRIVE_MOUNT_POINT)

try:
    df_embeddings = pd.read_csv(EMBEDDINGS_FILE)
except FileNotFoundError as err:
    print(f"Error: {EMBEDDINGS_FILE} not found. Please run the previous script first.")
    # Raise SystemExit explicitly: the interactive `exit()` helper is meant for
    # the REPL and is not guaranteed to stop the running cell/script here,
    # which would let later steps fail with a confusing NameError.
    raise SystemExit(1) from err
else:
    # The merges below key on 'clId', so normalise the id column name here.
    df_embeddings = df_embeddings.rename(columns={'node_id': 'clId'})
    print(f"Loaded {len(df_embeddings)} embeddings.")

# --- 2. Load the nodes mapping (clId -> ccId) ---
print("2. Loading node mapping (clId -> subgraph_id) from Drive...")
try:
    # Only two columns are needed: 'clId' (the node id) and 'ccId' (the id of
    # the subgraph the node belongs to; the file names this column 'ccId').
    df_nodes = pd.read_csv(NODES_MAPPING_FILE, usecols=['clId', 'ccId'])
except FileNotFoundError as err:
    print(f"Error: {NODES_MAPPING_FILE} not found. Please check the path.")
    # Raise SystemExit explicitly: the interactive `exit()` helper is not
    # guaranteed to stop the running cell/script at this point.
    raise SystemExit(1) from err
else:
    print(f"Loaded {len(df_nodes)} node mappings.")

# --- 3. Load the connected components labels (ccId -> ccLabel) ---
print("3. Loading connected components labels (subgraph_id -> label) from Drive...")
try:
    # Only two columns are needed: 'ccId' (the subgraph id, shared with the
    # node mapping) and 'ccLabel' (that subgraph's label).
    df_labels = pd.read_csv(COMPONENTS_LABELS_FILE, usecols=['ccId', 'ccLabel'])
except FileNotFoundError as err:
    print(f"Error: {COMPONENTS_LABELS_FILE} not found. Please check the path.")
    # Raise SystemExit explicitly: the interactive `exit()` helper is not
    # guaranteed to stop the running cell/script at this point.
    raise SystemExit(1) from err
else:
    print(f"Loaded {len(df_labels)} component labels.")

# --- 4. Merge the datasets ---
print("\n4. Merging datasets...")

# Step 1: attach each embedded node's subgraph id via the shared 'clId' key.
# A left join keeps every embedding row even if the node is absent from the
# mapping (its 'ccId' is then missing).
df_merged = df_embeddings.merge(df_nodes, how='left', on='clId')
print(f"Nodes after first merge: {len(df_merged)}")

# Step 2: attach the subgraph label via the shared 'ccId' key, again as a
# left join so no node row is lost at this stage.
df_final = df_merged.merge(df_labels, how='left', on='ccId')
print(f"Nodes after final merge: {len(df_final)}")

# The inputs and the intermediate result are no longer needed; drop the names
# so the frames can be garbage-collected.
del df_merged, df_labels, df_nodes, df_embeddings

# --- 5. Remove nodes with unknown labels (keeping only 'licit' and 'illicit') ---
print("5. Filtering out nodes with 'unknown' labels...")

# Rows whose label is anything other than these two values are discarded,
# including rows whose label is missing after the left joins.
# NOTE(review): confirm that 'licit'/'illicit' is the exact vocabulary used in
# connected_components.csv. The published Elliptic2 data appears to name the
# positive class 'suspicious' rather than 'illicit' -- if so, this filter
# silently removes that whole class. Verify against the CSV.
_keep_mask = df_final['ccLabel'].isin(['licit', 'illicit'])

# Report what is being thrown away so a vocabulary mismatch shows up in the log.
_dropped_counts = df_final.loc[~_keep_mask, 'ccLabel'].value_counts(dropna=False)
if not _dropped_counts.empty:
    print(f"Dropping {int(_dropped_counts.sum())} nodes with labels: {_dropped_counts.to_dict()}")

# Take an explicit copy: the filtered frame is modified by later steps, and
# mutating a boolean-indexed selection triggers pandas' SettingWithCopyWarning.
df_final = df_final.loc[_keep_mask].copy()
print(f"Remaining nodes for classification: {len(df_final)}")

# Rename the label column to the standardized name.
df_final = df_final.rename(columns={'ccLabel': LABEL_COLUMN_NAME})

# --- 6. Convert the 'embedding' column and expand into separate columns ---
print("\n6. Converting string embeddings to feature columns...")

# A. Parse each stringified list into a real Python list. ast.literal_eval only
# evaluates literals, so it does not execute code found in the file. The parsed
# values are kept in their own Series instead of being written back into
# df_final: the 'embedding' column is dropped below anyway, and this avoids
# assigning into a frame that was produced by filtering.
_parsed_embeddings = df_final['embedding'].apply(ast.literal_eval)

# B. Check the vector lengths up front. Without this, vectors shorter than
# EMBEDDING_DIM would be padded with NaN silently, and longer ones would fail
# with a column-count error that does not say which assumption was wrong.
_vector_lengths = _parsed_embeddings.map(len)
_bad_lengths = _vector_lengths[_vector_lengths != EMBEDDING_DIM]
if not _bad_lengths.empty:
    raise ValueError(
        f"Expected {EMBEDDING_DIM}-dimensional embeddings, but {len(_bad_lengths)} "
        f"row(s) have other lengths: {sorted(_bad_lengths.unique().tolist())}"
    )

# C. Expand the vectors into one emb_<i> column per dimension. Reusing
# df_final's index keeps the new frame row-aligned for the concat below.
df_emb_expanded = pd.DataFrame(
    _parsed_embeddings.to_list(),
    index=df_final.index,
    columns=[f'emb_{i}' for i in range(EMBEDDING_DIM)],
)

print(f"Expanded embedding dimension: {df_emb_expanded.shape[1]}")

# D. Replace the raw 'embedding' column (and the no-longer-needed 'ccId') with
# the expanded feature columns.
df_final = pd.concat([df_final.drop(columns=['embedding', 'ccId']), df_emb_expanded], axis=1)

# --- 7. Save the resulting dataset ---
print("\n7. Saving the final dataset to CSV...")

# Final column layout: node id first, then emb_0 .. emb_<EMBEDDING_DIM - 1> in
# order, and the label as the last column.
_feature_columns = [f'emb_{i}' for i in range(EMBEDDING_DIM)]
column_order = ['clId', *_feature_columns, LABEL_COLUMN_NAME]
df_final = df_final[column_order]

# Write the dataset without the DataFrame index so the CSV holds only the
# columns listed above.
df_final.to_csv(OUTPUT_FILE, index=False)

print(f"Final dataset shape: {df_final.shape}")
print(f"Successfully saved the final dataset to {OUTPUT_FILE}.")
print("\nScript finished successfully! The data is now ready for machine learning.")

1. Loading Node2Vec embeddings from temporary storage...
Loaded 716 embeddings.
2. Loading node mapping (clId -> subgraph_id) from Drive...
Loaded 444521 node mappings.
3. Loading connected components labels (subgraph_id -> label) from Drive...
Loaded 121810 component labels.

4. Merging datasets...
Nodes after first merge: 716
Nodes after final merge: 716
5. Filtering out nodes with 'unknown' labels...
Remaining nodes for classification: 696

6. Converting string embeddings to feature columns...
Expanded embedding dimension: 64

7. Saving the final dataset to CSV...
Final dataset shape: (696, 66)
Successfully saved the final dataset to /content/node_dataset.csv.

Script finished successfully! The data is now ready for machine learning.
