### Step 1: Model Selection for Embeddings

In [None]:

from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint identifiers for the two pretrained encoders.
SOURCE_CODE_CHECKPOINT = "microsoft/codebert-base"
BYTECODE_CHECKPOINT = "distilbert-base-uncased"

# Encoder + tokenizer pair used for the source-code text.
source_code_model = AutoModel.from_pretrained(SOURCE_CODE_CHECKPOINT)
source_code_tokenizer = AutoTokenizer.from_pretrained(SOURCE_CODE_CHECKPOINT)

# Encoder + tokenizer pair used for the bytecode text.
bytecode_model = AutoModel.from_pretrained(BYTECODE_CHECKPOINT)
bytecode_tokenizer = AutoTokenizer.from_pretrained(BYTECODE_CHECKPOINT)

def get_embeddings(text, model, tokenizer):
    """Return mean-pooled embeddings of ``text`` from ``model``.

    The text is tokenized (truncated to at most 512 tokens, padded to the
    longest sequence in the call), passed through the model without gradient
    tracking, and the final hidden states are averaged over the token axis.

    Padding positions are excluded from the average by weighting with the
    tokenizer's attention mask, so batched inputs of different lengths are
    not biased by pad tokens. For a single string (no padding) the result
    equals a plain mean over all tokens.

    Args:
        text: A string or list of strings accepted by ``tokenizer``.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing the model inputs as a mapping.

    Returns:
        A tensor with the token axis (dim 1) of ``last_hidden_state``
        averaged out, i.e. one vector per input sequence.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    mask = inputs.get("attention_mask")
    if mask is None:
        # No mask available: fall back to averaging every position.
        return hidden.mean(dim=1)

    # Broadcast the mask over the hidden dimension and zero out padding.
    weights = mask.unsqueeze(-1).to(hidden.dtype)
    # Guard against division by zero for a fully masked sequence.
    token_counts = weights.sum(dim=1).clamp(min=1)
    return (hidden * weights).sum(dim=1) / token_counts
    

### Step 2: Generate Embeddings

In [None]:

# Materialize the training split as a pandas DataFrame.
train_df = train_set.to_pandas()


def _numpy_embedder(model, tokenizer):
    """Build a one-argument function mapping text to a NumPy embedding."""
    def _embed(text):
        return get_embeddings(text, model, tokenizer).numpy()
    return _embed


# Embed every source-code entry with the source-code encoder.
_embed_source_code = _numpy_embedder(source_code_model, source_code_tokenizer)
train_df['source_code_embedding'] = train_df['source_code'].apply(_embed_source_code)

# Embed every bytecode entry with the bytecode encoder.
_embed_bytecode = _numpy_embedder(bytecode_model, bytecode_tokenizer)
train_df['bytecode_embedding'] = train_df['bytecode'].apply(_embed_bytecode)


### Step 3: Clustering

In [None]:

from sklearn.cluster import KMeans
import numpy as np

# Combine embeddings for clustering.
# Each stored embedding comes from get_embeddings(...).numpy(), which keeps a
# leading batch axis (shape (1, hidden)). Concatenating those along axis 0
# would yield two rows per sample and a label vector twice as long as
# train_df, so flatten each embedding first and join them into one feature
# vector per row.
embeddings = np.vstack([
    np.concatenate([np.ravel(source_vec), np.ravel(bytecode_vec)])
    for source_vec, bytecode_vec in zip(
        train_df['source_code_embedding'], train_df['bytecode_embedding']
    )
])

# Perform clustering: one row of `embeddings` per row of train_df, so the
# resulting labels align with the DataFrame index.
kmeans = KMeans(n_clusters=5, random_state=0).fit(embeddings)
train_df['cluster'] = kmeans.labels_

# Display cluster assignments
print(train_df[['address', 'cluster']])
