In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import json
import os

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Load the CodeBERT tokenizer and encoder from the same checkpoint id,
# then move the encoder's weights onto the selected device.
CHECKPOINT = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModel.from_pretrained(CHECKPOINT)
model = model.to(device)

In [None]:
def generate_embedding(code):
    """Embed a piece of source code with the notebook's loaded model.

    The text is tokenized with the module-level ``tokenizer`` (inputs longer
    than 512 tokens are truncated), run through the module-level ``model``
    with gradient tracking disabled, and the final hidden states are averaged
    over dim 1.

    Args:
        code: Source text to embed.

    Returns:
        The pooled embedding as a NumPy array on the CPU, with all size-1
        dimensions squeezed away.

    NOTE(review): assumes ``last_hidden_state`` is laid out as
    (batch, tokens, hidden), so a single input string yields a 1-D vector --
    confirm before passing batches.
    """
    encoded = tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().cpu().numpy()

# Example: embed a tiny snippet as a smoke test of the tokenizer/model pipeline.
code = "def hello(): print('Hello')"
emb = generate_embedding(code)
print(emb.shape)

# Fail loudly on Restart & Run All if pooling/squeezing ever stops producing
# one vector whose length equals the encoder's configured hidden size.
assert emb.shape == (model.config.hidden_size,), f"unexpected embedding shape: {emb.shape}"

In [None]:
# Load code files
from src.extraction.load_code import load_code_files
from src.extraction.clean_code import clean_files

# Read the raw files and run them through the project's cleaning step.
files = load_code_files("../data/raw/")
cleaned = clean_files(files)

# Embed every cleaned file. A comprehension keeps the iteration variables
# local, so the notebook-level `code` / `emb` from the example cell are not
# silently overwritten (which would make results depend on cell run order).
# NOTE(review): presumably `cleaned` maps file name -> source text; the keys
# must be valid JSON object keys -- verify against clean_files.
embeddings = {
    file_name: generate_embedding(source).tolist()  # ndarray -> list for JSON
    for file_name, source in cleaned.items()
}

# Save: build the output path once so the directory that is created and the
# file that is written cannot drift apart.
output_dir = "../data/embeddings/"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "codebert_embeddings.json")
# Explicit encoding keeps the written file independent of the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(embeddings, f)