# Semantic vector search for C code projects using the microsoft/UniXcoder tokenizer

1. Set up environment variables

In [1]:
# Your Proxy settings

2. Load Model to GPU (if available)

In [2]:
import torch
import torch.cuda
import torch.mps

from unixcoder import UniXcoder

# Pick the compute backend in order of preference: CUDA GPU, Apple Metal (MPS), CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    # torch.backends.mps.is_available() is the documented availability check;
    # torch.mps.is_available() may be missing on older PyTorch releases.
    device = "mps"
else:
    device = "cpu"

print(f"Model used is: {device}")

# Load the pretrained UniXcoder checkpoint and move its weights to the chosen device.
model = UniXcoder("microsoft/unixcoder-base-nine")
model.to(device)

# Switch to evaluation mode so the Dropout layers are inactive during inference.
# eval() returns the module itself, so the cell still displays the model structure.
model.eval()



Model used is: mps


UniXcoder(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

3. Get C functions from project

In [3]:
from main import get_project_functions
from pathlib import Path

# Directory holding the C sources to index, resolved against the current
# working directory of the notebook.
project_root_dir = Path.cwd() / "content" / "neovim" / "src"

# Stays an empty list when the source directory is not present.
project_functions = []

if project_root_dir.exists():
    project_functions = get_project_functions(project_root_dir)

Found 8000 functions in project directory /Users/carlos/Desktop/Workspace/python/vector_search_c/content/neovim/src


4. Tokenize project functions 

In [4]:
# Tokenize every collected function in encoder-only mode. Sequences are capped
# at 512 tokens and padded (presumably so every row has the same length and a
# slice of rows can later be turned into a tensor -- TODO confirm against
# UniXcoder.tokenize).
tokens_ids = model.tokenize(
    project_functions,
    max_length=512,
    mode="<encoder-only>",
    padding=True,
)

# Sanity check: number of tokenized entries produced.
print(len(tokens_ids))

8000


5. Create embeddings for the functions. This is done by creating tensors on the chosen device. Doing so can quickly consume a lot of device memory, so the tensors are processed in batches.

In [10]:
import math

def get_batch(my_list, batch_size):
    """Lazily yield consecutive slices of ``my_list`` of length ``batch_size``.

    The final slice is shorter when ``len(my_list)`` is not a multiple of
    ``batch_size``. An empty ``my_list`` yields nothing.
    """
    yield from (
        my_list[start:start + batch_size]
        for start in range(0, len(my_list), batch_size)
    )

BATCH_SIZE = 32

# Loop-invariant: total number of batches, used only in the progress message.
total_batches = math.ceil(len(tokens_ids) / BATCH_SIZE)

# Function-level embeddings of each processed batch, kept on the CPU so that
# device memory only ever holds a single batch.
function_embedding_batches = []

# Inference only: without no_grad() every forward pass would also record the
# autograd graph, which needlessly increases device memory usage.
with torch.no_grad():
    for count, token_batch in enumerate(get_batch(tokens_ids, BATCH_SIZE), start=1):

        # Release cached device memory left over from the previous batch.
        if device == "cuda":
            torch.cuda.empty_cache()
        elif device == "mps":
            torch.mps.empty_cache()

        print(f"Batch {count} of {total_batches}")

        batch_token_tensor = torch.tensor(token_batch).to(device)

        # The model returns per-token embeddings and one embedding per function
        # (presumably shaped (batch, hidden) -- TODO confirm); only the latter
        # is needed for search.
        batch_token_embeddings, batch_function_embeddings = model(batch_token_tensor)

        # Copy the result off the device before dropping the device tensors.
        function_embedding_batches.append(batch_function_embeddings.cpu())

        del batch_token_tensor
        del batch_token_embeddings
        del batch_function_embeddings

# Concatenate all batches along the first dimension (one row per function);
# fall back to an empty tensor when there was nothing to embed.
function_embeddings = (
    torch.cat(function_embedding_batches)
    if function_embedding_batches
    else torch.empty(0)
)

print("Done!")


Batch 1 of 250
Batch 2 of 250
Batch 3 of 250
Batch 4 of 250
Batch 5 of 250
Batch 6 of 250
Batch 7 of 250
Batch 8 of 250
Batch 9 of 250
Batch 10 of 250
Batch 11 of 250
Batch 12 of 250
Batch 13 of 250
Batch 14 of 250
Batch 15 of 250
Batch 16 of 250
Batch 17 of 250
Batch 18 of 250
Batch 19 of 250
Batch 20 of 250
Batch 21 of 250


KeyboardInterrupt: 