# Semantic vector search for C code projects using the microsoft/UniXcoder tokenizer

1. Setup environment variables

In [2]:
# Your Proxy settings

2. Load Model to GPU (if available)

In [None]:
import torch
import torch.cuda
import torch.mps

from unixcoder import UniXcoder

# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
# torch.backends.mps.is_available() is the documented MPS check and is also present
# on older PyTorch releases that do not expose torch.mps.is_available().
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Device used is: {device}")

# Load the pretrained UniXcoder checkpoint and move its weights to the selected device.
model = UniXcoder("microsoft/unixcoder-base-nine")
model.to(device)

3. Get C functions from project

In [29]:
from main import get_project_functions
from pathlib import Path

# Root directory of the C project to index (replace the placeholder with a real path).
project_root_dir = Path("your path")

# Start from an empty list so the name is always defined, even when the path is missing.
project_functions = []

if not project_root_dir.exists():
  print("No project found")
else:
  project_functions = get_project_functions(project_root_dir)

4. Tokenize project functions 

In [30]:
# Convert every function into a list of token ids, padded and capped at 512 ids each.
tokens_ids = model.tokenize(
  project_functions,
  mode="<encoder-only>",
  max_length=512,
  padding=True,
)

# Number of tokenized functions.
print(len(tokens_ids))

5. Create embeddings for the functions. This is done by creating tensors on the chosen device. This can quickly consume a lot of device memory, so the tensors are processed in batches.

In [31]:
import torch.nn.functional as F

def get_batch(my_list, batch_size):
  """Lazily yield consecutive slices of `my_list`, each at most `batch_size` long.

  The final slice is shorter when the length is not a multiple of `batch_size`.
  """
  batch_starts = range(0, len(my_list), batch_size)
  yield from (my_list[start:start + batch_size] for start in batch_starts)

# Number of functions sent through the model per forward pass.
BATCH_SIZE = 32

# (function source, L2-normalized embedding) pairs, in the same order as project_functions.
function_embeddings = []

# Inference only: no_grad stops PyTorch from recording the autograd graph, so the
# activations of each forward pass are not kept for a backward pass that never runs.
with torch.no_grad():
  # Walk the source texts and their token ids in lock-step so each embedding is
  # paired with the function it was computed from.
  for function_batch, token_batch in zip(
    get_batch(project_functions, BATCH_SIZE),
    get_batch(tokens_ids, BATCH_SIZE),
  ):
    batch_token_tensor = torch.tensor(token_batch).to(device)

    # The model returns two tensors; only the second one is indexed.
    _, batch_function_embeddings = model(batch_token_tensor)

    # Normalize tensor with L2 norm
    batch_function_embeddings = F.normalize(batch_function_embeddings, p=2, dim=1)

    # Create embedding tensor list for FAISS indexing
    function_embeddings.extend(zip(function_batch, batch_function_embeddings.tolist()))

# Show one sample pair; guard against an empty project so the cell does not raise IndexError.
if function_embeddings:
  print(function_embeddings[0])
else:
  print("No function embeddings were created")


6. Configure FAISS vector store

In [32]:
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.embeddings import Embeddings
from typing import cast

class MyEmbeddings(Embeddings):
  """LangChain `Embeddings` adapter that embeds text with the notebook's `model`.

  Relies on the notebook-level names `model`, `device`, `get_batch` and
  `BATCH_SIZE` defined in earlier cells.
  """

  def embed_documents(self, texts: list[str]) -> list[list[float]]:
    """Return one L2-normalized embedding (as a list of floats) per input text.

    Texts are tokenized with a 512-id limit and embedded in batches of
    `BATCH_SIZE`. An empty `texts` list yields an empty result.
    """
    tokens_ids = model.tokenize(texts, max_length=512, mode="<encoder-only>", padding=True)

    text_embeddings: list[list[float]] = []

    # Inference only: skip autograd bookkeeping to keep device memory usage down.
    with torch.no_grad():
      for token_batch in get_batch(tokens_ids, BATCH_SIZE):
        batch_token_tensor = torch.tensor(token_batch).to(device)

        # The model returns two tensors; only the second one is used here.
        _, batch_text_embeddings = model(batch_token_tensor)

        # Normalize tensor with L2 norm
        batch_text_embeddings = F.normalize(batch_text_embeddings, p=2, dim=1)

        # Create embedding tensor list for FAISS indexing
        text_embeddings.extend(cast(list[list[float]], batch_text_embeddings.tolist()))

    return text_embeddings

  def embed_query(self, text: str) -> list[float]:
    """Return the embedding of a single query string.

    Delegates to `embed_documents` so queries and documents are embedded identically.
    """
    return self.embed_documents([text])[0]


# Build the FAISS store from the precomputed (text, embedding) pairs, with
# MyEmbeddings() as the store's embedding object and inner product as the metric.
vector_store = FAISS.from_embeddings(
  text_embeddings=function_embeddings,
  embedding=MyEmbeddings(),
  distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

7. Query Index

In [None]:
# Number of results to retrieve and the natural-language (or code) query to search for.
top_k = 5
query = "your query"

query_results = vector_store.similarity_search_with_relevance_scores(query=query, k=top_k)

print(len(query_results))

print("Query:", query)
print(f"Top {top_k} most similar functions in project:")
# Results are walked in the reverse of the order the store returned them.
for document, score in reversed(query_results):
  print(f"(Score: {score:.4f})\n", document)
  print("==============")