In [2]:
# setting up the enviornment and random seed for reproducibility
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import math
import string
# !pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util
import torch
import pickle

# set random seed
import random
random.seed(42)
np.random.seed(42)

torch.use_deterministic_algorithms(True)
torch.cuda.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.manual_seed(42)

<torch._C.Generator at 0x7f729405fe30>

In [3]:
# # check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
query = "priority queue"

# Load in the CodeSearchNet Python sub-dataset

In [5]:
dataset_py = load_dataset("code_search_net", "python")
train_py = pd.DataFrame(dataset_py['train'])

ground_truth = pd.read_csv('annotationStore.csv')
gt_py = ground_truth[ground_truth['Language'] == 'Python']

merged_py = gt_py.merge(train_py, left_on = 'GitHubUrl', right_on = 'func_code_url')
merged_py.shape

  table = cls._concat_blocks(blocks, axis=0)


(990, 16)

# Indentifier + Doc embedding with all-MiniLM

- The natural language part was designed as a combination of the function identifier and docstrings

In [6]:
model_doc = SentenceTransformer('all-MiniLM-L6-v2')
sample_doc = train_py['func_name'] + " " + train_py['func_documentation_string']

**Embed the documentation part and save it as 'doc_emb.pt' for easier future access**

In [7]:
# doc_emb = model_doc.encode(sample_doc, convert_to_tensor=True)
# torch.save(doc_emb, 'doc_emb.pt')

**Loading in the saved embedding**

In [8]:
doc_emb = torch.load('doc_emb.pt')

In [19]:
## calculate similarity scores between query embedding and doc
## embedding, adding weight into the final score for combination purpose.
def weighted_doc(query, k, weight, param=3):
    que_doc_emb = model_doc.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(que_doc_emb, doc_emb)[0]
    if param*k > len(hits):
        top_hits = hits
        num_out = len(hits)
    else:
        top_hits = hits[:param*k]
        num_out = param*k

    rank = 1
    que_doc_df = pd.DataFrame()
    que_doc_score = []
    que_doc_url = []
    que_doc_rank = []
    for top_hit in top_hits:
        que_doc_score.append(weight*float(top_hit['score']))
        que_doc_url.append(train_py.iloc[top_hit['corpus_id']]['func_code_url'])
        que_doc_rank.append(rank)
        rank += 1
    que_doc_df['score'] = que_doc_score
    que_doc_df['url'] = que_doc_url
    return que_doc_df

In [10]:
weighted_doc(query, 5, 1)

Unnamed: 0,score,url
0,0.763506,https://github.com/flaviogrossi/sockjs-cyclone...
1,0.745125,https://github.com/Jaymon/prom/blob/b7ad2c259e...
2,0.736036,https://github.com/Jaymon/prom/blob/b7ad2c259e...
3,0.731232,https://github.com/keon/algorithms/blob/4d6569...
4,0.718924,https://github.com/SKA-ScienceDataProcessor/in...
5,0.71555,https://github.com/Erotemic/utool/blob/3b27e1f...
6,0.703601,https://github.com/SKA-ScienceDataProcessor/in...
7,0.702734,https://github.com/nerdvegas/rez/blob/1d3b846d...
8,0.699726,https://github.com/pinax/django-mailer/blob/12...
9,0.698709,https://github.com/limpyd/redis-limpyd-jobs/bl...


# Code embedding with GraphCodeBERT

Sources:
- https://github.com/microsoft/CodeBERT/tree/master/GraphCodeBERT/codesearch
- https://openreview.net/pdf?id=jLoC4ez43PZ (GRAPHCODEBERT: PRE-TRAINING CODE REPRESENTATIONS WITH DATA FLOW)

In [11]:
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn

In [20]:
# Load the data
train_data = train_py
sample_code = list(train_data['whole_func_string'])
sample_url = train_data['func_code_url']
sample_document = list(train_data['func_documentation_string'])


### ##________The GraphCodeBERT embedding process. Skip to load trained embeddings directionly______##

Clean the code string following the training idea of GraphCodeBERT:
- Remove all the docstrings and comments in the code string

In [12]:
## perform clean code, removing documentation and comments
cleaned_codes = []
for codestr in sample_code:
    pattern_doc = r'\"\"\"(.*?)\"\"\"'
    pattern_comm = r'#[^\n]*'
    clean_code = re.sub(pattern_doc, '', codestr, flags=re.DOTALL)
    clean_code = re.sub(pattern_comm, '', clean_code, flags=re.DOTALL)

    cleaned_codes.append(clean_code)

In [None]:
# Load the tokenizer and model
tokenizer_code = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model_code = RobertaModel.from_pretrained("demo/python_model").to(device)

# Load the data
train_data = train_py
sample_code = cleaned_codes

# Tokenize and encode the query
encoded_query = tokenizer_code(query, return_tensors='pt', truncation=True, max_length=512).to(device)
with torch.no_grad():
    query_vec = model_code(**encoded_query)[1]  # Get the pooled output

code_vecs = []
codes = []

# Tokenize and encode the code snippets
for code in sample_code:
    encoded_code = tokenizer_code(code, return_tensors='pt', truncation=True, max_length=512).to(device)
    with torch.no_grad():
        cur_code_vec = model_code(**encoded_code)[1]  # Get the pooled output
    code_vecs.append(cur_code_vec)
    codes.append(code)

# Concatenate the code vectors and move to the same device as the query vector
code_vecs = torch.cat(code_vecs).to(device)


Store the embedding into `clean_code_emb.pt` for faster future access

In [None]:
torch.save(code_vecs, 'clean_code_emb.pt')

###  ## ______________ Load in the embedded code matrix_________##

In [17]:
py_emb_mat = torch.load('clean_code_emb.pt')

In [18]:
py_emb_mat

tensor([[-0.3143, -0.0544, -0.4870,  ..., -0.8017,  0.0938, -0.1380],
        [ 0.2373, -0.2394, -0.3631,  ..., -0.1189,  0.1311, -0.5173],
        [ 0.2249, -0.2622, -0.0532,  ..., -0.1795,  0.2224, -0.4343],
        ...,
        [-0.5379,  0.2046, -0.6276,  ..., -0.7098, -0.0771, -0.4519],
        [-0.0073, -0.1181, -0.7433,  ..., -0.4311,  0.3418, -0.2338],
        [ 0.0192, -0.2119, -0.8511,  ..., -0.4541,  0.5451, -0.4459]],
       device='cuda:0')

In [21]:
# Load the tokenizer and model
tokenizer_code = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model_code = RobertaModel.from_pretrained("python_model").to(device)

In [23]:
## Calculate the similarity score (cosine sim) between query embedding
## and code embeddings. Add weight into the score for combining with
## doc score.

def weighted_code(query, emb_mat, k, weight, param=3):
    """ 
    Inputs: natural language query, the stored embedded matrix for training code.
    Outputs: a dataframe with the top-k matches from the training code 
    
    """
    # Tokenize and encode the query
    encoded_query = tokenizer_code(query, return_tensors='pt', truncation=True, max_length=512).to(device)
    with torch.no_grad():
        query_vec = model_code(**encoded_query)[1] # Get the pooled output
    
    code_vecs = emb_mat
    
    # Calculate the cosine similarities
    query_vec - query_vec.view(1,-1).expand_as(code_vecs)
    scores = nn.functional.cosine_similarity(query_vec, code_vecs, dim=1)
    
    
    # Get the top 5 scores and their indices
    top_scores, top_indices = torch.topk(scores, param*k, largest=True)
    
    
    # Retrieve the top 5 most relevant code snippets using the indices
    top_code_snippets = [sample_url[index] for index in top_indices.cpu().numpy()]
    
    rank = 1
    que_code_df = pd.DataFrame()
    que_code_score = []
    que_code_url = []
    que_code_rank = []
    
    for score,url in zip(top_scores, top_code_snippets):
        que_code_score.append(weight*float(score.cpu()))
        que_code_url.append(url)
        que_code_rank.append(rank)
        rank += 1
    
    que_code_df['score'] = que_code_score
    que_code_df['url'] = que_code_url
    
    return que_code_df

In [24]:
weighted_code(query, py_emb_mat, 5, 1)

Unnamed: 0,score,url
0,0.765143,https://github.com/hubo1016/vlcp/blob/23905522...
1,0.738386,https://github.com/fchauvel/MAD/blob/806d51748...
2,0.723233,https://github.com/rackerlabs/rackspace-python...
3,0.722229,https://github.com/rfk/threading2/blob/7ec234d...
4,0.713473,https://github.com/calston/rhumba/blob/05e3cbf...
5,0.705459,https://github.com/istresearch/scrapy-cluster/...
6,0.705304,https://github.com/calston/rhumba/blob/05e3cbf...
7,0.703363,https://github.com/limpyd/redis-limpyd-jobs/bl...
8,0.700093,https://github.com/calston/rhumba/blob/05e3cbf...
9,0.697995,https://github.com/axialmarket/fsq/blob/43b84c...


### We have now finished the construction of natural language embedding and code embedding individually. Switch to combination.

# Combining the Two

In [25]:
# the two functions will each return 3*k of result, if there are enough of them, 
# with weighted score of each. In this step we combine them through weighted score 
# and return the final k matches.
def nlangCode(query, code_vecs, weight, k):
    w1 = weight
    w2 = 1-w1
    df1 = weighted_doc(query, k, w1)
    df2 = weighted_code(query, code_vecs, k, w2)
    
    # join df1 and df2.
    combined = df1.merge(df2, how='outer', left_on='url', right_on='url').fillna(0)
    combined['avg_score'] = combined['score_x'] + combined['score_y']
    combined.sort_values(by='avg_score', inplace=True, ascending = False)
    
    combined['query']=[query for _ in range(len(combined))]
    combined['language'] = ['Python' for _ in range(len(combined))]
    return combined[:k][['language', 'query', 'url']]
    


In [27]:
nlangCode(query, py_emb_mat, 0.0, 10)

Unnamed: 0,language,query,url
10,Python,priority queue,https://github.com/hubo1016/vlcp/blob/23905522...
11,Python,priority queue,https://github.com/fchauvel/MAD/blob/806d51748...
12,Python,priority queue,https://github.com/rackerlabs/rackspace-python...
13,Python,priority queue,https://github.com/rfk/threading2/blob/7ec234d...
14,Python,priority queue,https://github.com/calston/rhumba/blob/05e3cbf...
15,Python,priority queue,https://github.com/istresearch/scrapy-cluster/...
16,Python,priority queue,https://github.com/calston/rhumba/blob/05e3cbf...
9,Python,priority queue,https://github.com/limpyd/redis-limpyd-jobs/bl...
17,Python,priority queue,https://github.com/calston/rhumba/blob/05e3cbf...
18,Python,priority queue,https://github.com/axialmarket/fsq/blob/43b84c...


# Evaluation - Using relevanceeval.py in CodeSearchNet 
Source Code:
- https://github.com/github/CodeSearchNet/blob/master/src/relevanceeval.py

In [28]:
annotation_py = merged_py[['Language', 'Query', 'GitHubUrl', 'Relevance', 'Notes']]
annotation_py.to_csv('annotation_py.csv')

In [29]:
# create the prediction results csv
queries = pd.read_csv('queries.csv')

In [56]:
## grid search for best weight dstribution
def best_weights():
    best_param = -1
    best_score = -1
    
    thresholds = np.linspace(0.015,0.025,10)
    for thres in thresholds:
        out_dfs = []
        for query in queries['query'].values:
            out_df = nlangCode(query, py_emb_mat, thres, 10) #py_codebert(query, code_vecs, 10)
            out_dfs.append(out_df)
        trainset_result = pd.concat(out_dfs, ignore_index=True)
        trainset_result.to_csv('train_py_result.csv')
    
        result = !python relevanceeval.py annotationStore.csv train_py_result.csv
        ndcg = float(result[5].split()[1])

        if ndcg > best_score:
            best_score = ndcg
            best_param = thres
    return best_param, best_score

In [59]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [60]:
best_weight, best_ndcg = best_weights()
best_weight, best_ndcg

(0.017222222222222222, 0.314)

In [42]:
trainset_result

Unnamed: 0,language,query,url
0,Python,convert int to string,https://github.com/tensorflow/datasets/blob/46...
1,Python,convert int to string,https://github.com/django-treebeard/django-tre...
2,Python,convert int to string,https://github.com/romankoblov/leaf/blob/e042d...
3,Python,convert int to string,https://github.com/django-treebeard/django-tre...
4,Python,convert int to string,https://github.com/tensorflow/datasets/blob/46...
...,...,...,...
985,Python,how to read .csv file in an efficient way?,https://github.com/Erotemic/utool/blob/3b27e1f...
986,Python,how to read .csv file in an efficient way?,https://github.com/gem/oq-engine/blob/8294553a...
987,Python,how to read .csv file in an efficient way?,https://github.com/lappis-unb/salic-ml/blob/1b...
988,Python,how to read .csv file in an efficient way?,https://github.com/dshean/pygeotools/blob/5ac7...


### Result for doc embedding only

In [61]:
out_dfs = []
for query in queries['query'].values:
    out_df = nlangCode(query, py_emb_mat, 1, 10) 
    out_dfs.append(out_df)
trainset_result = pd.concat(out_dfs, ignore_index=True)
trainset_result.to_csv('train_py_result.csv')

!python relevanceeval.py annotationStore.csv train_py_result.csv

% of URLs in predictions that exist in the annotation dataset:
	python: 7.18%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
	python: 7.85%
NDCG:
	python: 0.171
NDCG (full ranking):
	python: 0.114


### Result for code embedding only

In [62]:
out_dfs = []
for query in queries['query'].values:
    out_df = nlangCode(query, py_emb_mat, 0, 10) 
    out_dfs.append(out_df)
trainset_result = pd.concat(out_dfs, ignore_index=True)
trainset_result.to_csv('train_py_result.csv')

!python relevanceeval.py annotationStore.csv train_py_result.csv

% of URLs in predictions that exist in the annotation dataset:
	python: 15.87%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
	python: 17.18%
NDCG:
	python: 0.307
NDCG (full ranking):
	python: 0.218


- A combination of the two embeddings with 0.017222 on doc-embedding and 0.82778 on code-embedding results in an nDCG score of 0.314, higher than that of two individual embeddings which are 0.171 abd 0.307 respectively.  