In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

# Loading the dataset

In [None]:
# Python configuration of CodeSearchNet; only its 'train' split is used below.
dataset_py = load_dataset("code_search_net", "python")
train_py = pd.DataFrame(dataset_py['train'])

# Annotation file; this notebook reads its Language, Query, GitHubUrl and
# Relevance columns. Keep only the rows labelled as Python.
ground_truth = pd.read_csv('annotationStore.csv')
gt_py = ground_truth.loc[ground_truth['Language'].eq('Python')]

# Inner join: keep annotations whose GitHub URL matches a training function.
merged_py = pd.merge(
    gt_py,
    train_py,
    left_on='GitHubUrl',
    right_on='func_code_url',
)
merged_py.shape

## Identifier + Doc embedding

In [None]:
# Sentence encoder shared by the corpus texts and the natural-language query.
model_doc = SentenceTransformer('all-MiniLM-L6-v2')

# One text per training function: its name followed by its documentation string.
sample_doc = train_py['func_name'] + " " + train_py['func_documentation_string']

# Embed the whole corpus once. The search cell below reads `doc_emb` and maps
# each hit's corpus_id back to a `train_py` row, so the embeddings must be
# computed in `train_py` row order.
# NOTE: this encodes every training function and can take a long time.
doc_emb = model_doc.encode(
    sample_doc.tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
)

In [None]:
# NOTE(review): `query` is not assigned in any cell shown in this notebook;
# it has to be defined before this cell is run.

# Embed the query with the same model as the corpus and keep the first ten hits.
que_doc_emb = model_doc.encode(query, convert_to_tensor=True)
hits = util.semantic_search(que_doc_emb, doc_emb)[0]
top_hits = hits[:10]

# Score, URL and 1-based rank per hit, in the order semantic_search returned them.
que_doc_score = [hit['score'] for hit in top_hits]
que_doc_url = [train_py.iloc[hit['corpus_id']]['func_code_url'] for hit in top_hits]
que_doc_rank = list(range(1, len(top_hits) + 1))

for score, rank, url in zip(que_doc_score, que_doc_rank, que_doc_url):
    print("Cossim: {:.2f}".format(score))
    print(f"Rank: {rank}")
    print(url)
    print("\n\n")

# Build the result table in one go (columns: score, rank, url).
que_doc_df = pd.DataFrame({
    'score': que_doc_score,
    'rank': que_doc_rank,
    'url': que_doc_url,
})
que_doc_df

## Code Embedding

In [None]:
# Sentence encoder applied to the raw function source.
model_code = SentenceTransformer("all-MiniLM-L6-v2")

# Full source text of every training function.
sample_code = train_py['whole_func_string']

# Embed the whole corpus once. The search cell below reads `code_emb` and
# treats each hit's corpus_id as a row position, so the embeddings must be
# computed in `train_py` row order.
# NOTE: this encodes every training function and can take a long time.
code_emb = model_code.encode(
    sample_code.tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
)

In [None]:
# Embed the query and retrieve the closest code embeddings.
# Assumes `code_emb` rows are aligned with `train_py` rows — confirm where
# `code_emb` is computed.
que_code_emb = model_code.encode(query, convert_to_tensor=True)
hits = util.semantic_search(que_code_emb, code_emb)[0]
top_hits = hits[:3]

for top_hit in top_hits:
    print("Cossim: {:.2f}".format(top_hit['score']))
    # The corpus is built from `train_py`, so corpus_id is a `train_py` row
    # position (the previous lookup used an undefined `merged` frame).
    print(train_py.iloc[top_hit['corpus_id']]['func_code_url'])
    print("\n\n")

## Tryouts - Pretrained CodeBERT (currently Python only)


In [None]:
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn

In [None]:
# Setting up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model.
# The CodeBERT encoder is bound to its own name so that it does not overwrite
# the SentenceTransformer held in `model_code` by the "Code Embedding" section;
# otherwise re-running that section after this cell would fail.
tokenizer_code = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = RobertaModel.from_pretrained("demo/python_model").to(device)


def codebert_embed(text):
    """Tokenize one string (truncated to 512 tokens) and return CodeBERT's pooled output."""
    encoded = tokenizer_code(text, return_tensors='pt', truncation=True, max_length=512).to(device)
    with torch.no_grad():
        return codebert_model(**encoded)[1]  # Get the pooled output


# Load the data
# train_data = pd.read_csv('merged_py.csv')  # only on the ones with query and code pairs
train_data = train_py
sample_code = list(train_data['whole_func_string'])

# Encode the query
query_vec = codebert_embed(query)

# Encode the code snippets one at a time, then stack them into a single tensor
# on the same device as the query vector.
codes = list(sample_code)
code_vecs = torch.cat([codebert_embed(code) for code in sample_code]).to(device)

# Dot product of the query vector with every code vector, turned into a
# distribution over the corpus with a softmax. The vectors are not
# L2-normalised, so these are not cosine similarities.
scores = torch.einsum("ab,cb->ac", query_vec, code_vecs)
scores = torch.softmax(scores, -1)

# Get the top 5 scores and their indices (fewer if the corpus is smaller than 5)
top_k = min(5, scores.shape[-1])
top_scores, top_indices = torch.topk(scores[0], top_k, largest=True)

# Retrieve the most relevant code snippets using the indices
top_code_snippets = [sample_code[index] for index in top_indices.cpu().numpy()]

# Print the results
for score, snippet in zip(top_scores, top_code_snippets):
    print(f"Relevance Score: {score.item()}\nCode Snippet: {snippet}\n")


# Evaluation - nDCG

In [None]:
# Annotated rows for the current query ...
target = merged_py.loc[merged_py['Query'].eq(query)]

# ... inner-joined by URL to the retrieved hits, so each surviving row carries
# both its annotation columns and the hit's score and rank.
final = pd.merge(
    target,
    que_doc_df,
    left_on='GitHubUrl',
    right_on='url',
)

In [3]:
# MRR = 1/final['rank']

In [None]:
#check nDCG calculation
# dataframe: final
def dcg(data, length, rank='rank', relevance='Relevance'):
    """Discounted cumulative gain of the retrieved results in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per retrieved result that has a relevance judgement.
    length : int
        Cut-off depth; only ranks 1..length contribute.
    rank : str, optional
        Name of the column holding the 1-based rank of each result.
    relevance : str, optional
        Name of the column holding the relevance gain of each result.

    Returns
    -------
    float
        Sum over positions p = 1..length of gain_p / log2(p + 1). A position
        with no row in ``data`` contributes 0. If several rows share a rank,
        their relevance values are averaged so the gain stays a scalar.
    """
    total = 0.0
    for position in range(1, length + 1):
        gains = data.loc[data[rank] == position, relevance]
        if gains.empty:
            # Nothing judged at this position: gain 0.
            continue
        total += float(gains.mean()) / np.log2(position + 1)
    return total


# dataframe: merged, expected result, check how many expected results are in the train
# and should be returned
def idcg(data, query, length):
    """Ideal discounted cumulative gain for ``query`` at depth ``length``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Query' column and a 'Relevance' column.
    query : str
        Query whose rows in ``data`` form the set of expected results.
    length : int
        Cut-off depth; at most this many expected results are scored.

    Returns
    -------
    float
        Sum of gain_p / log2(p + 1) over the expected results sorted by
        descending relevance, truncated to ``length``. Returns 0.0 when the
        query has no rows in ``data`` or ``length`` is not positive.
    """
    # NOTE(review): every matching row counts as a separate expected result;
    # if the data can hold several rows for the same URL, deduplicate before
    # calling — confirm against the annotation file.
    expected = data[data['Query'] == query].sort_values(by='Relevance', ascending=False)

    # Best possible ordering, cut at the requested depth (never negative).
    ideal_gains = expected['Relevance'].to_numpy(dtype=float)[:max(length, 0)]

    # Position p (1-based) is discounted by log2(p + 1).
    discounts = np.log2(np.arange(2, len(ideal_gains) + 2))
    return float(np.sum(ideal_gains / discounts))

# nDCG at depth 3 for the current query.
# The column names are passed by keyword so the call matches dcg's four-parameter
# signature; idcg needs the query as its second argument.
dcg_score = dcg(final, 3, rank='rank', relevance='Relevance')
idcg_score = idcg(merged_py, query, 3)

# With no relevant expected result the ideal DCG is 0; report 0 instead of dividing by zero.
ndcg_score = dcg_score / idcg_score if idcg_score else 0.0
ndcg_score