# Evaluation 4

1. Collect the 1000 most popular Python GitHub repositories.
2. Filter the top N most frequently asked questions on StackOverflow (17000 -> 2000), by question views + votes.
3. Verify that the StackOverflow code snippet exists in the 1000 repositories:
    - ElasticSearch
    - manually choose 100 questions from ElasticSearch result
4. Use the StackOverflow questions as input to the model, and manually evaluate whether the top 10 results contain correct answers.

# Automated Evaluation 6

Replaced the 4th step of the earlier evaluation method with:<br>
- First take the top 10 results retrieved by NCS and, for each retrieved method, compute a similarity score between the ground-truth code snippet and the method.
- Choose a threshold that minimizes false positives.
=>


In [None]:
import math
import operator
import pickle
from time import time

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Input locations -- edit these to point at the target files.
st = time()
path_wordembedding = "data/embeddings.txt"
path_docembedding = "data/document_embeddings.csv"
path_stackoverflow = "data/stack_overflow_data_all.csv"

# Hyperparameters -- edit as needed.
vocab_size, window_size = 200, 5

# Half-open range [start_idx, end_idx) of StackOverflow rows to process:
# the last row handled is end_idx - 1.
start_idx, end_idx = 0, 10

In [None]:
# Read the StackOverflow CSV into a DataFrame, then report its shape and how
# long the read took.
st = time()
df_stack_overflow = pd.read_csv(path_stackoverflow)
print("Dimension of StackOverflow data: {}".format(df_stack_overflow.shape))
print("Run time: {} s".format(time() - st))

In [None]:
# Load the word embedding (one vector per word) from its word2vec-format
# file and report how long the load took.
st = time()
trained_ft_vectors = KeyedVectors.load_word2vec_format(path_wordembedding)
print("Run time: {} s".format(time() - st))

In [None]:
# Load the document embedding (one vector per source code function) from a
# comma-separated text file, then report its shape and the load time.
st = time()
document_embeddings = np.loadtxt(path_docembedding, delimiter=",")
print("Dimension of the document embedding: {}".format(document_embeddings.shape))
print("Run time: {} s".format(time() - st))

In [None]:
# Normalize each word representation vector so that its L2 norm is 1.
# We do this so that the cosine similarity reduces to a simple dot product.

def normalize(word_representations):
    """Scale every word vector in place so that its L2 norm is 1.

    Args:
        word_representations: Mapping of word -> {key: weight}. The inner
            mappings are modified in place.

    Returns:
        None.

    Vectors whose norm is 0 (empty or all-zero) are left untouched: they have
    no direction to normalize, and dividing by their norm would raise
    ZeroDivisionError.
    """
    for vector in word_representations.values():
        norm = math.sqrt(sum(weight * weight for weight in vector.values()))
        if norm == 0:
            continue
        for key in vector:
            vector[key] /= norm

def dictionary_dot_product(dict1, dict2):
    """Return the dot product of two sparse vectors stored as dicts.

    Only keys present in both DICT1 and DICT2 contribute to the sum; when
    they share no key the result is 0.
    """
    return sum(dict1[key] * dict2[key] for key in dict1 if key in dict2)

def find_sim(word_representations, query):
    """Score every word in WORD_REPRESENTATIONS against QUERY.

    Returns a dict mapping each word (QUERY itself included) to the dot
    product of its vector with the QUERY vector. When QUERY is not a key of
    WORD_REPRESENTATIONS, a notice is printed and None is returned.

    The score is a plain dot product; it equals the cosine similarity only if
    the vectors have unit L2 norm.
    """
    if query in word_representations:
        return {
            word: dictionary_dot_product(word_representations[query], vector)
            for word, vector in word_representations.items()
        }

    print("'%s' is not in vocabulary" % query)
    return None

# Find the K words with highest cosine similarity to a query in a set of word_representations
def find_nearest_neighbors(word_representations, query, K):
    """Print the K words that score highest against QUERY.

    Args:
        word_representations: Mapping of word -> sparse vector (dict).
        query: The word whose neighbors are wanted.
        K: Maximum number of neighbors to print.

    Returns:
        None. One tab-separated line (rank, word, score) is printed per
        neighbor, best score first. When QUERY is unknown, nothing is printed
        here (find_sim already reports it).
    """
    scores = find_sim(word_representations, query)
    if scores is None:
        return

    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    for rank, (word, score) in enumerate(ranked[:K]):
        print("%s\t%s\t%.5f" % (rank, word, score))

In [None]:
def get_most_relevant_document(question, word_embedding, doc_embedding, num=10):
    """Return the functions that are most relevant to a natural language question.

    The question is embedded as the mean of the word vectors of those
    whitespace-separated tokens that are present in WORD_EMBEDDING, and is
    then compared with every row of DOC_EMBEDDING by cosine similarity.

    Args:
        question: A string. A question from StackOverflow. A non-string value
            (e.g. a missing title) is treated as having no tokens.
        word_embedding: Word embedding generated from the codebase. Must
            support ``token in word_embedding`` and ``word_embedding[token]``.
        doc_embedding: Document embedding generated from the codebase, one
            row per function.
        num: The number of top similar functions to return.

    Returns:
        A tuple ``(result_func_id, result_similarity)``. ``result_func_id`` is
        an array of row indices into DOC_EMBEDDING, most similar first, and
        ``result_similarity`` holds the matching cosine similarities. Both
        are ``np.nan`` when no token of QUESTION is in WORD_EMBEDDING.

    Raises:
        ValueError: If the word vectors and DOC_EMBEDDING rows have
            incompatible dimensions.
    """
    # Use the embedding that was passed in, not a module-level global.
    doc_embedding = np.atleast_2d(np.asarray(doc_embedding, dtype=float))

    # Convert QUESTION to a vector: sum the vectors of the known tokens.
    tokenized_ques = question.split() if isinstance(question, str) else []
    vec_ques = np.zeros(doc_embedding.shape[1])
    token_count = 0
    for token in tokenized_ques:
        if token in word_embedding:
            vec_ques += word_embedding[token]
            token_count += 1

    if token_count == 0:
        return np.nan, np.nan

    mean_vec_ques = vec_ques / token_count

    # Cosine similarity against all documents in one vectorized step. Every
    # document keeps its slot, so positions always equal function ids.
    # Zero-norm vectors are divided by 1 instead of 0, giving similarity 0.
    ques_norm = np.linalg.norm(mean_vec_ques)
    doc_norms = np.linalg.norm(doc_embedding, axis=1)
    safe_ques_norm = ques_norm if ques_norm > 0 else 1.0
    safe_doc_norms = np.where(doc_norms > 0, doc_norms, 1.0)
    cosine_sim = doc_embedding.dot(mean_vec_ques) / (safe_doc_norms * safe_ques_norm)

    # Rows with NaN/inf values cannot be scored; rank them last rather than
    # letting NaN sort to the top.
    cosine_sim = np.where(np.isfinite(cosine_sim), cosine_sim, -np.inf)

    # Get the top `num` similar functions, best first.
    result_func_id = cosine_sim.argsort()[::-1][:max(num, 0)]
    result_similarity = cosine_sim[result_func_id]
    return result_func_id, result_similarity

In [None]:
# Limit the number of questions to rows [start_idx, end_idx). Take an explicit
# copy so that adding columns to the result modifies an independent DataFrame
# instead of a slice of df_stack_overflow (avoids SettingWithCopyWarning).
df_stack_overflow_partial = df_stack_overflow.iloc[start_idx:end_idx, :].copy()

In [None]:
# For every selected question title, retrieve the most similar functions and
# their similarity scores, then attach both as new columns.
st = time()
list_most_relevant_doc = []
list_most_relevant_sim = []
for question in df_stack_overflow_partial["Question Title"]:
    most_relevant_doc, most_relevant_sim = get_most_relevant_document(
        question, trained_ft_vectors, document_embeddings
    )
    list_most_relevant_doc.append(most_relevant_doc)
    list_most_relevant_sim.append(most_relevant_sim)

# assign() returns a new DataFrame, so the columns are never written into a
# slice of another frame (no SettingWithCopyWarning).
df_stack_overflow_partial = df_stack_overflow_partial.assign(
    func_id=list_most_relevant_doc,
    sim=list_most_relevant_sim,
)
print("Run time: {} s".format(time() - st))

In [None]:
# save result
# Pickle the DataFrame; the file name encodes start_idx and end_idx.
df_stack_overflow_partial.to_pickle("data/SO_similarity_{}_{}.pkl".format(start_idx, end_idx))

In [None]:
# Display the DataFrame as the cell output.
df_stack_overflow_partial

# Check result

In [None]:
# Reload previously saved results and display them.
# NOTE(review): the path is hard-coded to the 0-10 file rather than built from
# start_idx/end_idx -- confirm this is intended.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_stack_overflow_partial=pd.read_pickle("data/SO_similarity_0_10.pkl")
df_stack_overflow_partial

In [None]:
# Show the "Question Title" of the rows whose "Post Link" equals 48211001.
df_stack_overflow_partial[df_stack_overflow_partial["Post Link"]==48211001]["Question Title"]

In [None]:
# Show, as a list, the "func_id" values of the rows whose "Post Link" equals 48211001.
df_stack_overflow_partial[df_stack_overflow_partial["Post Link"]==48211001]["func_id"].tolist()

In [None]:
# Load the pickled object stored in data/py100k.pkl.
# presumably the table of source code functions that func_id refers to -- verify.
df_py100k=pd.read_pickle("data/py100k.pkl")

In [None]:
# Display the slice 700000:700100 of df_py100k.
df_py100k[700000:700100]

In [None]:
# Print the df_py100k row at each of the hard-coded positions below, with a
# blank line after each. list_data_id is created but stays empty because the
# append is commented out.
list_data_id = []
for row_pos in (260771, 275794, 428754, 372502, 360950, 284871, 412289, 412286, 11140, 412288):
    # list_data_id.append(df_py100k.iloc[row_pos]["data_id"])
    print(df_py100k.iloc[row_pos])
    print()
# df_py100k.head()

In [None]:
# Display list_data_id as the cell output.
list_data_id