# Evaluation 4

1. most popular 1000 Python Github repo
2. filter top N frequently asked questions on StackOverflow(17000->2000) (question viewed+votes)
3. Verify if the StackOverflow code snippet exist in 1000 repo
    - ElasticSearch
    - manually choose 100 questions from ElasticSearch result
4. use StackOverflow questions as input of the model, and manually evalute if the top 10 results has correct ansers

# Automated Evaluation 6

replaced the 4th step of the earlier evaluation methods with:<br>
- first taking the top 10 results retrieved by NCS, and for each retrieved method, getting a similarity score between the ground–truth code snippet and the method. 
- choose a threshold that minimize false positive
=>


In [1]:
from gensim.models import KeyedVectors
from time import time
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# change the path to target files
st=time()
path_wordembedding="data/embeddings.txt"
path_docembedding="data/document_embeddings.csv"
path_stackoverflow="data/stack_overflow/StackOverFlow.csv"

# change hyperparameters
vocab_size=500
window_size=5

#StackOverflow start id
start_idx=0
end_idx=10 #will actually run to end_idx-1

In [3]:
# load StackOverflow data
st=time()
df_stack_overflow=pd.read_csv(path_stackoverflow)
print("Dimension of StackOverflow data: {}".format(df_stack_overflow.shape))
print("Run time: {} s".format(time()-st))

Dimension of StackOverflow data: (1305, 6)
Run time: 0.07717013359069824 s


In [4]:
# load wordembedding: representation of words
st=time()
trained_ft_vectors = KeyedVectors.load_word2vec_format(path_wordembedding)
print("Run time: {} s".format(time()-st))

Run time: 1.0715551376342773 s


In [5]:
# load document embedding: representation of each source code function
st=time()
document_embeddings=np.loadtxt(fname=path_docembedding, delimiter=",")
print("Dimension of the document embedding: {}".format(document_embeddings.shape))
print("Run time: {} s".format(time()-st))

Dimension of the document embedding: (1038, 500)
Run time: 0.48323917388916016 s


In [6]:
# normalize a word represenatation vector that its L2 norm is 1.
# we do this so that the cosine similarity reduces to a simple dot product

def normalize(word_representations):
    for word in word_representations:
        total=0
        for key in word_representations[word]:
            total+=word_representations[word][key]*word_representations[word][key]
            
        total=math.sqrt(total)
        for key in word_representations[word]:
            word_representations[word][key]/=total

def dictionary_dot_product(dict1, dict2):
    dot=0
    for key in dict1:
        if key in dict2:
            dot+=dict1[key]*dict2[key]
    return dot

def find_sim(word_representations, query):
    if query not in word_representations:
        print("'%s' is not in vocabulary" % query)
        return None
    
    scores={}
    for word in word_representations:
        cosine=dictionary_dot_product(word_representations[query], word_representations[word])
        scores[word]=cosine
    return scores

# Find the K words with highest cosine similarity to a query in a set of word_representations
def find_nearest_neighbors(word_representations, query, K):
    scores=find_sim(word_representations, query)
    if scores != None:
        sorted_x = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        for idx, (k, v) in enumerate(sorted_x[:K]):
            print("%s\t%s\t%.5f" % (idx,k,v))

In [7]:
def get_most_relevant_document(question, word_embedding, doc_embedding, num=10):
    """Return the functions that are most relevant to the natual language question.

    Args:
        question: A string. A Question from StackOverflow. 
        word_embedding: Word embedding generated from codebase.
        doc_embedding: Document embedding generated from codebase
        num: The number of top similar functions to return.

    Returns:
        A list of indices of the top NUM related functions to the QUESTION in the WORD_EMBEDDING.
    
    """
    # convert QUESTION to a vector
    tokenized_ques=question.split()
    vec_ques=np.zeros((1,document_embeddings.shape[1])) #vocab_size
    token_count=0
    has_token_in_embedding=False
    for token in tokenized_ques:
        if token in word_embedding:
            has_token_in_embedding=True
            vec_ques+=word_embedding[token]
            token_count+=1
    
    if has_token_in_embedding:
        mean_vec_ques=vec_ques/token_count
    
    
        # compute similarity between this question and each of the source code snippets
        cosine_sim=[]
        for idx, doc in enumerate(document_embeddings):
            #[TODO] fix dimension

            try:
                cosine_sim.append(cosine_similarity(mean_vec_ques, doc.reshape(1, -1))[0][0])
            except ValueError:
                print(question)
                print(vec_ques, token_count)
                print(mean_vec_ques)
                print(doc.reshape(1, -1))
        # get top `num` similar functions
        result_func_id=np.array(cosine_sim).argsort()[-num:][::-1]
        result_similarity=np.sort(np.array(cosine_sim))[-num:][::-1]
    else:
        result_func_id=np.nan
        result_similarity=np.nan
    return result_func_id, result_similarity

In [8]:
# limit number of questions
df_stack_overflow_partial=df_stack_overflow.iloc[start_idx:end_idx,:]

In [9]:
st=time()
list_most_relevant_doc=[]
list_most_relevant_sim=[]
for idx in range(len(df_stack_overflow_partial)): 
    question=df_stack_overflow_partial.iloc[idx]["Question_Title"]
    
    most_relevant_doc, most_relevant_sim=get_most_relevant_document(question, trained_ft_vectors, document_embeddings)
    list_most_relevant_doc.append(most_relevant_doc)
    list_most_relevant_sim.append(most_relevant_sim)
df_stack_overflow_partial["func_id"]=list_most_relevant_doc
df_stack_overflow_partial["sim"]=list_most_relevant_sim
print("Run time: {} s".format(time()-st)) 

Run time: 1.6584300994873047 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [10]:
# save result
df_stack_overflow_partial.to_pickle("data/SO_similarity_{}_{}.pkl".format(start_idx, end_idx))

In [11]:
df_stack_overflow_partial

Unnamed: 0,Post_Link_ID,Question_Score,Question_Title,Question_Content,Answer,Tags,func_id,sim
0,50607128,47,Creating a nested dictionary from a flattened ...,<p>I have a flattened dictionary which I want ...,def nest_dict(flat):\n result = {}\n for...,<python><dictionary><recursion><nested><netcdf>,"[377, 421, 448, 371, 456, 100, 920, 150, 346, ...","[0.9999984771395396, 0.9999984686083946, 0.999..."
1,45631715,36,Downloading with chrome headless and selenium,<p>I'm using python-selenium and Chrome 59 and...,"def enable_download_in_headless_chrome(self, d...",<python><google-chrome><selenium><google-chrom...,"[116, 416, 92, 1004, 258, 117, 102, 409, 430, ...","[0.9999992941886049, 0.9999992261907498, 0.999..."
2,43957860,17,Python unittest - Ran 0 tests in 0.000s,"<p>So I want to do this code <a href=""http://o...",def test_add_returns_zero_for_emptyString(self...,<python><unit-testing><python-unittest><python...,"[2, 932, 929, 107, 313, 258, 255, 477, 458, 290]","[0.9999963140780601, 0.9999960986804608, 0.999..."
3,44424040,16,Django logging custom attributes in formatter,<p>How can Django use logging to log using cus...,def add_my_custom_attribute(record):\n reco...,<python><django><logging>,"[2, 258, 530, 289, 108, 117, 932, 290, 288, 489]","[0.9999994069389221, 0.9999993391644111, 0.999..."
4,48019843,14,PCA on word2vec embeddings,<p>I am trying to reproduce the results of thi...,"def doPCA(pairs, embedding, num_components = 1...",<python><scikit-learn><nlp><pca><word2vec>,"[349, 348, 497, 494, 498, 516, 527, 518, 519, ...","[0.9999985864775749, 0.9999984819488087, 0.999..."
5,42381902,13,Interpreting negative Word2Vec similarity from...,<p>E.g. we train a word2vec model using <code>...,"def similarity(self, w1, w2):\n """"""\n Co...",<python><nlp><similarity><gensim><word2vec>,"[1011, 989, 990, 992, 993, 994, 991, 1003, 71,...","[0.9999976629195375, 0.999997407998317, 0.9999..."
6,43855162,13,RMSE/ RMSLE loss function in Keras,<p>I try to participate in my first Kaggle com...,"def root_mean_squared_error(y_true, y_pred):\n...",<python><keras><custom-function><loss-function>,"[2, 258, 290, 932, 289, 117, 288, 175, 150, 108]","[0.999999246929511, 0.9999992116031092, 0.9999..."
7,50253517,11,How to group functions without side effects?,<p>I have a function with several helper funct...,"def create_filled_template_in_temp(path, value...",<python>,"[433, 230, 973, 490, 425, 523, 515, 928, 477, ...","[0.9999991215290384, 0.999999073228489, 0.9999..."
8,43791970,11,Pandas: assigning columns with multiple condit...,<p>Edited:</p>\n\n<p>I have a financial portfo...,def closure():\n cur_weight = {}\n def f...,<python><pandas><dataframe><finance><portfolio>,"[116, 117, 108, 256, 543, 1004, 934, 427, 92, ...","[0.9999995954184046, 0.9999995363726762, 0.999..."
9,44293479,11,"Python, I'm repeating myself a lot when it com...",<p>Lets say I have three lists and I need to i...,def get_average(streaks):\n streak_0_num0s ...,<python><for-loop><dry>,"[419, 102, 101, 100, 104, 425, 363, 523, 372, ...","[0.9999995399367301, 0.9999995397426804, 0.999..."


# Check result

In [12]:
df_stack_overflow_partial=pd.read_pickle("data/SO_similarity_0_10.pkl")
df_stack_overflow_partial

Unnamed: 0,Post_Link_ID,Question_Score,Question_Title,Question_Content,Answer,Tags,func_id,sim
0,50607128,47,Creating a nested dictionary from a flattened ...,<p>I have a flattened dictionary which I want ...,def nest_dict(flat):\n result = {}\n for...,<python><dictionary><recursion><nested><netcdf>,"[377, 421, 448, 371, 456, 100, 920, 150, 346, ...","[0.9999984771395396, 0.9999984686083946, 0.999..."
1,45631715,36,Downloading with chrome headless and selenium,<p>I'm using python-selenium and Chrome 59 and...,"def enable_download_in_headless_chrome(self, d...",<python><google-chrome><selenium><google-chrom...,"[116, 416, 92, 1004, 258, 117, 102, 409, 430, ...","[0.9999992941886049, 0.9999992261907498, 0.999..."
2,43957860,17,Python unittest - Ran 0 tests in 0.000s,"<p>So I want to do this code <a href=""http://o...",def test_add_returns_zero_for_emptyString(self...,<python><unit-testing><python-unittest><python...,"[2, 932, 929, 107, 313, 258, 255, 477, 458, 290]","[0.9999963140780601, 0.9999960986804608, 0.999..."
3,44424040,16,Django logging custom attributes in formatter,<p>How can Django use logging to log using cus...,def add_my_custom_attribute(record):\n reco...,<python><django><logging>,"[2, 258, 530, 289, 108, 117, 932, 290, 288, 489]","[0.9999994069389221, 0.9999993391644111, 0.999..."
4,48019843,14,PCA on word2vec embeddings,<p>I am trying to reproduce the results of thi...,"def doPCA(pairs, embedding, num_components = 1...",<python><scikit-learn><nlp><pca><word2vec>,"[349, 348, 497, 494, 498, 516, 527, 518, 519, ...","[0.9999985864775749, 0.9999984819488087, 0.999..."
5,42381902,13,Interpreting negative Word2Vec similarity from...,<p>E.g. we train a word2vec model using <code>...,"def similarity(self, w1, w2):\n """"""\n Co...",<python><nlp><similarity><gensim><word2vec>,"[1011, 989, 990, 992, 993, 994, 991, 1003, 71,...","[0.9999976629195375, 0.999997407998317, 0.9999..."
6,43855162,13,RMSE/ RMSLE loss function in Keras,<p>I try to participate in my first Kaggle com...,"def root_mean_squared_error(y_true, y_pred):\n...",<python><keras><custom-function><loss-function>,"[2, 258, 290, 932, 289, 117, 288, 175, 150, 108]","[0.999999246929511, 0.9999992116031092, 0.9999..."
7,50253517,11,How to group functions without side effects?,<p>I have a function with several helper funct...,"def create_filled_template_in_temp(path, value...",<python>,"[433, 230, 973, 490, 425, 523, 515, 928, 477, ...","[0.9999991215290384, 0.999999073228489, 0.9999..."
8,43791970,11,Pandas: assigning columns with multiple condit...,<p>Edited:</p>\n\n<p>I have a financial portfo...,def closure():\n cur_weight = {}\n def f...,<python><pandas><dataframe><finance><portfolio>,"[116, 117, 108, 256, 543, 1004, 934, 427, 92, ...","[0.9999995954184046, 0.9999995363726762, 0.999..."
9,44293479,11,"Python, I'm repeating myself a lot when it com...",<p>Lets say I have three lists and I need to i...,def get_average(streaks):\n streak_0_num0s ...,<python><for-loop><dry>,"[419, 102, 101, 100, 104, 425, 363, 523, 372, ...","[0.9999995399367301, 0.9999995397426804, 0.999..."


In [19]:
df_stack_overflow_partial[df_stack_overflow_partial["Post_Link_ID"]==50607128]["Question_Title"]

0    Creating a nested dictionary from a flattened ...
Name: Question_Title, dtype: object

In [20]:
df_stack_overflow_partial[df_stack_overflow_partial["Post_Link_ID"]==50607128]["func_id"].tolist()

[array([377, 421, 448, 371, 456, 100, 920, 150, 346, 416])]

In [15]:
#df_py100k=pd.read_pickle("data/py100k.pkl")

In [16]:
#df_py100k[700000:700100]

In [17]:
#list_data_id=[]
#for i in [260771, 275794, 428754, 372502, 360950, 284871, 412289, 412286, 11140, 412288]:
#    #list_data_id.append(df_py100k.iloc[i]["data_id"])
#    print(df_py100k.iloc[i])
#    print()
##df_py100k.head()

In [18]:
#list_data_id