#### Note: This notebook focuses on the documentation feature and its evaluation using NDCG

In [73]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util
import torch
import snakecase

In [2]:
# Pick the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


### Load in data

In [7]:
# Python function corpus: train and test splits
train_py = pd.read_csv('python_train_dataset.csv')
test_py = pd.read_csv('python_test_dataset.csv')

# Ground-truth relevance annotations, restricted to the Python rows
labeled_dataset = pd.read_csv('annotationStore.csv')
labeled_py = labeled_dataset.loc[labeled_dataset['Language'].eq('Python')]

# Evaluation queries (99 of them)
queries = pd.read_csv('queries.csv')

# Inner join on the GitHub URL: keeps only annotated Python functions that
# also exist in train_py
merged_py = pd.merge(labeled_py, train_py,
                     left_on='GitHubUrl', right_on='func_code_url')

# Text per function: raw (not yet preprocessed) func name, a space, then the
# documentation string. The matching embeddings are loaded from disk rather
# than recomputed here.
doc = train_py['func_name'] + ' ' + train_py['func_documentation_string']
doc_emb = np.load('func_name_docu_doc_emb.npy')

### Evaluation (NDCG)

In [5]:
# Sentence-embedding model; return_relevant_docs calls model.encode() on each query.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [10]:
def return_relevant_docs(query, doc, doc_emb, k=10):
    """Return the ``k`` corpus functions most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and compared
    against ``doc_emb`` with ``util.semantic_search``. Each hit's
    ``corpus_id`` is used as a positional row index into the notebook-level
    ``train_py`` frame to look up the function's GitHub URL.

    Args:
        query: Natural-language search query.
        doc: Unused; kept so existing callers keep working.
        doc_emb: Corpus embeddings. Assumed to be row-aligned with
            ``train_py`` (row i embeds ``train_py.iloc[i]``) -- TODO confirm
            for each saved ``.npy`` file.
        k: Number of results to return.

    Returns:
        pandas.DataFrame with columns ``score`` (similarity score), ``rank``
        (1-based position in our ranking) and ``url`` (``func_code_url`` of
        the hit), in the order returned by the search. It has fewer than
        ``k`` rows when the search yields fewer hits.
    """
    que_doc_emb = model.encode(query, convert_to_tensor=True)

    # semantic_search defaults to top_k=10, so k has to be forwarded;
    # without it any k > 10 was silently capped at 10 hits.
    hits = util.semantic_search(que_doc_emb, doc_emb, top_k=k)[0][:k]

    corpus_ids = [hit['corpus_id'] for hit in hits]
    return pd.DataFrame({
        'score': [hit['score'] for hit in hits],  # similarity score
        'rank': list(range(1, len(hits) + 1)),  # our search engine rank
        # github url of our returned document (one positional lookup for all hits)
        'url': train_py['func_code_url'].iloc[corpus_ids].tolist(),
    })

In [11]:
def to_evaluate(doc, doc_emb, k=10):
    """Run every evaluation query and collect the retrieved URLs.

    Iterates over the ``query`` column of the notebook-level ``queries``
    frame, calls ``return_relevant_docs`` for each one and stacks the
    returned URLs into a single long-format frame.

    Args:
        doc: Passed through to ``return_relevant_docs``.
        doc_emb: Corpus embeddings passed through to ``return_relevant_docs``.
        k: Number of results requested per query.

    Returns:
        pandas.DataFrame with columns ``language`` (always ``'Python'``),
        ``query`` and ``url``: one row per (query, returned URL) pair, in
        query order and then rank order.
    """
    df_queries, df_urls = [], []
    for query in queries['query'].values:
        urls = list(return_relevant_docs(query, doc, doc_emb, k=k)['url'].values)

        # Repeat the query once per URL actually returned, not k times:
        # a search that yields fewer than k hits would otherwise make the
        # columns unequal in length and the DataFrame constructor raise.
        df_queries += [query] * len(urls)
        df_urls += urls

    return pd.DataFrame({'language': ['Python'] * len(df_queries),
                         'query': df_queries,
                         'url': df_urls})

In [13]:
# Baseline run: retrieved URLs for every query (default k=10), using the
# raw func_name + docstring embeddings.
results = to_evaluate(doc, doc_emb)

In [15]:
# save dataframe as csv file to run ndcg from codesearchnet
# results.to_csv('eval_results.csv')

In [19]:
# Ground-truth subset (annotated Python functions that also appear in
# train_py), reduced to the five annotation columns.
# NOTE(review): presumably the relevance file fed to the same external NDCG
# script as eval_results.csv -- confirm.
annotation_store_py = merged_py[['Language', 'Query', 'GitHubUrl', 'Relevance', 'Notes']]
# annotation_store_py.to_csv('annotation_store_py.csv')

#### Final evaluation (raw function name + documentation string as the only feature)
NDCG Within: 0.259<br>
NDCG All: 0.171

### Aside: preprocess func_name

#### EDA

In [41]:
# Split each (possibly qualified) func_name on '.' to see how many
# components the names have.
test = train_py['func_name'].str.split('.')

In [53]:
# Preview the split names.
test

0         [ImageGraphCut, __msgc_step3_discontinuity_loc...
1                [ImageGraphCut, __multiscale_gc_lo2hi_run]
2                [ImageGraphCut, __multiscale_gc_hi2lo_run]
3              [ImageGraphCut, __ordered_values_by_indexes]
4               [ImageGraphCut, __hi2lo_multiscale_indexes]
                                ...                        
412173                                   [Class, instances]
412174                                  [Class, subclasses]
412175                                [Class, superclasses]
412176                            [Class, message_handlers]
412177                                    [Class, undefine]
Name: func_name, Length: 412178, dtype: object

In [47]:
# Names with a single component, i.e. no '.' in func_name.
test[test.str.len() == 1]

16                                       [resize_to_shape]
17                                             [seed_zoom]
18                                         [zoom_to_shape]
19                                                  [crop]
20                                         [combinecrinfo]
                                ...                       
412123                             [matrixToMathTransform]
412124                             [mathTransformToMatrix]
412125               [_linearInterpolationTransformMatrix]
412126        [_polarDecomposeInterpolationTransformation]
412127    [_mathPolarDecomposeInterpolationTransformation]
Name: func_name, Length: 156852, dtype: object

In [48]:
# Names with exactly two components (one '.' in func_name).
test[test.str.len() == 2]

0         [ImageGraphCut, __msgc_step3_discontinuity_loc...
1                [ImageGraphCut, __multiscale_gc_lo2hi_run]
2                [ImageGraphCut, __multiscale_gc_hi2lo_run]
3              [ImageGraphCut, __ordered_values_by_indexes]
4               [ImageGraphCut, __hi2lo_multiscale_indexes]
                                ...                        
412173                                   [Class, instances]
412174                                  [Class, subclasses]
412175                                [Class, superclasses]
412176                            [Class, message_handlers]
412177                                    [Class, undefine]
Name: func_name, Length: 255324, dtype: object

In [57]:
# Everything else: rows whose component count is neither 1 nor 2
# (NaN lengths also land here, since NaN != 1 and NaN != 2).
test[(test.str.len() != 1) & (test.str.len() != 2)]

332897    NaN
396027    NaN
Name: func_name, dtype: object

In [59]:
# Inspect the raw func_name of the first leftover row.
train_py.loc[332897, 'func_name']

nan

In [90]:
# Inspect the raw func_name of the second leftover row.
train_py.loc[396027, 'func_name']

nan

#### Preprocess

In [61]:
# Same '.'-split as in the EDA, stored under a descriptive name as the
# input to the preprocessing steps.
split_func_name = train_py['func_name'].str.split('.')
split_func_name.head()

0    [ImageGraphCut, __msgc_step3_discontinuity_loc...
1           [ImageGraphCut, __multiscale_gc_lo2hi_run]
2           [ImageGraphCut, __multiscale_gc_hi2lo_run]
3         [ImageGraphCut, __ordered_values_by_indexes]
4          [ImageGraphCut, __hi2lo_multiscale_indexes]
Name: func_name, dtype: object

In [99]:
def camel_case_to_underscores(func_name):
    """Convert the function part of a split name with ``snakecase.convert``.

    Args:
        func_name: The '.'-split components of a function name, e.g.
            ``['Class', 'method']`` or ``['function']``. A raw dotted string
            is also accepted and split here. Anything else (such as the NaN
            of a missing name) is treated as "no name".

    Returns:
        The converted last component, or ``''`` when there is no usable
        name or the conversion fails.
    """
    if isinstance(func_name, str):
        func_name = func_name.split('.')

    # Missing names arrive as NaN (a float), not a list; handle that
    # explicitly instead of relying on len() raising inside a bare except.
    if not isinstance(func_name, (list, tuple)) or len(func_name) == 0:
        return ''

    try:
        # The last component is the function itself for any nesting depth.
        # This matches the old [0]/[1] indexing for one- and two-part names,
        # and deeper names no longer fall through and return None.
        return snakecase.convert(func_name[-1])
    except Exception:
        # Best effort, as before: a name the converter rejects becomes ''.
        # Unlike the old bare except, KeyboardInterrupt/SystemExit propagate.
        return ''

In [109]:
# Step 1: keep only the function part of each split name and run it
# through snakecase (missing names become '').
preprocess_camel_case = split_func_name.apply(camel_case_to_underscores)
# Step 2: turn underscores into spaces, then trim the blanks left behind by
# leading/trailing underscores (e.g. "__name").
preprocess_underscores = preprocess_camel_case.str.replace('_', ' ').str.strip()

In [110]:
# Spot-check the cleaned function names.
preprocess_underscores.head()

0    msgc step3 discontinuity localization
1                  multiscale gc lo2hi run
2                  multiscale gc hi2lo run
3                ordered values by indexes
4                 hi2lo multiscale indexes
Name: func_name, dtype: object

In [116]:
# Text per function: preprocessed func name, a space, then the documentation
# string. String concatenation propagates NaN, so a missing docstring gives
# a NaN entry here.
preprocess_doc = preprocess_underscores + ' ' + train_py['func_documentation_string']

In [118]:
# preprocess_doc_emb = model.encode(preprocess_doc, convert_to_tensor=True)

In [119]:
# pickle preprocess_doc_emb
# with open('preprocess_func_name.npy', 'wb') as f:
#     np.save(f, preprocess_doc_emb.to('cpu').numpy())

In [120]:
# Load the cached embeddings of preprocess_doc; the commented-out cells above
# show how the file was produced (model.encode followed by np.save).
preprocess_doc_emb = np.load('preprocess_func_name.npy')

In [121]:
# Sanity-check the loaded embedding matrix.
preprocess_doc_emb

array([[ 0.0166563 , -0.03803704,  0.05928445, ..., -0.10417175,
        -0.08774463, -0.02464951],
       [-0.00755583,  0.01133708,  0.04043078, ..., -0.04562359,
        -0.10486861, -0.04952933],
       [ 0.00418711,  0.02655804,  0.02180671, ..., -0.07021855,
        -0.0976972 , -0.0497792 ],
       ...,
       [-0.11239953,  0.02397031,  0.01646855, ...,  0.06840994,
         0.02486667,  0.03715625],
       [ 0.03864758,  0.01578593,  0.03447834, ...,  0.0431488 ,
         0.01029573, -0.00725849],
       [-0.07235931,  0.05771292, -0.02917554, ..., -0.00428305,
         0.04437316,  0.0254098 ]], dtype=float32)

In [122]:
# Same evaluation as the baseline, now against the preprocessed-name embeddings.
preprocess_results = to_evaluate(preprocess_doc, preprocess_doc_emb)

In [124]:
# Spot-check the retrieved URLs for the first query.
preprocess_results.head()

Unnamed: 0,language,query,url
0,Python,convert int to string,https://github.com/Gandi/gandi.cli/blob/6ee5b8...
1,Python,convert int to string,https://github.com/rootpy/rootpy/blob/3926935e...
2,Python,convert int to string,https://github.com/Yelp/kafka-utils/blob/cdb4d...
3,Python,convert int to string,https://github.com/gabrielfalcao/dominic/blob/...
4,Python,convert int to string,https://github.com/mdsol/rwslib/blob/1a86bc072...


In [125]:
# preprocess_results.to_csv('preprocess_eval_results.csv')

#### Final evaluation (preprocessed function name + documentation string as the only feature)
NDCG Within: 0.225<br>
NDCG All: 0.170<br>
* Both scores are lower than without func_name preprocessing (NDCG Within 0.259, NDCG All 0.171), so the preprocessing did not help.