## BM25 on Documentation

https://www.analyticsvidhya.com/blog/2021/05/build-your-own-nlp-based-search-engine-using-bm25/

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
import torch
import snakecase
from rank_bm25 import BM25Okapi

In [2]:
# Use the GPU when torch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Import data

In [3]:
# Train and test splits of the Python dataset.
train_py = pd.read_csv('python_train_dataset.csv')
test_py = pd.read_csv('python_test_dataset.csv')

# Ground-truth annotations, restricted to the rows whose Language is 'Python'.
labeled_dataset = pd.read_csv('annotationStore.csv')
labeled_py = labeled_dataset[labeled_dataset['Language'] == 'Python']

# Evaluation queries (99 of them, per the original author's note).
queries = pd.read_csv('queries.csv')

# Inner join (merge's default): keeps only the Python annotations whose
# GitHubUrl matches a func_code_url present in train_py.
merged_py = labeled_py.merge(train_py, left_on='GitHubUrl', right_on='func_code_url')

# Function name + documentation string joined by one space (func name not yet preprocessed).
doc = train_py['func_name'] + ' ' + train_py['func_documentation_string']
# Precomputed array loaded from disk — presumably embeddings of `doc`; TODO confirm.
doc_emb = np.load('func_name_docu_doc_emb.npy')

### Documentation as a feature

In [4]:
# Lower-cased "func_name + documentation string" text, one entry per row of train_py.
# Rows where either column is missing come out as NaN (handled later via fillna('')).
docs = (train_py['func_name'] + ' ' + train_py['func_documentation_string']).str.lower()

In [5]:
docs.head()

0    imagegraphcut.__msgc_step3_discontinuity_local...
1    imagegraphcut.__multiscale_gc_lo2hi_run run gr...
2    imagegraphcut.__multiscale_gc_hi2lo_run run gr...
3    imagegraphcut.__ordered_values_by_indexes retu...
4    imagegraphcut.__hi2lo_multiscale_indexes funct...
dtype: object

#### Remove special characters

In [6]:
def remove_special_chars(doc):
    """Replace every run of non-alphanumeric characters in ``doc`` with one space.

    Parameters
    ----------
    doc : str
        Text to clean. Any non-string value (for example the NaN that pandas
        uses for a missing docstring, or None) is treated as an empty document.

    Returns
    -------
    str
        ``doc`` with each maximal run of characters outside ``[0-9a-zA-Z]``
        collapsed to a single space; leading/trailing runs therefore leave a
        single leading/trailing space. ``''`` for non-string input.
    """
    # Explicit type guard instead of a bare ``except``: missing values still map
    # to '' but unrelated errors (and KeyboardInterrupt) are no longer swallowed.
    if not isinstance(doc, str):
        return ''
    # One pass is enough: substituting each special character with a space and
    # then squeezing whitespace is the same as collapsing each run directly.
    return re.sub(r'[^0-9a-zA-Z]+', ' ', doc)

In [7]:
# preprocessed_docs = docs.apply(remove_special_chars).str.split()

In [8]:
# preprocessed_docs.to_csv('preprocessed_docs.csv')

In [None]:
# Load the cached result of the special-character-removal step instead of recomputing it.
# NOTE(review): pd.read_csv returns a DataFrame (and, if the file was written by the
# commented-out to_csv call, it also carries the saved index as a column), whereas the later
# `preprocessed_docs.apply(lambda x: x.split())` cell needs a Series of strings — confirm the
# file layout and select the text column before tokenizing.
preprocessed_docs = pd.read_csv('preprocessed_docs.csv')

#### Remove stopwords

In [9]:
# English stopwords from NLTK, held in a set for the membership tests in remove_stopwords.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus available locally —
# confirm it has been downloaded before running on a fresh environment.
nltk_stopwords = set(stopwords.words('english'))

In [10]:
def remove_stopwords(doc):
    """Drop English stopwords from a tokenized document.

    ``doc`` is an iterable of string tokens. A token is discarded when its
    lower-cased form is in the notebook-level ``nltk_stopwords`` set; the
    surviving tokens are returned in a new list, in their original order and
    with their original casing.
    """
    kept = []
    for token in doc:
        if token.lower() in nltk_stopwords:
            continue
        kept.append(token)
    return kept

In [11]:
# preprocessed_docs2 = preprocessed_docs.apply(remove_stopwords)

In [12]:
# preprocessed_docs2.to_csv('preprocessed_docs2.csv')

In [None]:
# Load the cached result of the stopword-removal step instead of recomputing it.
# NOTE(review): pd.read_csv returns a DataFrame whose cells are plain strings (token lists
# written with to_csv come back as their text representation, not as lists) — confirm the
# file layout and convert back to token lists before handing this to BM25.
preprocessed_docs2 = pd.read_csv('preprocessed_docs2.csv')

#### Stemming

In [14]:
def stemming(doc):
    """Return the Porter stem of every token in ``doc``, preserving order.

    ``doc`` is an iterable of string tokens; a fresh ``PorterStemmer`` is
    created for each call and the stems are returned as a new list.
    """
    stem = PorterStemmer().stem
    return list(map(stem, doc))

In [15]:
# preprocessed_docs3 = preprocessed_docs2.apply(stemming)

In [16]:
# preprocessed_docs3.to_csv('preprocessed_docs3.csv')

In [None]:
# Load the cached result of the stemming step instead of recomputing it.
# NOTE(review): pd.read_csv returns a DataFrame whose cells are plain strings (token lists
# written with to_csv come back as their text representation, not as lists) — confirm the
# file layout and convert back to token lists before handing this to BM25.
preprocessed_docs3 = pd.read_csv('preprocessed_docs3.csv')

### Run search

In [17]:
def return_relevant_docs(query, preprocessed_docs, tokenized_docs, k=10, bm25=None):
    """Rank the functions in ``train_py`` against ``query`` with BM25 and return the top ``k``.

    Parameters
    ----------
    query : str
        Free-text query. It is only lower-cased and split on whitespace.
        NOTE(review): no special-character removal, stopword removal or stemming
        is applied here, so query tokens may not match a corpus that went
        through those steps — confirm whether the query should get the same
        preprocessing as ``tokenized_docs``.
    preprocessed_docs
        Not used by this function; kept so existing call sites keep working.
    tokenized_docs : sequence of list of str
        One token list per row of ``train_py``, in the same order. Used to build
        the BM25 index when ``bm25`` is not supplied.
    k : int, default 10
        Maximum number of results to return.
    bm25 : BM25Okapi, optional
        An index already built over ``tokenized_docs``. Pass it when issuing
        many queries against the same corpus so the index is not rebuilt on
        every call. When omitted, a new index is built (original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``score`` (BM25 score, descending), ``rank`` (1-based) and
        ``url`` (the matching ``train_py['func_code_url']`` value); at most
        ``k`` rows.

    Raises
    ------
    ValueError
        If the index does not cover exactly one document per row of ``train_py``.
    """
    tokenized_query = query.lower().split()

    if bm25 is None:
        bm25 = BM25Okapi(tokenized_docs)

    urls = train_py['func_code_url'].to_numpy()

    # Score the corpus once and derive both the ordering and the reported
    # scores from that single array, so each score is guaranteed to belong to
    # the URL on the same row.
    scores = np.asarray(bm25.get_scores(tokenized_query))
    if len(scores) != len(urls):
        raise ValueError(
            "The BM25 index covers %d documents but train_py has %d rows"
            % (len(scores), len(urls))
        )

    # Same selection rule rank_bm25's get_top_n applies: argsort ascending,
    # reverse, keep the first k positions.
    top_idx = np.argsort(scores)[::-1][:k]

    return pd.DataFrame({
        'score': scores[top_idx],
        'rank': list(range(1, len(top_idx) + 1)),
        'url': urls[top_idx],
    })

In [18]:
# test_query = 'convert int to string'
# return_relevant_docs(test_query, preprocessed_docs, tokenized_docs)

In [19]:
def model_predictions(preprocessed_docs, tokenized_docs, k=10):
    """Run every query in ``queries['query']`` through BM25 and collect the predictions.

    Parameters
    ----------
    preprocessed_docs
        Forwarded unchanged to ``return_relevant_docs``.
    tokenized_docs
        Tokenized corpus forwarded to ``return_relevant_docs``.
    k : int, default 10
        Maximum number of results requested per query.

    Returns
    -------
    pandas.DataFrame
        One row per (query, returned URL) pair with columns ``language``
        (always ``'Python'``), ``query`` and ``url``, in query order and then
        in the order the results were returned.
    """
    df_queries, df_urls = [], []
    for query in queries['query'].values:
        results = return_relevant_docs(query, preprocessed_docs, tokenized_docs, k=k)
        found_urls = list(results['url'].values)

        # Repeat the query once per row actually returned rather than k times:
        # fewer than k rows come back when the corpus is smaller than k, and
        # unequal column lengths would make the DataFrame construction fail.
        df_queries += [query] * len(found_urls)
        df_urls += found_urls

    return pd.DataFrame({'language': ['Python'] * len(df_queries), 'query': df_queries, 'url': df_urls})

#### No preprocessing

In [7]:
# Baseline corpus: the lower-cased text with missing entries replaced by '' so that
# .split() can be called on every row.
docus = docs.fillna('')
# One whitespace-split token list per document, as a NumPy object array.
tokenized_docus = docus.apply(lambda x: x.split()).to_numpy()

In [8]:
# no_preprocess_results = model_predictions(docus, tokenized_docus)

In [9]:
# no_preprocess_results.to_csv('bm25_no_preprocess_results.csv')

In [13]:
!python relevanceeval.py annotation_store_py.csv bm25_no_preprocess_results.csv

% of URLs in predictions that exist in the annotation dataset:
	python: 12.45%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
	python: 13.44%
NDCG:
	python: 0.182
NDCG (full ranking):
	python: 0.106


#### Preprocessing: removal of special chars only

In [8]:
# Whitespace-tokenize each cleaned document into a list of tokens.
# NOTE(review): this needs preprocessed_docs to be a Series of strings; if it was loaded with
# pd.read_csv it is a DataFrame and .apply passes whole columns to the lambda — confirm.
tokenized_docs = preprocessed_docs.apply(lambda x: x.split()).to_numpy()

In [42]:
# results = model_predictions(preprocessed_docs, tokenized_docs)

In [44]:
# results.to_csv('bm25_results.csv')

In [45]:
!python relevanceeval.py annotation_store_py.csv bm25_results.csv

% of URLs in predictions that exist in the annotation dataset:
	python: 17.81%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
	python: 18.87%
NDCG:
	python: 0.257
NDCG (full ranking):
	python: 0.173


#### Preprocessing: removal of special chars and stopwords

In [31]:
# Corpus for the "special chars + stopwords removed" run.
# NOTE(review): BM25 needs one token list per document; if preprocessed_docs2 came from
# pd.read_csv, to_numpy() yields a 2-D array of raw cell values instead — confirm.
tokenized_docs2 = preprocessed_docs2.to_numpy()

In [32]:
# results2 = model_predictions(preprocessed_docs2, tokenized_docs2)

In [33]:
# results2.to_csv('bm25_results2.csv')

In [34]:
!python relevanceeval.py annotation_store_py.csv bm25_results2.csv

% of URLs in predictions that exist in the annotation dataset:
	python: 18.67%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
	python: 19.81%
NDCG:
	python: 0.271
NDCG (full ranking):
	python: 0.185


#### Preprocessing: removal of special chars and stopwords, include stemming

In [20]:
# Corpus for the "special chars + stopwords removed, stemmed" run.
# NOTE(review): BM25 needs one token list per document; if preprocessed_docs3 came from
# pd.read_csv, to_numpy() yields a 2-D array of raw cell values instead — confirm.
tokenized_docs3 = preprocessed_docs3.to_numpy()

In [21]:
# results3 = model_predictions(preprocessed_docs3, tokenized_docs3)

In [22]:
# results3.to_csv('bm25_results3.csv')

In [23]:
!python relevanceeval.py annotation_store_py.csv bm25_results3.csv

% of URLs in predictions that exist in the annotation dataset:
	python: 9.01%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
	python: 9.20%
NDCG:
	python: 0.133
NDCG (full ranking):
	python: 0.084
