---
# Importing packages/dependencies

In [1]:
import math
import pandas as pd
import numpy as np
import string
from sklearn.metrics import ndcg_score

# nltk related
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

---
# Data Files
For the code to work in Colab, add the following files from the dataset to a folder named 'data' inside Colab's 'sample_data' folder:

- doc_dump.txt
- stopwords.large
- train.3-2-1.qrel
- train.all.queries

In [2]:
!pwd # (print working directory to check path ipynb file is sitting in)

/content


## all four files from original dataset (insert reference):


### Loading document summaries

In [3]:
# doc_dump.txt: tab-separated file read without a header row; its four
# columns are named explicitly while loading.

filename = '/content/sample_data/data/doc_dump.txt'
doc = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['ID', 'URL', 'TITLE', 'ABSTRACT'],
)
doc.head(1)

Unnamed: 0,ID,URL,TITLE,ABSTRACT
0,MED-1,http://www.ncbi.nlm.nih.gov/pubmed/23092936,"Birth Weight, Head Circumference, and Prenatal...",Abstract Background: Acrylamide is a common di...


### Loading Stopwords

In [4]:
# stopwords.large: tab-separated file read without a header row; its single
# column is named STOP_WORDS and then flattened into a plain Python list.

filename = '/content/sample_data/data/stopwords.large'
stopwords_given = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['STOP_WORDS'],
)
stopwords_given_list = stopwords_given['STOP_WORDS'].tolist()
stopwords_given_list[0:10]

['RT',
 'USER',
 'URL',
 'a',
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across']

### Loading Relevance Judgements from dataset

`train.3-2-1.qrel` -- the relevance judgement file, which uses 4 relevance levels in total:

*   Direct links (3)
*   Indirect links (2)
*   Marginally relevant(1)
*   Others (0, not in the files)

In [5]:
# train.3-2-1.qrel: relevance judgements, tab-separated and read without a
# header row. The second column is given the literal name '0'.

filename = '/content/sample_data/data/train.3-2-1.qrel'
query_relevance = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['QUERY_ID', '0', 'DOC_ID', 'RELEVANCE_LEVEL'],
)

In [6]:
query_relevance.sample(3)

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
133193,PLAIN-3235,0,MED-5322,1
20359,PLAIN-676,0,MED-4535,2
21628,PLAIN-704,0,MED-3272,2


Filtering by Query ID of 'PLAIN-10' and Relevance Level of '3': 

In [7]:
query_relevance.loc[(query_relevance.RELEVANCE_LEVEL == 3) & (query_relevance.QUERY_ID == 'PLAIN-10')].sample(3)

Unnamed: 0,QUERY_ID,0,DOC_ID,RELEVANCE_LEVEL
374,PLAIN-10,0,MED-2493,3
372,PLAIN-10,0,MED-2495,3
375,PLAIN-10,0,MED-2497,3


### Loading all Queries from dataset

In [8]:
# train.all.queries: tab-separated file read without a header row; its two
# columns are named QUERY_ID and QUERY.

filename = '/content/sample_data/data/train.all.queries'
query = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=['QUERY_ID', 'QUERY'],
)
query.head(1)

Unnamed: 0,QUERY_ID,QUERY
0,PLAIN-10,how contaminated are our children ? in a study...


In [9]:
query.loc[query.QUERY_ID == 'PLAIN-120']

Unnamed: 0,QUERY_ID,QUERY
164,PLAIN-120,preloading with watercress before exercise cou...


---
# Joining doc titles to doc abstracts

In [10]:
# One text per document: the title, a single space, then the abstract.
doc['TITLE_ABSTRACT'] = doc['TITLE'] + ' ' + doc['ABSTRACT']
doc.head(1)

Unnamed: 0,ID,URL,TITLE,ABSTRACT,TITLE_ABSTRACT
0,MED-1,http://www.ncbi.nlm.nih.gov/pubmed/23092936,"Birth Weight, Head Circumference, and Prenatal...",Abstract Background: Acrylamide is a common di...,"Birth Weight, Head Circumference, and Prenatal..."


In [11]:
# quick example of end product
doc['TITLE_ABSTRACT'][9][0:150]

'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland Abstract Recent studies have suggested that statins, an established drug'

---
# Query Processing Function

Adjustments to queries are:
- Lowercased
- Tokenized
- Punctuation Removed
- Stopwords Removed
- Frequent Words Removed

In [12]:
from itertools import chain
# Tokens dropped during processing, in this order: every character of
# string.punctuation, the stop words loaded from the dataset, and the two
# words 'abstract' and 'background'.
final_list = list(chain(string.punctuation, stopwords_given_list, ['abstract', 'background']))
def processing(example, final_list):
  """
  Turn a piece of text into a list of retrieval terms.

  The text is lower-cased, split into tokens with nltk's ``word_tokenize``,
  and every token that appears in ``final_list`` is dropped.

  Parameters
  ----------
  example : str
      Raw text to process.
  final_list : iterable of str
      Tokens to discard. The comparison happens after the text has been
      lower-cased, so entries written in upper case (e.g. 'RT') are not
      expected to match anything.

  Returns
  -------
  list[str]
      The remaining tokens in their original order; repeated tokens are kept.
  """
  # Build the lookup once per call: a set gives constant-time membership
  # tests, whereas scanning a list costs O(len(final_list)) for every token.
  excluded = set(final_list)
  tokens = word_tokenize(example.lower())
  return [token for token in tokens if token not in excluded]

In [13]:
# quick example check
q_ex = "Abstract My Eyeballs are exploding & BLEEDING, Background feels like demons!!!"
processed = processing(q_ex, final_list)
processed

['eyeballs', 'exploding', 'bleeding', 'feels', 'demons']

---
# Processing applied to docs file:

In [14]:
# applying processing function to combined titles and abstracts of docs, and adding as separate column:
doc['PROCESSED_TITLE_ABSTRACT'] = doc['TITLE_ABSTRACT'].apply(processing, final_list=final_list)

In [15]:
doc.head(3)

Unnamed: 0,ID,URL,TITLE,ABSTRACT,TITLE_ABSTRACT,PROCESSED_TITLE_ABSTRACT
0,MED-1,http://www.ncbi.nlm.nih.gov/pubmed/23092936,"Birth Weight, Head Circumference, and Prenatal...",Abstract Background: Acrylamide is a common di...,"Birth Weight, Head Circumference, and Prenatal...","[birth, weight, head, circumference, prenatal,..."
1,MED-2,http://www.ncbi.nlm.nih.gov/pubmed/22809476,A statistical regression model for the estimat...,Abstract Human exposure to acrylamide (AA) thr...,A statistical regression model for the estimat...,"[statistical, regression, model, estimation, a..."
2,MED-3,http://www.ncbi.nlm.nih.gov/pubmed/19158207,Chronic intake of potato chips in humans incre...,Abstract BACKGROUND: Relatively high concentra...,Chronic intake of potato chips in humans incre...,"[chronic, intake, potato, chips, humans, incre..."


---
# Processing applied to queries

In [16]:
query['PROCESSED_QUERY'] = query['QUERY'].apply(processing, final_list=final_list)

In [17]:
query.sample(3)

Unnamed: 0,QUERY_ID,QUERY,PROCESSED_QUERY
2133,PLAIN-444,do pine needles from a douglas fir or redwood ...,"[pine, needles, douglas, fir, redwood, provide..."
2318,PLAIN-672,"benzodiazepines - - medications , side effects...","[benzodiazepines, medications, side, effects, ..."
2240,PLAIN-576,"antacids - - meat , medications , kidney failu...","[antacids, meat, medications, kidney, failure,..."


In [18]:
query.loc[query.QUERY_ID == 'PLAIN-120']

Unnamed: 0,QUERY_ID,QUERY,PROCESSED_QUERY
164,PLAIN-120,preloading with watercress before exercise cou...,"[preloading, watercress, exercise, exercise, c..."


---
---
# Omair's original query processing code (no functions)

## 1. Lowercased:

In [19]:
# doc['TITLE_ABSTRACT'] = doc['TITLE_ABSTRACT'].apply(lambda s:s.lower() if type(s) == str else s)
# doc.head(1)

## 2. Tokenization: 
Splitting the sentence into words. So that each word can be considered uniquely.

In [20]:
# doc['TOKENIZED_TEST'] = doc['TITLE_ABSTRACT'].apply(word_tokenize)
# doc['TOKENIZED_TEST']

## 3. Cleaning
The cleaning step removes all undesirable content.

### 3a. Punctuation removal

In [21]:
# doc['TOKENIZED_TEST'] = doc['TOKENIZED_TEST'].apply(lambda x: [item for item in x if item not in string.punctuation])
# display(f"Punctuation symbols: {string.punctuation}")
# doc.head(1)

### 3b. Stop words removal

In [22]:
# stop = set(stopwords.words('english'))
# display(f"Punctuation symbols: {stop}")

In [23]:
# doc['TEXT_WITHOUT_STOPWORDS'] = doc['TOKENIZED_TEST'].apply(lambda x: [item for item in x if item not in stop])
# doc.head(1)

### 3c. Frequent words removal

In [24]:
# doc['TEXT_WITHOUT_STOPWORDS'] = doc['TEXT_WITHOUT_STOPWORDS'].apply(lambda x: [item for item in x if item not in stopwords_given_list])
# frequent_words= {'abstract','background'}
# doc['TEXT_WITHOUT_STOPWORDS'] = doc['TEXT_WITHOUT_STOPWORDS'].apply(lambda x: [item for item in x if item not in frequent_words])
# doc.head(3)

In [25]:
# # first 20 tokens of the first paper's processed title and abstract
# doc['TEXT_WITHOUT_STOPWORDS'][0][0:20]

In [26]:
# # Omair's version

# # Applying the text processing to the 'query' file 

# query['QUERY'] = query['QUERY'].apply(lambda s:s.lower() if type(s) == str else s)
# query['TOKENIZED_TEST'] = query['QUERY'].apply(word_tokenize) 
# query['TOKENIZED_TEST'] = query['TOKENIZED_TEST'].apply(lambda x: [item for item in x if item not in string.punctuation])
# query['TEXT_WITHOUT_STOPWORDS'] = query['TOKENIZED_TEST'].apply(lambda x: [item for item in x if item not in stop])
# query['TEXT_WITHOUT_STOPWORDS'] = query['TEXT_WITHOUT_STOPWORDS'].apply(lambda x: [item for item in x if item not in stopwords_given_list])

---
---
# **Creating** Class for BM25 <br>
(code from lab - bm25_intro, 2021)

In [27]:
class BM25:
    """
    Best Match 25.

    Parameters
    ----------
    k1 : float, default 1.2
        Term-frequency saturation parameter of the scoring formula.

    b : float, default 0.75
        Weight given to the document-length ratio
        (``doc_len / avg_doc_len``) in the scoring formula.

    Attributes
    ----------
    tf_ : list[dict[str, int]]
        Term Frequency per document. So [{'hi': 1}] means
        the first document contains the term 'hi' 1 time.

    df_ : dict[str, int]
        Document Frequency per term. i.e. Number of documents in the
        corpus that contains the term.

    idf_ : dict[str, float]
        Inverse Document Frequency per term.

    doc_len_ : list[int]
        Number of terms per document. So [3] means the first
        document contains 3 terms.

    corpus_ : list[list[str]]
        The input corpus.

    corpus_size_ : int
        Number of documents in the corpus.

    avg_doc_len_ : float
        Average number of terms for documents in the corpus
        (0.0 when the corpus is empty).
    """

    def __init__(self, k1=1.2, b=0.75):
        self.b = b
        self.k1 = k1

    def fit(self, corpus):
        """
        Fit the various statistics that are required to calculate BM25 ranking
        score using the corpus given.

        Parameters
        ----------
        corpus : list[list[str]]
            Each element in the list represents a document, and each document
            is a list of the terms. Any iterable of term lists works; it is
            iterated exactly once.

        Returns
        -------
        self
        """
        tf = []
        df = {}
        doc_len = []
        for document in corpus:
            doc_len.append(len(document))

            # compute tf (term frequency) per document
            frequencies = {}
            for term in document:
                frequencies[term] = frequencies.get(term, 0) + 1
            tf.append(frequencies)

            # compute df (document frequency) per term: each distinct term
            # of this document counts once
            for term in frequencies:
                df[term] = df.get(term, 0) + 1

        corpus_size = len(tf)
        idf = {
            term: math.log(1 + (corpus_size - freq + 0.5) / (freq + 0.5))
            for term, freq in df.items()
        }

        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.doc_len_ = doc_len
        self.corpus_ = corpus
        self.corpus_size_ = corpus_size
        # An empty corpus has no average length; use 0.0 instead of dividing
        # by zero so that fit() and search() still work (search returns []).
        self.avg_doc_len_ = sum(doc_len) / corpus_size if corpus_size else 0.0
        return self

    def search(self, query):
        """
        Score every fitted document against a query.

        Parameters
        ----------
        query : list[str]
            The query terms.

        Returns
        -------
        list[float]
            One score per document, in the order the documents were fitted.
        """
        return [self._score(query, index) for index in range(self.corpus_size_)]

    def _score(self, query, index):
        """
        Compute the BM25 score of the document at position ``index``.

        Query terms that do not occur in the document contribute nothing;
        a term that is repeated in the query contributes once per repetition.
        """
        score = 0.0

        frequencies = self.tf_[index]
        # The length normalisation depends only on the document, so it is
        # computed lazily once and reused for every matching query term.
        length_norm = None
        for term in query:
            if term not in frequencies:
                continue

            if length_norm is None:
                doc_len = self.doc_len_[index]
                length_norm = self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_len_)

            freq = frequencies[term]
            numerator = self.idf_[term] * freq * (self.k1 + 1)
            denominator = freq + length_norm
            score += (numerator / denominator)

        return score

In [28]:
# Fit BM25 on the processed documents (fit() returns the model itself) and
# score every document against the first processed query.
bm25 = BM25().fit(doc['PROCESSED_TITLE_ABSTRACT'])
scores = bm25.search(query['PROCESSED_QUERY'][0])

# Pair each score with the document ID at the same position, then rank by score.
document_list = doc['ID'].tolist()
list_of_doc_score = list(zip(scores, document_list))
bm_rank = (
    pd.DataFrame(list_of_doc_score, columns=['Score', 'Doc_ID'])
    .sort_values(by="Score", ascending=False)
)
bm_rank.head(3)
#for score, document in zip(scores, document_list):
#    score = round(score, 3)
#    print(str(score) + '\t' + document)

Unnamed: 0,Score,Doc_ID
2493,806.98746,MED-2494
51,806.98746,MED-52
1158,806.98746,MED-1159


In [29]:
bm_rank.loc[bm_rank.Doc_ID == 'MED-2494']

Unnamed: 0,Score,Doc_ID
2493,806.98746,MED-2494


In [30]:
type(document_list)

list

In [31]:
query['QUERY_ID'][0]

'PLAIN-10'

In [32]:
len(doc['PROCESSED_TITLE_ABSTRACT'][0])

168

In [33]:
# # comment out to speed up running notebook as is slow to print every term:
# for document in doc['PROCESSED_TITLE_ABSTRACT']:
#   print(document)
#   for term in document:
#     print(term)

In [34]:
list_of_doc_score

[(228.00878947210632, 'MED-1'),
 (182.22133736296118, 'MED-2'),
 (121.17186903293947, 'MED-3'),
 (242.9447882723667, 'MED-4'),
 (141.64124820481354, 'MED-5'),
 (181.43917892894711, 'MED-6'),
 (236.77239119255614, 'MED-7'),
 (195.25041603639488, 'MED-8'),
 (86.97551268641074, 'MED-9'),
 (347.63273073158996, 'MED-10'),
 (102.88922916760755, 'MED-11'),
 (79.31713529210107, 'MED-12'),
 (97.79459797013637, 'MED-13'),
 (90.02518500213488, 'MED-14'),
 (226.07571701376514, 'MED-15'),
 (122.2656508762412, 'MED-16'),
 (183.17849925189935, 'MED-17'),
 (134.71821358727559, 'MED-18'),
 (467.4344902645315, 'MED-19'),
 (137.19070789883938, 'MED-20'),
 (119.70351510078663, 'MED-21'),
 (174.51151273334239, 'MED-22'),
 (81.37754249607364, 'MED-23'),
 (220.0339861623931, 'MED-24'),
 (171.56745524173934, 'MED-25'),
 (44.511942486860846, 'MED-26'),
 (263.3553240617706, 'MED-27'),
 (74.0839610698771, 'MED-28'),
 (176.31866296516915, 'MED-29'),
 (199.20087684256782, 'MED-30'),
 (270.59961930998156, 'MED-31')

---
--- 
# Sorting by Relevance
--- 

In [35]:
# Fit a BM25 model on the processed documents; fit() returns the model itself.
bm25 = BM25().fit(doc['PROCESSED_TITLE_ABSTRACT'])

def retrieve_ranking(query, model, doc):
  """
  Score all documents for one query and return them best-first.

  Parameters
  ----------
  query : passed unchanged to ``model.search``.
  model : object with a ``search(query)`` method returning one score per document.
  doc : object whose ``doc['ID']`` has a ``tolist()`` method yielding the
      document IDs (assumed to be in the same order as the scores).

  Returns
  -------
  list of (doc_id, score) tuples sorted by descending score; documents with
  equal scores keep their original relative order.
  """
  relevance = model.search(query)
  ranking = list(zip(doc['ID'].tolist(), relevance))
  ranking.sort(key=lambda pair: pair[1], reverse=True)
  return ranking

In [36]:
# Calculate normalized DCG (NDCG) at k using the
# sklearn library: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html

# The two inputs needed are y_true and y_score:
# the first comes from the qrels and the second from the document rankings.
# Both inputs are numpy arrays of shape (n_samples, n_labels) as described in the
# documentation; here n_samples = 1 (one query) and n_labels is the number of documents.

def scoring(query, qrels, model, doc_ids, i, k=4):
  """
  Compute NDCG@k of the model's ranking against the relevance judgements.

  Parameters
  ----------
  query : dict
      Maps a query ID to its processed query terms.
  qrels : iterable of tuples
      Relevance judgements whose first element is the query ID, followed by
      the document ID and the judgement.
  model : object
      Ranking model handed to ``retrieve_ranking``.
  doc_ids : iterable
      IDs of all documents. Documents without a judgement for a query are
      given relevance 0.
  i : unused
      Kept so that existing calls keep working.
  k : int, default 4
      Rank cut-off passed to ``ndcg_score``.

  Returns
  -------
  float or None
      The NDCG@k of the query. When ``query`` holds several queries the mean
      of their NDCG@k values is returned (previously the loop returned after
      the first query, silently ignoring the rest). ``None`` for an empty
      ``query``.

  Raises
  ------
  KeyError
      If a document ID from ``qrels`` or ``doc_ids`` is missing from the
      ranking returned by ``retrieve_ranking``.

  Notes
  -----
  The ranking is computed with the notebook-level ``doc`` DataFrame, not with
  ``doc_ids``.
  """
  ndcg_per_query = []
  for query_id, q in query.items():
    # (doc_id, judgement) pairs that belong to this query
    query_qrels = [qrel[1:] for qrel in qrels if qrel[0] == query_id]
    # a set makes the "already judged?" test below constant-time per document
    judged_ids = {entry[0] for entry in query_qrels}

    true_ranking = sorted(query_qrels, key=lambda tup: tup[1], reverse=True) # y_true
    # documents that did not appear in the qrels count as not relevant
    true_ranking += [(doc_id, 0) for doc_id in doc_ids if doc_id not in judged_ids]

    # model scores, aligned position by position with true_ranking -> y_score
    doc_ranking_dict = dict(retrieve_ranking(q, model, doc))
    predicted = [doc_ranking_dict[entry[0]] for entry in true_ranking]

    # k must be passed by keyword: it is a keyword-only parameter of
    # ndcg_score in current scikit-learn releases.
    ndcg = ndcg_score(
      np.array([[entry[1] for entry in true_ranking]]),
      np.array([predicted]),
      k=k)
    ndcg_per_query.append(ndcg)

  if not ndcg_per_query:
    return None
  return sum(ndcg_per_query) / len(ndcg_per_query)

In [37]:
# Turn the relevance judgements into a list of
# (QUERY_ID, DOC_ID, RELEVANCE_LEVEL) tuples, e.g. ('PLAIN-3', 'MED-2436', 3).

q_rel = query_relevance.drop(columns='0').to_numpy()
q_rel_tup = [tuple(row) for row in q_rel]
q_rel_tup[0:5]

[('PLAIN-3', 'MED-2436', 3),
 ('PLAIN-3', 'MED-2437', 3),
 ('PLAIN-3', 'MED-2438', 3),
 ('PLAIN-3', 'MED-2439', 3),
 ('PLAIN-3', 'MED-2440', 3)]

In [38]:
# NDCG Scores

# Number of queries evaluated: the rows of `query` with index labels 0 to 1306.
N_EVAL_QUERIES = 1307

ndcg_scores = []
for i in range(N_EVAL_QUERIES):
  # scoring() expects a {query_id: processed_terms} mapping, so each query
  # is wrapped in its own one-entry dict.
  single_query = {query['QUERY_ID'][i]: query['PROCESSED_QUERY'][i]}
  ndcg_scores.append(scoring(single_query, q_rel_tup, bm25, doc['ID'], i, 4))
  
def av(x):
    """Return the arithmetic mean of the numbers in ``x``.

    ``x`` must support ``sum`` and ``len``; an empty ``x`` raises
    ``ZeroDivisionError``.
    """
    total = sum(x)
    count = len(x)
    return total / count

av(ndcg_scores)

0.2917813275215476

---
# Adding UI


In [39]:
# #@title Medical Research Paper Search Engine
# query = 'Heart Attack'  #@param {type:"string"}
# before_year = 2021 #@param {type:"number"}
# after_year = 1970 #@param {type:"number"}
# before_year_slider = 2021 #@param {type:"slider", min:1970, max:2021, step:1}
# after_year_slider = 1970 #@param {type:"slider", min:1970, max:2021, step:1}

# query_terms = query.split(' ')


# # term_search = [{
# #                     "term": {
# #                         "body": t
# #                     }
# #                 } for t in query_terms]

# query_body = {
#     "query":{
#         "bool":{
#             "should": [
#                 {
#                     "term": {
#                         "body": query
#                     }
#                 },
#                 {
#                     "range":{
#                         "year": {
#                             "gte": after_year,
#                             "lte": before_year
#                         }
                        
#                     }
#                 }
#             ]
#         }
#     }
# }
# # query_body['query']['bool']['should'].append(term_search)
# results, plain_results = search(index_name, query_body)
# for doc_id, title, year, score in plain_results:
#   print(doc_id, title, year, score)

---
# **References**

*   Ethen8181.github.io. 2021. bm25_intro. [online] Available at: <http://ethen8181.github.io/machine-learning/search/bm25_intro.html> [Accessed 18 July 2021].
*   List item

