In [2]:
from spellchecker import SpellChecker
import ir_datasets
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter
from num2words import num2words
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import unicodedata
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List


# %load_ext autotime
# Handle for the TREC Clinical Trials 2021 collection; later cells iterate its
# documents (docs_iter), queries (queries_iter) and relevance judgements (qrels_iter).
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")

In [2]:
def convert_lower_case(data):
    """Lower-case ``data`` element-wise using NumPy's string routines.

    Args:
        data: A string or array-like of strings.

    Returns:
        The NumPy array produced by ``np.char.lower`` (0-d for a single string).
    """
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    """Drop English stop words and single-character tokens from ``data``.

    Args:
        data: Text (anything accepted by ``str()``) to filter.

    Returns:
        A string in which every kept token is preceded by one space
        (so a non-empty result starts with a space; no tokens gives "").
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    stop_words = set(stopwords.words('english'))
    kept = [
        w for w in word_tokenize(str(data))
        if w not in stop_words and len(w) > 1
    ]
    # Same " tok1 tok2" layout as before, built in one pass.
    return "".join(" " + w for w in kept)

def remove_punctuation(data):
    """Replace punctuation characters in ``data`` with spaces and delete commas.

    Every character in ``symbols`` (which includes the newline) is replaced by
    a single space; after each replacement one pass of "two spaces -> one
    space" is applied. Commas are removed outright at the end, so "1,000"
    becomes "1000". The apostrophe is not in ``symbols`` and is left alone.

    Args:
        data: A string or array-like of strings.

    Returns:
        The NumPy array returned by ``np.char.replace``.
    """
    # The backslash is written as "\\": a bare "\]" is an invalid escape
    # sequence and triggers a warning on current Python versions.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data
 
def remove_apostrophe(data):
    """Delete every ASCII apostrophe from ``data``.

    Returns:
        The NumPy array returned by ``np.char.replace``.
    """
    apostrophe, replacement = "'", ""
    return np.char.replace(data, apostrophe, replacement)

def stemming(data):
    """Apply the Porter stemmer to every token of ``data``.

    Returns:
        A string in which each stemmed token is preceded by one space.
    """
    porter = PorterStemmer()
    stems = (porter.stem(token) for token in word_tokenize(str(data)))
    return "".join(" " + stem for stem in stems)

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate a POS tag into the matching WordNet constant by its first letter.

    Tags starting with J, V, N or R map to ``wordnet.ADJ``, ``wordnet.VERB``,
    ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any other tag gives None.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None

def lemmatize(sentence):
    """Lemmatize ``sentence`` token by token, guided by POS tags.

    The sentence is tokenized and POS-tagged; tokens whose tag maps to a
    WordNet category are lemmatized with that category, all others are kept
    as they are.

    Returns:
        The resulting tokens joined with single spaces.
    """
    wn_lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(word_tokenize(str(sentence)))
    lemmas = []
    for token, pos in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        # No WordNet category for this tag: keep the surface form.
        lemmas.append(token if wn_pos is None else wn_lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

def convert_numbers(data):
    """Spell out integer tokens of ``data`` as words.

    Tokens that ``int()`` can parse are replaced by ``num2words`` output;
    all other tokens are kept unchanged. Hyphens in the final text are
    replaced by spaces (e.g. in "twenty-one").

    Args:
        data: Text (anything accepted by ``str()``).

    Returns:
        The NumPy array returned by ``np.char.replace``; each token in the
        underlying text is preceded by one space.
    """
    converted = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # ValueError: token is not an integer literal.
            # OverflowError: presumably what num2words raises for values
            # beyond its supported range - TODO confirm.
            # Either way the token is kept as-is (best effort).
            pass
        converted.append(w)
    new_text = "".join(" " + w for w in converted)
    return np.char.replace(new_text, "-", " ")

def remove_urls(data):
    """Strip URL-like substrings from ``data``.

    Three shapes are removed, each with an optional ``http(s)://`` or
    ``http(s)://www.`` prefix:
      1. a host of two or three alphabetic labels followed by ``/`` and a
         path segment of at least two alphanumerics,
      2. a bare host of two or three alphabetic labels,
      3. a host of three or four alphanumeric labels.

    The previous pattern carried JavaScript regex-literal delimiters
    (leading ``/`` and trailing ``/g``) which Python treats as literal
    characters, so shapes 1 and 3 only matched when those characters were
    present in the text.

    NOTE(review): shape 2 also matches non-URL text such as "Dr.Smith";
    that behaviour is unchanged here.

    Args:
        data: The text to clean (``str``).

    Returns:
        ``data`` with every match replaced by the empty string.
    """
    scheme = r'(?:https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?'
    url_pattern = '|'.join((
        scheme + r'[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?\/[a-zA-Z0-9]{2,}',
        scheme + r'[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})(?:\.[a-zA-Z]{2,})?',
        scheme + r'[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}(?:\.[a-zA-Z0-9]{2,})?',
    ))
    return re.sub(url_pattern, '', data)

def replace_contractions(data):
    """Expand English contractions in ``data`` (e.g. "don't" -> "do not").

    ``contractions.fix`` already returns the expanded text as one string;
    joining that string with spaces would insert a space between every
    character, so the result is returned directly.
    """
    return contractions.fix(data)

def correct_sentence_spelling(data):
    """Replace tokens the spell checker does not know with its best suggestion.

    Tokens reported by ``SpellChecker.unknown`` are swapped for
    ``SpellChecker.correction(token)`` when that returns a value; everything
    else is left untouched.

    Returns:
        The tokens joined with single spaces.
    """
    words = word_tokenize(str(data))
    checker = SpellChecker()
    unknown_words = checker.unknown(words)
    for position, word in enumerate(words):
        if word not in unknown_words:
            continue
        suggestion = checker.correction(word)
        if suggestion is not None:
            words[position] = suggestion
    return " ".join(words)

def custom_tokenizer(text: str) -> List[str]:
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)

def preprocess(data):
    """Run ``data`` through the full text-normalisation pipeline.

    The steps are applied strictly in the order listed below; several of them
    (stemming, number conversion, punctuation and stop-word removal) are
    applied more than once.

    Returns:
        The output of the final ``remove_stop_words`` step (a string).
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,
        remove_apostrophe,
        remove_stop_words,
        stemming,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,
        remove_punctuation,
        remove_stop_words,
    )
    for step in pipeline:
        data = step(data)
    return data


In [4]:
# NOTE(review): `counter` is not referenced again in the visible cells.
counter=0
# Map every document id to the text of its `condition` field.
corpus = {}
for doc in dataset.docs_iter():
    corpus[doc.doc_id] = doc.condition

# Document texts in the same order as corpus.keys(); row i of the TF-IDF
# matrix built from this list corresponds to the i-th key of `corpus`.
documents = list(corpus.values())
# NOTE(review): this displays the entire list, and the recorded output shows
# only empty strings - confirm that `condition` is the intended field.
documents

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',


In [4]:
# Vectorizer setup
# `preprocess` and `custom_tokenizer` are handed to the vectorizer, so they are
# applied by fit_transform here and by every later transform() call.
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, preprocessor=preprocess)
tfidf_matrix = vectorizer.fit_transform(documents)
# Sparse DataFrame view of the matrix: one row per document id, one column per term.
df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=vectorizer.get_feature_names_out(), index=corpus.keys())
# Alias under which the fitted vectorizer is saved and reloaded below.
tfidf_model = vectorizer





In [5]:

# Save and load functions for TF-IDF data
def save_file(file_location: str, content):
    """Pickle ``content`` to ``file_location``, replacing any existing file.

    Args:
        file_location: Destination path.
        content: Any picklable object.
    """
    already_present = os.path.exists(file_location)
    if already_present:
        os.remove(file_location)
    with open(file_location, 'wb') as out:
        pickle.dump(content, out, protocol=pickle.HIGHEST_PROTOCOL)



def load_file(file_location: str):
    """Read and return the pickled object stored at ``file_location``.

    Note: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(file_location, 'rb') as source:
        return pickle.load(source)




def save_tfidf_data(tfidf_matrix, tfidf_model, storage_dir=r"D:\ir-search-engine\storage"):
    """Pickle the TF-IDF matrix and the fitted vectorizer into ``storage_dir``.

    Args:
        tfidf_matrix: The document-term matrix to persist.
        tfidf_model: The fitted vectorizer to persist.
        storage_dir: Target directory. The default is the same location as
            before, written as a raw string because "\i" and "\s" are not
            valid escape sequences in a normal string literal.
    """
    save_file(os.path.join(storage_dir, "clinicaltrials_tfidf_matrix.pickle"), tfidf_matrix)
    save_file(os.path.join(storage_dir, "clinicaltrials_tfidf_model.pickle"), tfidf_model)


# Persist the matrix and fitted vectorizer so they can be reloaded below.
save_tfidf_data(tfidf_matrix, tfidf_model)



def process_query(query: str, tfidf_model, tfidf_matrix):
    """Score every document against ``query`` by cosine similarity.

    Args:
        query: Query text, vectorised with ``tfidf_model.transform``.
        tfidf_model: Fitted vectorizer.
        tfidf_matrix: Document-term matrix to compare against.

    Returns:
        A pair ``(ranking, scores)``: document row indices ordered from the
        highest to the lowest score, and the flat array of scores.
    """
    query_vector = tfidf_model.transform([query])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    ranking = np.argsort(scores)[::-1]
    return ranking, scores



# Reload the persisted artefacts. Raw strings keep the Windows path intact:
# "\i" and "\s" are invalid escape sequences in a normal string literal.
tfidf_matrix = load_file(r"D:\ir-search-engine\storage\clinicaltrials_tfidf_matrix.pickle")
tfidf_model = load_file(r"D:\ir-search-engine\storage\clinicaltrials_tfidf_model.pickle")



def getRetrievedQueries(query: str, k=10):
    """Return the ids of the ``k`` documents ranked highest for ``query``.

    The raw query is passed to ``process_query``: the vectorizer in this
    notebook is built with ``preprocessor=preprocess``, so ``transform``
    already preprocesses the text. Preprocessing it here as well would
    normalise queries twice while documents are normalised once.

    Args:
        query: Raw query text.
        k: Number of document ids to return.

    Returns:
        A list of at most ``k`` document ids, best match first.
    """
    ranked_indices, _ = process_query(query, tfidf_model, tfidf_matrix)
    # Matrix rows follow the insertion order of `corpus`; build the id list
    # once rather than once per returned document.
    doc_ids = list(corpus.keys())
    return [doc_ids[idx] for idx in ranked_indices[:k]]




def calculate_recall_precision(query_id):
    """Print Recall@10 and Precision@10 for one query and return the recall.

    A document counts as relevant when a qrel for ``query_id`` has a
    relevance value greater than 0. Precision always divides by 10.

    Returns:
        Recall@10 (0 when the query has no relevant documents).
    """
    relevant_docs = [
        qrel[1]
        for qrel in dataset.qrels_iter()
        if qrel[0] == query_id and qrel[2] > 0
    ]

    # First query whose id matches; nothing is retrieved when none does.
    matching_query = next(
        (query for query in dataset.queries_iter() if query[0] == query_id),
        None,
    )
    retrieved_docs = [] if matching_query is None else getRetrievedQueries(matching_query[1])

    true_positives = sum(1 for doc_id in retrieved_docs if doc_id in relevant_docs)
    if relevant_docs:
        recall_at_10 = true_positives / len(relevant_docs)
    else:
        recall_at_10 = 0
    precision_at_10 = true_positives / 10
    print(f"Query ID:  {query_id}, Recall@10: {recall_at_10}")
    print(f"Query ID: {query_id}, Precision@10: {precision_at_10}")
    return recall_at_10



# Distinct query ids in first-seen order (the dict is used as an ordered set).
queries_ids = dict.fromkeys((qrel[0] for qrel in dataset.qrels_iter()), '')

# Report Recall@10 / Precision@10 for every judged query.
for query_id in queries_ids:
    calculate_recall_precision(query_id)



def calculate_MAP(query_id):
    """Compute the average precision over the top 10 results of one query.

    For every rank (1-based) at which a relevant document appears, the
    precision at that rank is added up; the sum is divided by the number of
    relevant documents found in the top 10.

    NOTE(review): the divisor is the number of relevant documents *retrieved*
    in the top 10, not the total number of relevant documents for the query -
    confirm this normalisation is intended.

    Returns:
        The average precision, or 0 when no relevant document is retrieved.
    """
    # A set makes the per-rank membership test constant time.
    relevant_docs = {
        qrel[1]
        for qrel in dataset.qrels_iter()
        if qrel[0] == query_id and qrel[2] > 0
    }

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = getRetrievedQueries(query[1])
            break

    pk_sum = 0
    hits = 0
    # A running hit count replaces recounting the prefix at every rank.
    for rank, doc_id in enumerate(retrieved_docs[:10], start=1):
        if doc_id in relevant_docs:
            hits += 1
            pk_sum += hits / rank

    return 0 if hits == 0 else pk_sum / hits


# Distinct query ids in first-seen order (the dict is used as an ordered set).
queries_ids = dict.fromkeys((qrel[0] for qrel in dataset.qrels_iter()), '')

# Accumulate the per-query average precision, then report the mean.
map_sum = 0
for query_id in queries_ids:
    map_sum += calculate_MAP(query_id)

print(f"Mean Average Precision (MAP@10): {map_sum / len(queries_ids)}")



Query ID:  1, Recall@10: 0.05917159763313609
Query ID: 1, Precision@10: 1.0
Query ID:  2, Recall@10: 0.02962962962962963
Query ID: 2, Precision@10: 0.8
Query ID:  3, Recall@10: 0.03571428571428571
Query ID: 3, Precision@10: 0.3
Query ID:  4, Recall@10: 0.042105263157894736
Query ID: 4, Precision@10: 0.4
Query ID:  5, Recall@10: 0.03980099502487562
Query ID: 5, Precision@10: 0.8
Query ID:  6, Recall@10: 0.0380952380952381
Query ID: 6, Precision@10: 0.8
Query ID:  7, Recall@10: 0.043478260869565216
Query ID: 7, Precision@10: 0.7
Query ID:  8, Recall@10: 0.0457516339869281
Query ID: 8, Precision@10: 0.7
Query ID:  9, Recall@10: 0.037383177570093455
Query ID: 9, Precision@10: 0.8
Query ID:  10, Recall@10: 0.0
Query ID: 10, Precision@10: 0.0
Query ID:  11, Recall@10: 0.07142857142857142
Query ID: 11, Precision@10: 0.9
Query ID:  12, Recall@10: 0.04697986577181208
Query ID: 12, Precision@10: 0.7
Query ID:  13, Recall@10: 0.028985507246376812
Query ID: 13, Precision@10: 0.4
Query ID:  14, Rec

In [6]:
def calculate_MRR(query_id):
    """Return the reciprocal rank of the first relevant document in the top 10.

    A document counts as relevant when a qrel for ``query_id`` has a
    relevance value greater than 0.

    Returns:
        ``1 / rank`` of the first relevant result (rank is 1-based), or 0
        when none of the first 10 results is relevant, fewer than 10 results
        come back without a hit, or the query id is not found.
    """
    relevant_docs = {
        qrel[1]
        for qrel in dataset.qrels_iter()
        if qrel[0] == query_id and qrel[2] > 0
    }

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = getRetrievedQueries(query[1])
            break

    # Slicing bounds the scan to what was actually retrieved, so a short or
    # empty result list no longer raises IndexError.
    for rank, doc_id in enumerate(retrieved_docs[:10], start=1):
        if doc_id in relevant_docs:
            return 1 / rank

    return 0


# Average the per-query reciprocal ranks over all judged queries.
queries_list = [*queries_ids]
mrr_sum = 0
for query_id in queries_list:
    mrr_sum = mrr_sum + calculate_MRR(query_id)
print(f"Mean Reciprocal Rank (MRR): {(1 / len(queries_list)) * mrr_sum}")


Mean Reciprocal Rank (MRR): 0.6773650793650796
