In [1]:
import ir_datasets
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import pandas as pd
import pickle
import os
import re
from typing import List
import joblib
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [2]:

# Load the dataset
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")

# Create a corpus from the dataset
corpus = {}


for doc in dataset.docs_iter():
        corpus[doc.doc_id]= doc.title+" " + doc.summary+" " +doc.detailed_description+ " "+doc.eligibility


        
documents = list(corpus.values())

def custom_tokenizer(text: str) -> List[str]:
    """Tokenizes and lowercases the text."""
    tokens = word_tokenize(text.lower())
    return tokens

def get_wordnet_pos(tag):
    """Converts POS tag to a format that WordNetLemmatizer can understand."""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def remove_markers(tokens: List[str]) -> List[str]:
    """Removes specific markers from tokens."""
    return [re.sub(r'\u00AE', '', token) for token in tokens]

def remove_punctuation(tokens: List[str]) -> List[str]:
    """Removes punctuation from tokens."""
    return [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]

def replace_under_score_with_space(tokens: List[str]) -> List[str]:
    """Replaces underscores with spaces in tokens."""
    return [re.sub(r'_', ' ', token) for token in tokens]

def remove_apostrophe(tokens: List[str]) -> List[str]:
    """Removes apostrophes from tokens."""
    return [token.replace("'", " ") for token in tokens]

def preprocess_text(text: str) -> str:
    """Preprocesses the input text by tokenizing, removing punctuation, stopwords, and then stemming and lemmatizing."""
    # Convert text to lowercase and tokenize
    words = custom_tokenizer(text)
    
    # Remove punctuation
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
  
    # Further token cleaning
    words = remove_markers(words)
    words = replace_under_score_with_space(words)
    words = remove_apostrophe(words)
    
    # Stemming and Lemmatization
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    pos_tags = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]
    
    return ' '.join(words)


In [3]:

# Preprocess documents
processed_documents = [preprocess_text(doc) for doc in documents]

# Train Word2Vec model
word2vec_model = Word2Vec([doc.split() for doc in processed_documents], vector_size=100, sg=1, epochs=37)
# Save the Word2Vec model
word2vec_model.save("word2vec_model.kv")
# Load the Word2Vec model

word2vec_model = Word2Vec.load("word2vec_model.kv")






In [4]:

def vectorize_documents(documents: List[str]) -> List[np.ndarray]:
    document_vectors = []
    for document in documents:
        vectors = [word2vec_model.wv[token] for token in document.split() if token in word2vec_model.wv]
        if vectors:
            document_vectors.append(np.mean(vectors, axis=0))
        else:
            document_vectors.append(np.zeros(100))
    return document_vectors

# Compute document vectors
doc_vectors = vectorize_documents(processed_documents)
print(doc_vectors)
# Save and load functions for TF-IDF data
def save_file(file_location: str, content):
    with open(file_location, 'wb') as file:
        pickle.dump(content, file, protocol=pickle.HIGHEST_PROTOCOL)

def load_file(file_location: str):
    with open(file_location, 'rb') as file:
        loaded_file = pickle.load(file)
    return loaded_file
save_file("doc_vectors.pkl",doc_vectors)
doc_vectors = load_file("doc_vectors.pkl")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [5]:
def compute_relevance_scores(query_text: str) -> List[str]:
    """Compute relevance scores between a query vector and all document vectors."""
    query_tokens = preprocess_text(query_text)
    query_vec = vectorize_documents([query_tokens])[0].reshape(1, -1)
    similarities = cosine_similarity(doc_vectors, query_vec)
    top_10_indices = similarities.argsort(axis=0)[-10:][::-1].flatten()
    return [list(corpus.keys())[index] for index in top_10_indices]

In [6]:
def calculate_recall_precision(query_id):
    relevant_docs = []
    retrieved_docs = []
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id and qrel[2] > 0:
            relevant_docs.append(qrel[1])

    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = compute_relevance_scores(query[1])
            break

    truncated_retrieved_docs = retrieved_docs[:len(relevant_docs)]
    y_true = [1 if doc in relevant_docs else 0 for doc in retrieved_docs]
    true_positives = sum(1 for doc in truncated_retrieved_docs if doc in relevant_docs)
    recall_at_10 = true_positives / len(relevant_docs)
    precision_at_10 = true_positives / 10
    print(f"query id: {query_id}, Recall@10: {recall_at_10}")
    print(f"query id: {query_id}, Precision@10: {precision_at_10}")

    return recall_at_10

queries_ids = {qrel[0]: '' for qrel in dataset.qrels_iter()}

for query_id in queries_ids.keys():
    calculate_recall_precision(query_id)


query id: 1, Recall@10: 0.011834319526627219
query id: 1, Precision@10: 0.2
query id: 2, Recall@10: 0.025925925925925925
query id: 2, Precision@10: 0.7
query id: 3, Recall@10: 0.011904761904761904
query id: 3, Precision@10: 0.1
query id: 4, Recall@10: 0.0
query id: 4, Precision@10: 0.0
query id: 5, Recall@10: 0.01990049751243781
query id: 5, Precision@10: 0.4
query id: 6, Recall@10: 0.01904761904761905
query id: 6, Precision@10: 0.4
query id: 7, Recall@10: 0.031055900621118012
query id: 7, Precision@10: 0.5
query id: 8, Recall@10: 0.032679738562091505
query id: 8, Precision@10: 0.5
query id: 9, Recall@10: 0.014018691588785047
query id: 9, Precision@10: 0.3
query id: 10, Recall@10: 0.0
query id: 10, Precision@10: 0.0
query id: 11, Recall@10: 0.007936507936507936
query id: 11, Precision@10: 0.1
query id: 12, Recall@10: 0.020134228187919462
query id: 12, Precision@10: 0.3
query id: 13, Recall@10: 0.0
query id: 13, Precision@10: 0.0
query id: 14, Recall@10: 0.01020408163265306
query id: 14

In [7]:


def calculate_MAP(query_id):
    relevant_docs = [qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id]
    relevant_docs = []

    for query in dataset.queries_iter():
        if query[0] == query_id:
            relevant_docs = compute_relevance_scores(query[1])
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = sum(1 for j in range(i) if j < len(relevant_docs) and relevant_docs[j] in relevant_docs)
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(relevant_docs) and relevant_docs[i - 1] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(relevant_docs) and relevant_docs[i - 1] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

map_sum = sum(calculate_MAP(query_id) for query_id in queries_ids.keys())
print(map_sum / dataset.queries_count())

1.0


In [8]:
def calculate_MRR(query_id):
    relevant_docs = []
    for qrel in dataset.qrels_iter():
        if qrel.query_id == query_id and qrel.relevance > 0:
            relevant_docs.append(qrel.doc_id)
    
    retrieved_docs = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            retrieved_docs = compute_relevance_scores(query.text)
            break

    for i, result in enumerate(retrieved_docs):
        if result in relevant_docs:
            return 1 / (i + 1)

    return 0

queries_ids = {}
for qrel in dataset.qrels_iter():
    queries_ids.update({qrel.query_id: ''})

mrr_sum = 0
for query_id in list(queries_ids.keys()):
    mrr_sum += calculate_MRR(query_id)

print(f"Mean Reciprocal Rank : {mrr_sum / len(queries_ids)}")

Mean Reciprocal Rank : 0.4187407407407407


In [9]:
def calculate_MAP(query_id):
    relevant_docs = [qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id]
    ordered_results = []

    for query in dataset.queries_iter():
        if query[0] == query_id:
            ordered_results = compute_relevance_scores(query[1])
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = sum(1 for j in range(i) if j < len(ordered_results) and ordered_results[j] in relevant_docs)
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(ordered_results) and ordered_results[i - 1] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(ordered_results) and ordered_results[i - 1] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

map_sum = sum(calculate_MAP(query_id) for query_id in queries_ids.keys())
print(map_sum / dataset.queries_count())

0.5324335159989921
