In [4]:
import ir_datasets
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import pandas as pd
import pickle
import os
import re
from typing import List
import joblib
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [5]:

# Load the TREC Clinical Trials 2021 collection through ir_datasets.
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")

# Build the corpus from the first 5000 documents yielded by the dataset:
# doc_id -> title, summary, detailed description and eligibility joined by spaces.
corpus = {}
counter = 0

for doc in dataset.docs_iter():
    if counter >= 5000:
        break
    corpus[doc.doc_id] = " ".join(
        (doc.title, doc.summary, doc.detailed_description, doc.eligibility)
    )
    counter += 1

# Raw document texts, in the same order as the corpus keys.
documents = list(corpus.values())

def custom_tokenizer(text: str) -> List[str]:
    """Lowercase ``text`` and split it into tokens with ``word_tokenize``."""
    return word_tokenize(text.lower())

def get_wordnet_pos(tag):
    """Map a POS tag to the WordNet POS constant the lemmatizer expects.

    Only the first character of ``tag`` (upper-cased) is inspected: J -> adjective,
    V -> verb, R -> adverb. Every other tag, including N, maps to ``wordnet.NOUN``.
    """
    first_letter = tag[0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    return wordnet.NOUN

def remove_markers(tokens: List[str]) -> List[str]:
    """Strip the registered-trademark sign (U+00AE) out of every token."""
    registered_sign = re.compile(r'\u00AE')
    cleaned = []
    for token in tokens:
        cleaned.append(registered_sign.sub('', token))
    return cleaned

def remove_punctuation(tokens: List[str]) -> List[str]:
    """Delete every ASCII punctuation character from each token.

    Args:
        tokens: Tokens to clean.

    Returns:
        A new list with the same length as ``tokens``. A token made up only of
        punctuation becomes an empty string; it is not dropped.
    """
    # Build the translation table once instead of once per token.
    strip_table = str.maketrans('', '', string.punctuation)
    return [token.translate(strip_table) for token in tokens]

def replace_under_score_with_space(tokens: List[str]) -> List[str]:
    """Return the tokens with each underscore swapped for a single space."""
    underscore = re.compile(r'_')
    return list(map(lambda token: underscore.sub(' ', token), tokens))

def remove_apostrophe(tokens: List[str]) -> List[str]:
    """Replace each straight apostrophe (') in every token with a single space."""
    result = []
    for token in tokens:
        result.append(token.replace("'", " "))
    return result

# Lazily filled cache so the stop-word list, stemmer and lemmatizer are built
# once instead of on every preprocess_text call.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Create the stop-word set, stemmer and lemmatizer on first use and reuse them afterwards."""
    if not _PREPROCESSING_RESOURCES:
        # Build everything before publishing so a failure (e.g. missing NLTK
        # data) never leaves a half-filled cache behind.
        built = {
            "stop_words": frozenset(stopwords.words('english')),
            "stemmer": PorterStemmer(),
            "lemmatizer": WordNetLemmatizer(),
        }
        _PREPROCESSING_RESOURCES.update(built)
    return _PREPROCESSING_RESOURCES


def preprocess_text(text: str) -> str:
    """Normalize raw text into a space-separated string of processed tokens.

    Steps, in order: lowercase and tokenize, strip ASCII punctuation, drop
    English stop words, remove registered-trademark markers, discard tokens
    that ended up empty, Porter-stem, then WordNet-lemmatize using POS tags
    computed on the stemmed tokens.

    Args:
        text: Raw document or query text.

    Returns:
        The processed tokens joined by single spaces ('' when nothing survives).
    """
    resources = _get_preprocessing_resources()
    stop_words = resources["stop_words"]
    stemmer = resources["stemmer"]
    lemmatizer = resources["lemmatizer"]

    # Convert text to lowercase and tokenize
    words = custom_tokenizer(text)

    # Remove punctuation (shared helper instead of a duplicated inline copy)
    words = remove_punctuation(words)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Further token cleaning.
    # NOTE(review): string.punctuation already contains "_" and "'", so after
    # the punctuation step the underscore/apostrophe helpers have nothing left
    # to replace; they are kept so the pipeline order stays unchanged.
    words = remove_markers(words)
    words = replace_under_score_with_space(words)
    words = remove_apostrophe(words)

    # Tokens that were pure punctuation or markers are now empty strings; drop
    # them so they are not stemmed, POS-tagged or joined into the output.
    words = [word for word in words if word.strip()]
    if not words:
        return ''

    # Stemming and Lemmatization
    words = [stemmer.stem(word) for word in words]

    pos_tags = pos_tag(words)
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]

    return ' '.join(words)


In [None]:

# Preprocess documents
processed_documents = [preprocess_text(doc) for doc in documents]

# Train a skip-gram (sg=1) Word2Vec model. The constructor takes an iterable of
# token lists and trains in that single call, so its result is already the
# trained model.
tokenized_documents = [doc.split() for doc in processed_documents]
word2vec_model = Word2Vec(tokenized_documents, vector_size=1500, sg=1, epochs=50)

# Save the Word2Vec model
word2vec_model.save("word2vec_model.kv")

# Load the Word2Vec model
word2vec_model = Word2Vec.load("word2vec_model.kv")






In [None]:

def vectorize_documents(documents: List, model=None) -> np.ndarray:
    """Embed each document as the mean of its tokens' word vectors.

    Args:
        documents: One entry per document. An entry may be a list of tokens or
            a whitespace-separated string (the format ``preprocess_text``
            returns); strings are split on whitespace so that whole tokens,
            not single characters, are looked up.
        model: Object exposing the word vectors as ``model.wv``. Defaults to
            the notebook-level ``word2vec_model``.

    Returns:
        Array of shape ``(len(documents), vector_size)``. A document without
        any in-vocabulary token is represented by a zero vector.
    """
    if model is None:
        model = word2vec_model
    word_vectors = model.wv
    # Take the dimensionality from the model itself so the zero-vector fallback
    # always matches the trained vectors.
    dimension = word_vectors.vector_size

    documents_vectors = []
    for document in documents:
        tokens = document.split() if isinstance(document, str) else document
        known_vectors = [word_vectors[token] for token in tokens if token in word_vectors]
        if known_vectors:
            documents_vectors.append(np.asarray(known_vectors).mean(axis=0))
        else:
            documents_vectors.append(np.zeros(dimension))

    if not documents_vectors:
        # Keep a 2-D result even when there is nothing to embed.
        return np.zeros((0, dimension))
    return np.array(documents_vectors)



# Compute document vectors.
# preprocess_text returns one whitespace-joined string per document, so split
# each into tokens first: vectorize_documents iterates over every document and
# looks each item up in the model, and iterating a raw string would yield
# single characters instead of tokens.
doc_vectors = vectorize_documents([doc.split() for doc in processed_documents])
print(doc_vectors.shape)
print(doc_vectors)


# Save and load helpers for pickled objects (used below for the document vectors)
def save_file(file_location: str, content):
    """Pickle ``content`` to ``file_location`` using the highest pickle protocol."""
    serialized = pickle.dumps(content, protocol=pickle.HIGHEST_PROTOCOL)
    with open(file_location, mode='wb') as handle:
        handle.write(serialized)

def load_file(file_location: str):
    """Open ``file_location`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only load files from a trusted source.
    """
    with open(file_location, mode='rb') as handle:
        return pickle.load(handle)
# Persist the document vectors, then read them back from the same pickle file.
DOC_VECTORS_PATH = "doc_vectors.pkl"
save_file(DOC_VECTORS_PATH, doc_vectors)
doc_vectors = load_file(DOC_VECTORS_PATH)

In [None]:

    

def compute_relevance_scores(query_text: str, top_k: int = 10) -> List[str]:
    """Return the ids of the documents most similar to a query.

    The query is preprocessed and embedded the same way as the documents, then
    compared with every row of ``doc_vectors`` by cosine similarity.

    Args:
        query_text: Raw query text.
        top_k: Number of document ids to return (default 10).

    Returns:
        Up to ``top_k`` document ids from ``corpus``, most similar first.
    """
    # preprocess_text returns a whitespace-joined string; split it so the
    # vectorizer receives a list of tokens rather than a string of characters.
    query_tokens = preprocess_text(query_text).split()
    query_vec = vectorize_documents([query_tokens])[0].reshape(1, -1)
    similarities = cosine_similarity(doc_vectors, query_vec).flatten()

    # Highest similarity first, then keep the leading top_k positions.
    top_indices = similarities.argsort()[::-1][:top_k]

    # Row i of doc_vectors corresponds to the i-th key of corpus; materialize
    # the key list once rather than once per returned result.
    doc_ids = list(corpus.keys())
    return [doc_ids[index] for index in top_indices]


In [None]:
def compute_precision_recall_at_k(relevant_docs, retrieved_docs, k):
    """Compute and print precision@k and recall@k for one ranked result list.

    Args:
        relevant_docs: Ids of the documents judged relevant for the query.
        retrieved_docs: Ranked ids returned by the retrieval system.
        k: Cut-off rank; only ``retrieved_docs[:k]`` is evaluated.

    Returns:
        ``(precision_at_k, recall_at_k)``. Recall is 0.0 when there are no
        relevant documents and precision is 0.0 when ``k`` is not positive,
        instead of raising ZeroDivisionError.
    """
    relevant_lookup = set(relevant_docs)
    true_positives = sum(1 for doc_id in retrieved_docs[:k] if doc_id in relevant_lookup)

    recall_at_k = true_positives / len(relevant_docs) if relevant_docs else 0.0
    precision_at_k = true_positives / k if k > 0 else 0.0

    print(f"Recall@{k}: {recall_at_k}")
    print(f"Precision@{k}: {precision_at_k}")
    return precision_at_k, recall_at_k



In [None]:
def calculate_recall_precision(query_id):
    """Evaluate precision@10 and recall@10 for a single query.

    Relevant documents are the qrels for ``query_id`` with relevance > 0; the
    retrieved documents come from ``compute_relevance_scores`` applied to the
    text of the matching query (an empty list when no query matches).

    Args:
        query_id: Id of the query to evaluate.

    Returns:
        The ``(precision_at_k, recall_at_k)`` tuple produced by
        ``compute_precision_recall_at_k`` with k = 10.
    """
    relevant_docs = [
        qrel.doc_id
        for qrel in dataset.qrels_iter()
        if qrel.query_id == query_id and qrel.relevance > 0
    ]

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            retrieved_docs = compute_relevance_scores(query.text)
            break

    # Hand the metrics back to the caller instead of discarding them.
    return compute_precision_recall_at_k(relevant_docs, retrieved_docs, 10)

# Evaluate queries: collect every judged query id once, in first-seen order,
# then report precision/recall for each of them.
queries_ids = dict.fromkeys((qrel.query_id for qrel in dataset.qrels_iter()), '')

for query_id in queries_ids:
    calculate_recall_precision(query_id)

In [None]:

def calculate_MAP(query_id):
    """Compute the average precision over the top 10 results of one query.

    Relevant documents are the qrels for ``query_id`` with relevance > 0; the
    ranked results come from ``compute_relevance_scores`` applied to the text
    of the matching query (an empty list when no query matches).

    Args:
        query_id: Id of the query to evaluate.

    Returns:
        The sum of precision@rank over the ranks (1-10) that hold a relevant
        document, divided by the number of such ranks; 0 when none of the top
        10 results is relevant.
    """
    # Get relevant documents for the query (a set gives O(1) membership tests)
    relevant_docs = {
        qrel.doc_id
        for qrel in dataset.qrels_iter()
        if qrel.query_id == query_id and qrel.relevance > 0
    }

    # Get retrieved documents for the query
    retrieved_docs = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            retrieved_docs = compute_relevance_scores(query.text)
            break

    # Single pass with a running hit count: precision at a relevant rank is
    # (relevant documents seen so far) / rank.
    # NOTE(review): the normaliser is the number of relevant documents found in
    # the top 10, not the total number of relevant documents for the query;
    # confirm this is the intended average-precision definition.
    hits = 0
    pk_sum = 0
    for rank, doc_id in enumerate(retrieved_docs[:10], start=1):
        if doc_id in relevant_docs:
            hits += 1
            pk_sum += hits / rank

    return 0 if hits == 0 else pk_sum / hits

# Collect every judged query id once, in first-seen order. The named field is
# used (rather than positional index 0) to match the other evaluation cells.
queries_ids = {qrel.query_id: '' for qrel in dataset.qrels_iter()}

map_sum = 0
for query_id in list(queries_ids.keys()):
    map_sum += calculate_MAP(query_id)

# Guard the mean so an empty qrels set reports 0 instead of dividing by zero.
mean_average_precision = map_sum / len(queries_ids) if queries_ids else 0
print(f"Mean Average Precision : {mean_average_precision}")

In [None]:

def calculate_MRR(query_id):
    """Return the reciprocal rank of the first relevant result for one query.

    Relevant documents are the qrels for ``query_id`` with relevance > 0. The
    ranked results come from ``compute_relevance_scores`` applied to the text
    of the first query whose id matches. Returns 0 when no retrieved document
    is relevant (or when no query matches).
    """
    relevant_docs = [
        qrel.doc_id
        for qrel in dataset.qrels_iter()
        if qrel.query_id == query_id and qrel.relevance > 0
    ]

    matching_query = next(
        (query for query in dataset.queries_iter() if query.query_id == query_id),
        None,
    )
    if matching_query is None:
        retrieved_docs = []
    else:
        retrieved_docs = compute_relevance_scores(matching_query.text)

    # Ranks are 1-based: the first hit at position ``rank`` scores 1 / rank.
    for rank, result in enumerate(retrieved_docs, start=1):
        if result in relevant_docs:
            return 1 / rank

    return 0

# Collect every judged query id once, in first-seen order.
queries_ids = dict.fromkeys((qrel.query_id for qrel in dataset.qrels_iter()), '')

# Accumulate the reciprocal rank of each query, then report the mean.
mrr_sum = 0
for query_id in queries_ids:
    mrr_sum = mrr_sum + calculate_MRR(query_id)

mean_reciprocal_rank = mrr_sum / len(queries_ids)
print(f"Mean Reciprocal Rank : {mean_reciprocal_rank}")