In [1]:
import ir_datasets
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk import pos_tag


In [2]:
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")


In [3]:
corpus = {}
counter = 0

for doc in dataset.docs_iter():
  
        corpus[doc.doc_id]= doc.title+" " + doc.summary+" " +doc.detailed_description+ " "+doc.eligibility
 

In [4]:
 documents = list(corpus.values())
documents

['Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets \n    \n      This study will test the ability of extended release nifedipine (Procardia XL), a blood\r\n      pressure medication, to permit a decrease in the dose of glucocorticoid medication children\r\n      take to treat congenital adrenal hyperplasia (CAH).\r\n    \n   \n    \n      This protocol is designed to assess both acute and chronic effects of the calcium channel\r\n      antagonist, nifedipine, on the hypothalamic-pituitary-adrenal axis in patients with\r\n      congenital adrenal hyperplasia. The multicenter trial is composed of two phases and will\r\n      involve a double-blind, placebo-controlled parallel design. The goal of Phase I is to examine\r\n      the ability of nifedipine vs. placebo to decrease adrenocorticotropic hormone (ACTH) levels,\r\n      as well as to begin to assess the dose-dependency of nifedipine effects. The goal of Phase II\r\n      is to evaluate the long-term effects o

In [5]:
def custom_tokenizer(text: str) -> list[str]:
    # Tokenize text using your custom logic
    tokens = word_tokenize(text.lower())
    return tokens

In [6]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from spellchecker import SpellChecker
import string

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')

def get_wordnet_pos(tag):
    # """Converts POS tag to a format that WordNetLemmatizer can understand."""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def correct_sentence_spelling(tokens):
    # """Corrects spelling of tokens."""
    spell = SpellChecker()
    misspelled = spell.unknown(tokens)
    for i, token in enumerate(tokens):
        if token in misspelled:
            corrected = spell.correction(token)
            if corrected is not None:
                tokens[i] = corrected
    return tokens

def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # Tokenization
    words = word_tokenize(text)
    
    # # Spell Checking
    # words = correct_sentence_spelling(words)
    
    # Remove Punctuation
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    
    # Remove Stop Words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Part of Speech Tagging
    pos_tags = pos_tag(stemmed_words)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]
    
    return ' '.join(lemmatized_words)


In [7]:
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, preprocessor=preprocess_text)

# Fit the vectorizer to the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# df = pd.DataFrame(tfidf_matrix, columns=vectorizer.get_feature_names_out(), index=corpus.keys())


df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=vectorizer.get_feature_names_out(), index=corpus.keys())

print(df)



               0  000  0002  000mm  000mm3  001  002  0025  0039  004  ...  \
NCT00000102  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00000104  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00000105  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00000106  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00000107  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
...          ...  ...   ...    ...     ...  ...  ...   ...   ...  ...  ...   
NCT00003469  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00003470  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00003471  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00003472  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   
NCT00003473  0.0  0.0   0.0    0.0     0.0  0.0  0.0   0.0   0.0  0.0  ...   

               •    ≤  ≤15   ≤2    ≥  ≥1000mgm2day  ≥1800ml   ≥

In [8]:
import pickle
import os
# from utils.files_handling.files_locations import FilesLocations
from gensim.models import Word2Vec
def save_file(file_location: str, content):
        if os.path.exists(file_location):
            os.remove(file_location)
        with open(file_location, 'wb') as handle:
            pickle.dump(content, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_file(file_location: str):
        with open(file_location, 'rb') as handle:
            content = pickle.load(handle)
        return content

def save_tfidf_data(tfidf_matrix, tfidf_model):
        # if FilesLocations.DATASET_NAME.value == "antique":
            save_file("tfidf_matrix.pickle", tfidf_matrix)
            save_file("tfidf_model.pickle", tfidf_model)


In [9]:
tfidf_model = vectorizer


In [10]:
save_tfidf_data( tfidf_matrix, tfidf_model)    

In [11]:
tfidf_matrix =  load_file("tfidf_matrix.pickle")

In [12]:
import ir_datasets
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")
counter=1
for query in dataset.queries_iter():
    print(query.text) # namedtuple<query_id, text>
    print()
    counter+=1
    if counter >= 1:
      break


Patient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle. 




In [13]:
query

GenericQuery(query_id='1', text='\nPatient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle. \n')

In [15]:
query

GenericQuery(query_id='1', text='\nPatient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle. \n')

In [16]:
similarities = cosine_similarity(tfidf_matrix, query_vector)
        sorted_indices = similarities.argsort(axis=0)[::-1][self.start_index:self.end_index].flatten()

        top_indices = []
        for i in sorted_indices:
            if similarities[i] >= self.threshold:
                top_indices.append(i.item())

        col = get_mongo_client()["IR"][dataset]
        unordered_results = list(col.find({'index': {'$in': top_indices}}))
        return sorted(unordered_results, key=lambda x: top_indices.index(x['index']))

IndentationError: unexpected indent (3532667440.py, line 2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assume 'query_text' is the user's query
query_tfidf = vectorizer.transform([query.text])  # Use the query text from your dataset

# Compute cosine similarity between query and document vectors
cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix)

# Get the indices of top-k most similar documents
k = 10  # Choose the number of top results you want
top_indices = cosine_similarities.argsort()[0][-k:][::-1]

# Retrieve the relevant documents based on the top indices
relevant_documents = [doc.doc_id for doc in dataset.docs_iter()][top_indices]

print("Top relevant documents:", relevant_documents)

In [None]:
import numpy as np

def retrieve_and_rank(query_text, tfidf_matrix, corpus):
    # Preprocess the query text
    preprocessed_query = preprocess_text(query_text)
    
    # Transform the query into a TF-IDF vector
    query_vector = tfidf_model.transform([preprocessed_query])
    
    # Calculate cosine similarity between query and documents
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)
    
    # Get indices of top-k documents (e.g., top 10)
    top_k_indices = np.argsort(similarity_scores[0])[::-1][:10]
    
    # Retrieve relevant documents
    search_results = [(doc_id, corpus[doc_id]) for doc_id in corpus.keys() if doc_id in top_k_indices]
    
    return search_results


In [None]:

# Example usage
user_query = query.text
search_results = retrieve_and_rank(user_query, tfidf_matrix, corpus)
for rank, (doc_id, content) in enumerate(search_results, start=1):
    print(f"Rank {rank}: {doc_id} - {content[:100]}...")
print("DDD")    


In [None]:
query

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def process_query(query: str, tfidf_model, tfidf_matrix):
    query_tfidf = tfidf_model.transform([query])
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    ranked_doc_indices = cosine_similarities.argsort()[::-1]
    return ranked_doc_indices, cosine_similarities

# Load the saved TF-IDF matrix and model
tfidf_matrix = load_file("tfidf_matrix.pickle")
tfidf_model = load_file("tfidf_model.pickle")

# Example query
queryTest = query.text
preprocessed_query = preprocess_text(queryTest)
ranked_indices, similarities = process_query(preprocessed_query, tfidf_model, tfidf_matrix)

# Display the top 10 results
for idx in ranked_indices[:10]:
    doc_id = list(corpus.keys())[idx]
    similarity_score = similarities[idx]
    ranked_indices_score = ranked_indices[idx]
    document_content = corpus[doc_id]
    print(f"Doc ID: {doc_id}, Similarity: {similarity_score:.4f}")
    print(f"Content: {document_content[:500]}...")  # Display the first 500 characters for brevity
    print(f"ranked_indices_score: {ranked_indices_score}")  # Display the first 500 characters for brevity

    print("\n" + "-"*80 + "\n")

Doc ID: NCT00003176, Similarity: 0.3128
Content: Temozolomide and Carmustine in Treating Patients With Anaplastic Glioma 
    
      RATIONALE: Drugs used in chemotherapy use different ways to stop tumor cells from dividing so
      they stop growing or die. Combining more than one drug may kill more tumor cells.

      PURPOSE: Phase II trial to study the effectiveness of temozolomide and carmustine in treating
      patients with anaplastic glioma.
    
   
    
      OBJECTIVES: I. Evaluate the activity, measured in terms of progressio...
ranked_indices_score: 53

--------------------------------------------------------------------------------

Doc ID: NCT00003470, Similarity: 0.2736
Content: Antineoplaston Therapy in Treating Patients With Anaplastic Astrocytoma 
    
      RATIONALE: Current therapies for adults with anaplastic astrocytomas that have not responded
      to standard therapy provide very limited benefit to the patient. The anti-cancer properties
      of Antineoplas

In [20]:
import ir_datasets
dataset = ir_datasets.load("clinicaltrials/2021/trec-ct-2021")
counter=1

for qrel in dataset.qrels_iter():
   print( qrel) # namedtuple<query_id, doc_id, relevance, iteration>
   print()
   counter+=1
   if counter >= 100:
    break

TrecQrel(query_id='1', doc_id='NCT00002569', relevance=1, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00002620', relevance=1, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00002806', relevance=0, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00002814', relevance=2, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003022', relevance=1, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003176', relevance=1, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003372', relevance=1, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003375', relevance=1, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003465', relevance=1, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003466', relevance=0, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003470', relevance=2, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003471', relevance=2, iteration='0')

TrecQrel(query_id='1', doc_id='NCT00003537', relevance=2, iteration='0')

TrecQrel(query_id='1', doc_id='NCT0000

In [27]:
import numpy as np

def vectorize(documents):
    documents_vectors = []
    i=0
    for document in documents:
        zero_vector = np.zeros(500)
        vectors = []
        for token in document:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    vectors.append(np.random(500))
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            documents_vectors.append(avg_vec)
        else:
            documents_vectors.append(zero_vector)
    return documents_vectors

In [30]:
def get_results(query_fin):
    query_tfidf = tfidf_model.transform([query])
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    # similarities = cosine_similarity(tfidf_matrix, vectorize([word_tokenize(process_text(query_fin))])[0].reshape(1, -1))

    sorted_indices = similarities.argsort(axis=0)[-10:][::-1].flatten()
    result_ids= []
    
    for i in sorted_indices:
        if(similarities[i][0]>=0.35):
            result_ids.append(int(i))

    unordered_results= list(col.find({'index':{'$in':result_ids} }))
    
    return sorted(unordered_results, key=lambda x: result_ids.index(x['index']))


In [31]:
def calculate_MAP(query_id):
    relevant_docs =[]
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id:
            relevant_docs.append(qrel[1])
    
    ordered_results=[]
    for query in dataset.queries_iter():
        if query[0] == query_id:
            ordered_results=get_results(query[1])
            break

    pk_sum=0
    total_relevant=0
    for i in range(1,11):
        relevant_ret=0
        for j in range(i):
            if(j<len(ordered_results) and ordered_results[j]['_id'] in relevant_docs):
                relevant_ret += 1
        p_at_k= (relevant_ret/(i)) * (1 if i-1<len(ordered_results) and ordered_results[i-1]['_id'] in relevant_docs else 0)
        pk_sum+=p_at_k
        if(i-1<len(ordered_results) and ordered_results[i-1]['_id'] in relevant_docs):
            total_relevant+=1

    return 0 if total_relevant==0 else pk_sum/total_relevant


queries_ids={}
for qrel in dataset.qrels_iter():
    queries_ids.update({qrel[0]:''})

map_sum=0
for query_id in list(queries_ids.keys()):
    map_sum+= calculate_MAP(query_id)

print(map_sum/dataset.queries_count())

AttributeError: 'GenericQuery' object has no attribute 'lower'

In [32]:
import ir_datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming tfidf_model and tfidf_matrix are already defined and fitted
# tfidf_model = TfidfVectorizer().fit(corpus)
# tfidf_matrix = tfidf_model.transform(corpus)

def get_results(query_fin):
    query = query_fin.text  # Access the actual query text
    query_tfidf = tfidf_model.transform([query])
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    sorted_indices = cosine_similarities.argsort()[-10:][::-1]
    result_ids = [i for i in sorted_indices if cosine_similarities[i] >= 0.35]

    unordered_results = list(col.find({'index': {'$in': result_ids}}))

    return sorted(unordered_results, key=lambda x: result_ids.index(x['index']))

def calculate_MAP(query_id):
    relevant_docs = [qrel[1] for qrel in dataset.qrels_iter() if qrel.query_id == query_id]

    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = get_results(query)
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = sum(1 for j in range(i) if j < len(ordered_results) and ordered_results[j]['_id'] in relevant_docs)
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(ordered_results) and ordered_results[i - 1]['_id'] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(ordered_results) and ordered_results[i - 1]['_id'] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

# dataset = ir_datasets.load("antique/train")

queries_ids = {qrel.query_id: '' for qrel in dataset.qrels_iter()}

map_sum = sum(calculate_MAP(query_id) for query_id in queries_ids)

print(map_sum / len(queries_ids))

NameError: name 'col' is not defined