## Importing libraries and mounting Google Drive

In [2]:
#Mount Google Drive at /content/drive so the dataset and query files stored under it can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import csv
import os
import re
from collections import defaultdict
from math import log

import numpy as np

## Preprocessing text

In [3]:
#Normalize text to lowercase and split it into word tokens
def preprocess(text):
    """Return the lowercase word tokens contained in ``text``.

    The text is lowercased first; every maximal run of regex word characters
    (letters, digits, underscore) becomes one token. Punctuation and whitespace
    only act as separators and never appear in the result.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of token strings in order of appearance (empty if none match).
    """
    lowered = text.lower()
    return [hit.group(0) for hit in re.finditer(r'\b\w+\b', lowered)]

## Loading documents and queries


In [10]:
#Load and tokenize every .txt document in the provided folder
def load_documents(folder_path):
    """Read each ``.txt`` file in ``folder_path`` and tokenize its contents.

    Args:
        folder_path: Directory that holds the document files.

    Returns:
        A dict mapping each file name (not the full path) to the list of
        tokens produced by ``preprocess`` for that file. Entries are inserted
        in sorted file-name order.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs = {}
    # os.listdir returns names in arbitrary order; sorting keeps the resulting
    # dict (and anything derived from its iteration order) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(folder_path, filename)
        # A directory can also carry a .txt suffix; opening it would raise.
        if not os.path.isfile(path):
            continue
        # Explicit encoding so decoding does not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as file:
            docs[filename] = preprocess(file.read())
    return docs

In [11]:
#Loading queries from the provided query file (one query per line)
def load_queries(query_file_path):
    """Read the query file and return its non-empty lines as queries.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are skipped, so trailing newlines or spacer lines in the
    file do not turn into empty queries.

    Args:
        query_file_path: Path of a text file holding one query per line.

    Returns:
        A list of query strings in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    with open(query_file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [query for query in stripped_lines if query]

## Computing term frequencies and document frequencies

In [6]:
#Collect per-document term counts and corpus-wide document frequencies
def compute_statistics(docs):
    """Build the term statistics used for scoring.

    Args:
        docs: Mapping of document id -> list of tokens.

    Returns:
        A tuple ``(term_freq, term_doc_freq, doc_count)`` where
        ``term_freq[doc_id][term]`` is how often ``term`` occurs in that
        document, ``term_doc_freq[term]`` is the number of documents that
        contain ``term``, and ``doc_count`` is ``len(docs)``. Both mappings
        are defaultdicts. A document with no tokens gets no ``term_freq``
        entry but is still included in ``doc_count``.
    """
    per_doc_counts = defaultdict(lambda: defaultdict(int))
    docs_containing = defaultdict(int)

    for name, tokens in docs.items():
        already_seen = set()  #Terms already credited to this document
        for token in tokens:
            per_doc_counts[name][token] += 1
            if token not in already_seen:
                #First occurrence in this document: count the document once for the term
                already_seen.add(token)
                docs_containing[token] += 1

    return per_doc_counts, docs_containing, len(docs)

## Computing relevance probabilities using the Binary Independence Model (BIM)


In [7]:
#Compute relevance scores for every document using the BIM-style smoothed ratio
def compute_relevance_prob(query, term_freq, term_doc_freq, doc_count):
    """Score each document against the query terms.

    For every query term the document score is multiplied by the ratio
    ``p_relevant / p_not_relevant`` with

        p_relevant     = (tf + 1) / (document_length + vocabulary_size)
        p_not_relevant = (df + 1) / (doc_count - df + vocabulary_size)

    where ``tf`` is the term's count in the document, ``df`` the number of
    documents containing the term, ``document_length`` the sum of all term
    counts in the document and ``vocabulary_size`` is ``len(term_doc_freq)``.

    Args:
        query: Iterable of query tokens (it is materialized once, so a
            one-shot iterator is applied to every document).
        term_freq: Mapping doc_id -> mapping term -> count in that document.
        term_doc_freq: Mapping term -> number of documents containing it.
        doc_count: Total number of documents in the collection.

    Returns:
        A dict mapping each doc_id in ``term_freq`` to its float score. With
        an empty query every document scores 1.0.
    """
    query_terms = tuple(query)
    vocab_size = len(term_doc_freq)
    scores = {}
    for doc_id, doc_terms in term_freq.items():
        #Document length does not depend on the query term, so compute it once per document
        relevant_denominator = sum(doc_terms.values()) + vocab_size
        score = 1.0 #Initializing score for each document
        for term in query_terms:
            tf = doc_terms.get(term, 0) #Term frequency in the document
            df = term_doc_freq.get(term, 0) #Document frequency of the term
            p_term_given_relevant = (tf + 1) / relevant_denominator
            p_term_given_not_relevant = (df + 1) / (doc_count - df + vocab_size)
            score *= (p_term_given_relevant / p_term_given_not_relevant) #Updating document score
        scores[doc_id] = score
    return scores

## Retrieving and ranking documents

In [12]:
# Retrieving documents based on queries and ranking them by relevance scores
def retrieve_documents(folder_path, query_file_path, top_k=3):
    """Rank the documents for every query and print the best matches.

    Documents and queries are loaded from disk, corpus statistics are computed
    once, and each query is then scored against every document.

    Args:
        folder_path: Directory containing the ``.txt`` documents.
        query_file_path: Text file with one query per line.
        top_k: Number of highest-scoring documents to print per query
            (defaults to 3).

    Returns:
        None. The ranking is written to standard output.
    """
    docs = load_documents(folder_path)
    queries = load_queries(query_file_path)

    # Statistics depend only on the corpus, so they are computed once for all queries
    term_freq, term_doc_freq, doc_count = compute_statistics(docs)

    for query in queries:
        query_terms = preprocess(query)  # Tokenizing the query
        scores = compute_relevance_prob(query_terms, term_freq, term_doc_freq, doc_count)
        # Highest score first; equal scores are ordered by document name so the
        # printed ranking does not depend on the order the files were listed in
        ranked_docs = sorted(scores.items(), key=lambda item: (-item[1], item[0]))

        # Printing the top ranked documents in a specified format
        print(f"Top {top_k} Relevance Scores for query {query}:")
        for rank, (doc_id, score) in enumerate(ranked_docs[:top_k], 1):
            print(f"Rank {rank}: {doc_id}, Score: {score:.4f}")
        print()

In [13]:
#Example usage: rank the documents for every query in the query file and print the top matches
#NOTE(review): absolute Colab/Drive paths - they presumably resolve only after Drive is mounted at /content/drive
folder_path = '/content/drive/MyDrive/Project Dataset'
#NOTE(review): 'quert.txt' is used as a directory name in this path - confirm it matches the Drive layout
query_file_path = '/content/drive/MyDrive/quert.txt/query.txt'
retrieve_documents(folder_path, query_file_path)

Top 3 Relevance Scores for query Patient:
Rank 1: Pacemaker Implantation.txt, Score: 0.0850
Rank 2: Spinal Infusion.txt, Score: 0.0848
Rank 3: Breast Augmentation.txt, Score: 0.0844

Top 3 Relevance Scores for query Surgery:
Rank 1: Cataract Surgery.txt, Score: 0.1999
Rank 2: Spinal Infusion.txt, Score: 0.1696
Rank 3: Spinal Fusion Surgery.txt, Score: 0.1646

Top 3 Relevance Scores for query Complication:
Rank 1: Pacemaker Implantation.txt, Score: 0.1274
Rank 2: Cesarean Section.txt, Score: 0.1243
Rank 3: Spinal Infusion.txt, Score: 0.0848

Top 3 Relevance Scores for query Treatment:
Rank 1: Liver Transplant.txt, Score: 0.1243
Rank 2: Mastectomy.txt, Score: 0.1237
Rank 3: Hernia Repair.txt, Score: 0.1211

Top 3 Relevance Scores for query Outcome:
Rank 1: Hernia Repair.txt, Score: 0.1211
Rank 2: Pacemaker Implantation.txt, Score: 0.0850
Rank 3: Spinal Infusion.txt, Score: 0.0848

Top 3 Relevance Scores for query Recovery:
Rank 1: Spinal Infusion.txt, Score: 0.3422
Rank 2: ACL Reconstruc

## Assigning and saving random relevance scores

In [15]:
import random

#Assigning random integer relevance labels (by default 0 = irrelevant, 1 = relevant)
def assign_random_relevance(queries, documents, relevance_scale=(0, 1), seed=None):
    """Give every (query, document) pair a random integer relevance label.

    Args:
        queries: Iterable of query strings.
        documents: Iterable of document ids. It is materialized once, so a
            one-shot iterable is still applied to every query.
        relevance_scale: ``(low, high)`` pair; labels are drawn uniformly from
            the integers ``low..high`` inclusive. Defaults to ``(0, 1)``.
        seed: Optional seed. When given, labels come from a private generator
            seeded with this value, so repeated calls return the same labels
            and the global ``random`` state is left untouched. When ``None``
            the global ``random`` module is used.

    Returns:
        A dict mapping query -> dict mapping document id -> integer label.
    """
    low, high = relevance_scale[0], relevance_scale[1]
    rng = random.Random(seed) if seed is not None else random
    doc_ids = list(documents)

    relevance_scores = {}
    for query in queries:
        #One independent draw per document for this query
        relevance_scores[query] = {doc: rng.randint(low, high) for doc in doc_ids}

    return relevance_scores

#Saving the relevance scores to a file
def save_relevance_scores_to_file(relevance_scores, output_file):
    """Write the relevance labels as ``query,document,score`` rows.

    Rows are produced by ``csv.writer``, so a query or document name that
    contains a comma, quote or newline is quoted instead of silently yielding
    a row with extra columns. Fields without such characters are written
    unquoted, one row per line, each terminated by a newline character.

    Args:
        relevance_scores: Mapping query -> mapping document id -> score.
        output_file: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    #newline='' lets the csv writer control line endings itself
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for query, doc_scores in relevance_scores.items():
            for doc, score in doc_scores.items():
                writer.writerow([query, doc, score])  #Saving as query,document,score format


#Example usage: label every (query, document) pair with a random relevance score and save the labels to a file
#NOTE(review): same absolute Colab/Drive paths as the retrieval example; they presumably require the mounted Drive
folder_path = '/content/drive/MyDrive/Project Dataset'
query_file_path = '/content/drive/MyDrive/quert.txt/query.txt'

#Loading documents and queries
documents = load_documents(folder_path)  #Dictionary of file name -> list of tokens
queries = load_queries(query_file_path)  #List of query strings, one per line of the query file

#Randomly assigning relevance scores (0 = irrelevant, 1 = relevant); only the document names are needed here
#NOTE(review): no random seed is set in the visible code, so the labels presumably differ from run to run
random_relevance_scores = assign_random_relevance(queries, documents.keys())

#Saving the relevance scores to query_relevance_score.txt (relative path, so it lands in the current working directory)
output_file = 'query_relevance_score.txt'
save_relevance_scores_to_file(random_relevance_scores, output_file)

print(f"Relevance scores saved to {output_file}")


Relevance scores saved to query_relevance_score.txt
