# Vector Space Model 
### Cristian Curaba

In [1]:
# Importing dependency libraries
import os
import pandas as pd
import numpy as np
import re
import math as m
from collections import Counter
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Working directory, plus the raw and preprocessed corpus folders beneath it
path = os.getcwd()
in_path_files = path + "/cranfieldDocs"
out_path_files = path + "/preprocessed_cranfieldDocs"

# Create the output folder only when it is not there yet
output_dir_missing = not os.path.isdir(out_path_files)
if output_dir_missing:
    os.mkdir(out_path_files)


**Preprocessing Steps:**

- Transform the input text to lowercase.
- Remove punctuation, numbers.
- Exclude words with one or two characters in length.
- Perform stemming.
- Eliminate stopwords.


In [3]:
# Porter stemmer instance used by preprocess_text
st = PorterStemmer()

# Regular expression matching any word of one or two characters, together
# with the non-word characters immediately preceding it
shortword = re.compile(r'\W*\b\w{1,2}\b')

# English stopwords. Stored as a frozenset because the collection is only
# used for membership tests (`word not in stop_list`), which are O(1) on a
# set instead of a linear scan of the list for every token.
stop_list = frozenset(stopwords.words('english'))


def preprocess_text(input_text):
    """
    Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; delete every character that is neither a
    word character nor whitespace; delete digits; delete words of one or two
    characters; split on whitespace; drop stopwords; stem; drop stems that
    are themselves stopwords.

    Arguments:
        input_text {string} -- text to be tokenized

    Returns:
        string -- the surviving stems joined by single spaces
    """
    # Lowercase, then delete punctuation and digits (they are removed, not
    # replaced by a space)
    cleaned = re.sub(r'[^\w\s]', '', input_text.lower())
    cleaned = re.sub(r'[0-9]', '', cleaned)

    # Delete one- and two-character words via the module-level pattern
    cleaned = shortword.sub('', cleaned)

    stems = []
    for word in cleaned.split():
        if word in stop_list:
            continue
        stem = st.stem(word)
        # Stemming can turn a word into a stopword, hence the second filter
        if stem not in stop_list:
            stems.append(stem)

    return ' '.join(map(str, stems))


In [4]:
#Utility function to extract tokens from a file
def extractTokens(beautifulSoup, tag):
    """Return the text enclosed by the first occurrence of an SGML <tag>.

    Arguments:
        beautifulSoup {bs4.BeautifulSoup} -- soup object built from the text of a file
        tag {string} -- name of the target SGML <tag>

    Returns:
        string -- text found inside the tag, or '' when the lookup yields nothing
    """
    # Look the tag up once and reuse the result
    node = beautifulSoup.find(tag)
    if node:
        return node.text
    return ''


The preprocessing phase is applied to both the corpus and the queries.

Here the title and the text of each document are joined together. A possible improvement is to give more weight to occurrences in the title.

In [5]:
# os.listdir() returns entries in arbitrary order, while the evaluation cells
# compare positional document indexes (0-based) with the document IDs read
# from relevance.txt. Sorting makes position i deterministic.
# NOTE(review): assumes the file names sort in document-ID order (e.g.
# zero-padded numbers) -- confirm against the corpus folder.
filenames = sorted(os.listdir(in_path_files))

for fname in filenames:

    # input file and the preprocessed file of the same name
    infilepath = os.path.join(in_path_files, fname)
    outfilepath = os.path.join(out_path_files, fname)

    # both handles are closed by the context manager; no explicit close() needed
    with open(infilepath) as infile, open(outfilepath, 'w') as outfile:

        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser is installed, so results can differ between machines.
        soup = BeautifulSoup(infile.read(), 'html.parser')

        # extract and preprocess the text of <title> and <text>
        title = preprocess_text(extractTokens(soup, 'title'))
        text = preprocess_text(extractTokens(soup, 'text'))

        # title tokens first, then a separating space, then the body tokens
        outfile.write(title)
        outfile.write(" ")
        outfile.write(text)

print("Preprocessing files done!")

# Preprocessing query file
query_file = os.getcwd() + "/queries.txt"
query_file_out = os.getcwd() + "/preprocessed_queries.txt"

# read each query (one per line)
with open(query_file, 'r') as q:
    query_lines = q.readlines()

preprocessed_queries = [preprocess_text(line.rstrip()) for line in query_lines]

# Join with '\n' so that only the final query lacks a trailing newline.
# The separator is decided by position: comparing each line's content with
# the last line would also drop the newline after any earlier identical line.
with open(query_file_out, 'w') as new_q:
    new_q.write('\n'.join(preprocessed_queries))

print("Preprocessing queries done!")

Preprocessing files done!
Preprocessing queries done!


In [6]:
# Collect the preprocessed text of every document, in `filenames` order
all_docs = []

for fname in filenames:
    with open(f"{out_path_files}/{fname}") as handle:
        all_docs.append(handle.read())

### Creating tf-idf dictionary.

Build the $\texttt{DF}$ (document frequency) dictionary with pairs $\texttt{(token, number of documents containing the token)}$.

In [7]:
from collections import defaultdict

# Document frequency: for each token, the number of documents it appears in.
# Missing tokens default to 0.
DF = defaultdict(int)

for document in all_docs:
    # a set, so that a token is counted once per document
    for term in set(document.split()):
        DF[term] += 1


Here I calculate the $\texttt{tf\_idf}= \texttt{tf}\times \texttt{idf}$ dictionary with all pairs $((d, t), \texttt{tf\_idf}_{t,d})$, where $$\texttt{tf}_{t,d}=\frac{\texttt{freq}_{t,d}}{\texttt{len}(d)}$$
with $t$ a token, $d$ a document and $\texttt{freq}_{t,d}$ the number of occurrences of token $t$ in document $d$;
$$
\texttt{idf}_t= \log \frac{N}{df_t},
$$
with $N$ the number of documents and $df_t$ the document frequency of token $t$. In the code the smoothed variant $\log \frac{N+1}{df_t+1}$ is used, to avoid a division by zero.

In [8]:
# Calculating tf-idf values for each term in the corpus
# tf-idf_{t,d} = tf_{t,d} * idf_t, with the smoothed idf_t = log((N + 1) / (df_t + 1))

# vocabulary of all the terms in the corpus; its order fixes the column
# order of the document-term matrix built later
vocab = list(DF)

# tf-idf weights keyed by (document index, term)
tf_idf = {}

n_docs = len(all_docs)

for doc_index, document_text in enumerate(all_docs):
    tokens = document_text.split()
    words_count = len(tokens)

    # Iterate over the distinct terms only: looping over every occurrence
    # would recompute and overwrite the same (doc, term) entry.
    for token, occurrences in Counter(tokens).items():
        tf = occurrences / words_count

        # Dictionary lookup instead of `token in vocab`, which scans the
        # whole list; .get() also avoids inserting keys into the defaultdict.
        df = DF.get(token, 0)

        # adding 1 to numerator & denominator to avoid divide by 0 error
        idf = np.log((n_docs + 1) / (df + 1))

        tf_idf[doc_index, token] = tf * idf

# printing some tf-idf values; a term absent from a document has weight 0,
# so .get() keeps the cell from failing on a missing (doc, term) pair
print('tf_idf value of \'approach\' in doc number 15: ', tf_idf.get((14, 'approach'), 0.0))
print('tf_idf value of \'experiment\' in doc number 1: ', tf_idf.get((0, 'experiment'), 0.0))
print('tf_idf value of \'stress\' in doc number 1400: ', tf_idf.get((1399, 'stress'), 0.0))

tf_idf value of 'approach' in doc number 15:  0.048112262483488724
tf_idf value of 'experi' in doc number 1:  0.00877713364724684
tf_idf value of 'stress' in doc number 1400:  0.08963980961566191


## Calculate cosine similarity

To simplify, we build a TF_IDF matrix of size (number of docs, number of tokens). The matrix is sparse (mostly zeros), so it consumes more memory than the dictionary representation.

In [9]:
# Building the tf-idf document-term matrix with numpy:
# one row per document, one column per vocabulary term
TF_IDF = np.zeros((len(all_docs), len(vocab)))

# term -> column lookup built once; calling vocab.index() for every entry
# would rescan the vocabulary list each time
column_of = {term: column for column, term in enumerate(vocab)}

# fill in the non-zero weights
for (doc_index, term), weight in tf_idf.items():
    TF_IDF[doc_index, column_of[term]] = weight

In the next code snippet, we will calculate the cosine similarity between two vectors using the $\texttt{cosine\_similarity} $ function. This function takes two numpy arrays as input and returns the cosine similarity between them. We will also define the $\texttt{ranking}$ function, which will determine a ranked list of top k documents based on their cosine similarity with a given query. Finally, we will use the $\texttt{generate\_ranked\_document\_list}$ function to generate a ranked list of documents in descending order of their cosine similarity with the queries.


In [10]:
#Utility function to generate vector representation of tokens where v(d)\in R^|V|, [v(d)]_i = tf_{t_i,d} * idf_{t_i}
def generate_vector(tokens):
    """
    Create a tf-idf vector over the vocabulary for the given tokens.

    Tokens that are not in the vocabulary contribute nothing to the vector,
    but they still count towards the total number of words used for the
    term frequency.

    Args:
        tokens (list): List of tokens to be converted.

    Returns:
        numpy.ndarray: Vector of length len(vocab); all zeros when `tokens`
        is empty or contains no vocabulary term.
    """
    vector = np.zeros(len(vocab))
    total_words = len(tokens)
    if total_words == 0:
        return vector

    # One pass over the vocabulary instead of two linear scans
    # (`in vocab` and `vocab.index`) for every distinct token.
    column_of = {term: column for column, term in enumerate(vocab)}
    n_docs = len(all_docs)

    for token, occurrences in Counter(tokens).items():
        column = column_of.get(token)
        if column is None:
            # out-of-vocabulary token: no column to write to
            continue

        term_frequency = occurrences / total_words
        # same smoothed idf as used for the documents
        inverse_document_frequency = m.log((n_docs + 1) / (DF.get(token, 0) + 1))
        vector[column] = term_frequency * inverse_document_frequency

    return vector

#Utility function to calculate cosine similarity between 2 vectors
def cosine_similarity(x, y):
    """Cosine of the angle between two numeric vectors of equal shape.

    Arguments:
        x {numpy.ndarray} -- vector 1
        y {numpy.ndarray} -- vector 2

    Returns:
        numpy.float64 -- cosine similarity between vector 1 & vector 2,
        or 0 when either vector contains only zeros

    Raises:
        ValueError -- if a vector is not numeric or the shapes differ
    """
    both_numeric = np.issubdtype(x.dtype, np.number) and np.issubdtype(y.dtype, np.number)
    if not both_numeric:
        raise ValueError("Input vectors must have numeric data types.")
    if x.shape != y.shape:
        raise ValueError("Input vectors must have the same dimensions.")

    # An all-zero vector has no direction; report no similarity rather than
    # dividing by a zero norm.
    if not (np.any(x != 0) and np.any(y != 0)):
        return 0

    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return np.dot(x, y) / norm_product


In the next snippet there is the function that generates the ranked list of retrieved documents, given the query and the number $k$ of documents to retrieve.

In [11]:
# function to rank documents based on cosine similarity with query

def ranking(k, query, TF_IDF):
    """Determines a ranked list of top k documents in descending order of their
    cosine similarity with the query

    Arguments:
        k {integer} -- top k documents to retrieve (if k=0, retrieve all documents)
        query {string or numpy.ndarray} -- either a string of whitespace-separated
            tokens (converted to a tf-idf vector) or an already built query vector
        TF_IDF {numpy.ndarray} -- document-term matrix, one row per document

    Returns:
        numpy.ndarray -- indexes of the top k documents, most similar first;
        None (after printing a message) when k is negative, the query is
        empty, or no valid query vector can be obtained
    """
    if k < 0:
        print("Please enter a value greater than or equal to 0")
        return
    if len(query) == 0:
        print("Please enter a valid query")
        return

    # The query vector does not depend on the document, so build it once
    # instead of once per row of the matrix.
    if isinstance(query, str):
        # a string of tokens is converted to a vector using tf-idf
        query_vector = generate_vector(query.split())
    elif isinstance(query, np.ndarray):
        # an existing vector is used as it is
        query_vector = query
    else:
        # unsupported query type: previously this hit an unbound variable
        print("Error on generating vector for query")
        return

    if len(query_vector) != len(vocab):
        print("Error on generating vector for query")
        return

    document_cosine_similarities = np.array(
        [cosine_similarity(query_vector, document_vector) for document_vector in TF_IDF]
    )

    # ascending argsort reversed -> most similar document first
    ranked_indices = document_cosine_similarities.argsort()[::-1]

    # k=0 retrieves all documents, otherwise only the k best
    return ranked_indices if k == 0 else ranked_indices[:k]


## Vector Model Evaluation: precision and recall.

For the evaluation we consider the $\texttt{relevance.txt}$ file which is given.

In [12]:
#Get the relevance of each document for each query
# NB: queries and docs numbered from 1.
column_names = ['Query_ID', 'Relevance']
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases
relevance_df = pd.read_csv(f"{path}/relevance.txt", sep=r"\s+", names=column_names, header=None)
relevance_df['Relevance'] = relevance_df['Relevance'].astype(int)
relevance_df = relevance_df - 1 # to make the indexing of queries and docs start from 0
print(relevance_df.head())

# one preprocessed query per line; the handle is closed by the context manager
with open(path+'/preprocessed_queries.txt', 'r') as query_file:
    queries = query_file.readlines()

# query_relevance_lists[q] = 0-based indexes of the documents relevant to query q
query_relevance_lists = []
for query_id in range(len(queries)):
    relevant_documents = relevance_df[relevance_df['Query_ID'] == query_id]['Relevance'].to_list()
    query_relevance_lists.append(relevant_documents)
print('Relevant docs for each query: ', query_relevance_lists)

   Query_ID  Relevance
0         0        155
1         1        665
2         1        666
3         1       1257
4         1       1393
Relevant docs for each query:  [[155], [665, 666, 1257, 1393, 667, 669, 1203, 1390, 1394, 1299, 36, 558, 629, 1106, 1212], [23, 100, 665, 666, 92, 1257, 1392, 558, 629, 661, 1103, 1106, 1203, 1212, 1299], [1390, 665, 666, 1257, 1077, 1079, 1080, 1393, 1394, 1213, 1197, 1203, 1299, 558, 629, 661, 1106, 1212], [1382, 1384, 154, 240, 1381, 1369, 1385, 110, 1383, 149, 291, 457, 478, 976, 375, 458, 1364, 61, 1365], [154, 1382, 1384, 1381, 61, 291, 240, 1369, 1383, 457, 458, 460, 1385, 1364, 1365, 110, 149, 478], [399, 418, 1386, 411, 1391, 1397, 1396, 1399, 1398], [399, 1386, 1391, 1397], [655, 1312, 1316, 1315, 1317, 1318, 1156, 1273], [1378, 1304, 1303, 39, 292, 1308, 160, 420, 1376, 1377, 1380, 224, 1379, 447, 448, 1123, 1279, 432, 922, 923, 1061, 1073, 1074, 1212]]


In the following snippet we calculate the recall for a given k (the number of retrieved documents).

In [13]:
# Calculating recall= number of relevant documents retrieved/total number of relevant documents
# Get the relevance of each document for each query
def calculate_recall_at_k(k=50):
    """
    Recall of the plain vector-space ranking, per query, at cut-off k.

    recall = relevant documents retrieved / total relevant documents

    Args:
        k (int): Number of top documents to be retrieved.

    Returns:
        list: One recall value per query (0 for a query without relevant documents).
    """
    def recall_for(query_id):
        # ground truth for this query
        ground_truth = query_relevance_lists[query_id]
        retrieved = ranking(k, queries[query_id], TF_IDF)

        # relevant documents among the retrieved ones
        hits = sum(1 for doc_id in retrieved if doc_id in ground_truth)

        if len(ground_truth) > 0:
            return hits / len(ground_truth)
        return 0

    return [recall_for(query_id) for query_id in range(len(queries))]

recall_values_at_50 = calculate_recall_at_k(50)
# rounding off to 2 decimal places (for display only)
recall_results = [round(num, 2) for num in recall_values_at_50]
print('Recall values: ', recall_results)
# The mean is taken over the exact values: averaging the already rounded
# ones would add rounding error to the reported figure.
print('Mean recall: ', round(np.mean(recall_values_at_50), 3))

Recall values:  [0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.11, 0.0, 0.0, 0.04]
Mean recall:  0.026


In the following snippet we calculate the precision for a given k (the number of retrieved documents).

In [14]:
#Calculating precision= number of relevant documents retrieved/total number of documents retrieved
def calculate_precision_at_k(k=50):
    """To generate list of precision values for each query for given value of k

    precision = relevant documents retrieved / total documents retrieved

    Arguments:
        k {integer} -- number of top documents to be retrieved
            (k=0 makes `ranking` return every document)

    Returns:
        list -- list of precision values for each query
    """
    precision_values = []
    for query_id in range(len(queries)):
        relevant_documents = query_relevance_lists[query_id] # Solution

        top_k_documents = ranking(k, queries[query_id], TF_IDF)

        # Number of relevant documents retrieved
        intersect_count = len([value for value in top_k_documents if value in relevant_documents])

        # Divide by the number of documents actually retrieved rather than by
        # k: with k=0 dividing by k raises ZeroDivisionError, and with k larger
        # than the corpus fewer than k documents come back.
        retrieved_count = len(top_k_documents)
        precision = intersect_count / retrieved_count if retrieved_count > 0 else 0
        precision_values.append(precision)

    return precision_values
#rounding off to 2 decimal places
precision_results = [round(num, 2) for num in calculate_precision_at_k(50)]
# Display the values just computed instead of running the whole evaluation a
# second time only to show its result.
precision_results

[0.0, 0.0, 0.0, 0.0, 0.04, 0.0, 0.02, 0.0, 0.0, 0.02]

We can perform the analysis of recall and precision for different numbers of retrieved documents. Here I save the values to the text file $\texttt{accuracy\_without\_feedbacks}$.

In [15]:
# Writing precision and recall values to file
list_of_k = [20, 50, 100, 200]

# Mode 'w' truncates an existing file, so the report always reflects the
# current run. Skipping the write when the file already exists would leave
# stale numbers on disk.
with open('accuracy_without_feedbacks', 'w') as file:
    for k in list_of_k:
        p = calculate_precision_at_k(k)
        r = calculate_recall_at_k(k)

        file.write(f"Top {k} documents in the rank list\n")

        # one line per query, numbered from 1
        for i in range(len(queries)):
            approximated_recall = round(r[i], 2)
            file.write("Query: {0} \t Pr: {1} \t Re: {2}\n".format(i+1, p[i], approximated_recall))

        avg_precision = round(np.mean(p), 2)
        avg_recall = round(np.mean(r), 2)

        file.write("Avg Precision: {0}\n".format(avg_precision))
        file.write("Avg Recall: {0}\n\n".format(avg_recall))


## Relevance Feedback

In the following snippets I will implement the Rocchio algorithm to improve the information retrieval systems. The feedback information must be stored in the $\texttt{relevant\_docs}$ variable (list of relevant documents for each query). 

To consider custom relevance feedback just modify the $\texttt{custom\_relevance.txt}$ file (with same structure of $\texttt{relevance.txt}$ file: list of {query_id, doc_id} pairs).

In [16]:
# Feedback information for the Rocchio step: one list of 0-based document
# indexes per query. By default it is the relevance list given by the dataset
# (relevance.txt).
relevant_docs = query_relevance_lists

# If it exists, the custom relevance file replaces the default feedback.
# query_relevance_lists is left untouched so it can keep serving as the
# ground truth when recall and precision are evaluated.
custom_relevance_path = path + '/custom_relevance.txt'
if os.path.exists(custom_relevance_path):
    custom_relevance_df = pd.read_csv(custom_relevance_path, sep=r"\s+", names=column_names, header=None)
    # convert and shift the *custom* frame; reusing relevance_df here would
    # discard the custom file and shift the default data a second time
    custom_relevance_df['Relevance'] = custom_relevance_df['Relevance'].astype(int)
    custom_relevance_df = custom_relevance_df - 1 # queries and docs are numbered from 1 in the file

    relevant_docs = [
        custom_relevance_df[custom_relevance_df['Query_ID'] == query_id]['Relevance'].to_list()
        for query_id in range(len(queries))
    ]
else:
    print("No custom relevance file found. Using default relevance list.")

In [17]:
# Rocchio Algorithm
# Weights of the three Rocchio terms: alpha for the original query vector,
# beta for the centroid of relevant documents (0.75 is the alternative value
# noted by the author), gamma for the centroid of non-relevant documents.
alpha, beta, gamma = 1, 3, 0.15

# Calculate centroid of relevant/irrelevant documents
def calculate_centroid(docs_indexes):
    """
    Calculate centroid of relevant/irrelevant documents.

    The centroid is the mean of the TF_IDF rows selected by `docs_indexes`.

    Args:
        docs_indexes (list or numpy.ndarray): 0-based indexes of the documents.

    Returns:
        numpy.ndarray: Centroid vector of length len(vocab); the zero vector
        when `docs_indexes` is empty. None (after printing a message) when an
        index is out of range or the matrix width does not match the vocabulary.
    """
    centroid = np.zeros(len(vocab))
    indexes = np.asarray(docs_indexes, dtype=int).ravel()

    # No documents: return the zero vector instead of dividing by zero,
    # which would fill the centroid with NaN.
    if indexes.size == 0:
        return centroid

    # Validate once up front. Negative indexes are rejected as well, because
    # numpy would silently wrap them around to the end of the matrix.
    if np.any(indexes >= len(all_docs)) or np.any(indexes < 0):
        print("Invalid document index")
        return
    if TF_IDF.shape[1] != len(vocab):
        print("Error on generating vector for document")
        return

    for document_id in indexes:
        centroid += TF_IDF[document_id, :]

    return centroid / indexes.size


def calculate_new_query_vector(query_id, relevant_docs_index, alpha=1, beta=0.75, gamma=0.15):
    """
    Calculate new query vector based on Rocchio algorithm.

    new_query = alpha * query + beta * centroid(relevant) - gamma * centroid(non-relevant)

    Args:
        query_id (int): Query ID (0-based position in `queries`).
        relevant_docs_index (list or numpy.ndarray): Indexes of the documents
            treated as relevant; every other document counts as non-relevant.
        alpha (float): Weight for the original query vector.
        beta (float): Weight for the centroid of relevant documents.
        gamma (float): Weight for the centroid of non-relevant documents.

    Returns:
        numpy.ndarray: New query vector.
    """
    original_query_vector = generate_vector(queries[query_id].split())

    # Set membership keeps the split linear in the number of documents; a
    # list or array would be scanned once per document.
    relevant_ids = {int(doc_id) for doc_id in relevant_docs_index}
    non_relevant_docs_index = [doc_id for doc_id in range(len(all_docs)) if doc_id not in relevant_ids]

    # An empty group contributes a zero centroid, so its term drops out of
    # the formula instead of producing a division by zero.
    zero_vector = np.zeros(len(vocab))
    relevant_centroid = (
        calculate_centroid(relevant_docs_index) if len(relevant_docs_index) > 0 else zero_vector
    )
    non_relevant_centroid = (
        calculate_centroid(non_relevant_docs_index) if non_relevant_docs_index else zero_vector
    )

    # Calculate new query vector using Rocchio algorithm
    new_query_vector = alpha * original_query_vector + beta * relevant_centroid - gamma * non_relevant_centroid

    return new_query_vector


Now we can evaluate the performance in the same way as before. Obviously by considering the solution as feedback and high value of $\beta$ (which represents the weight of the centroid given by the feedback) we obtain great results.

In [22]:
# Calculating recall= number of relevant documents retrieved/total number of relevant documents

# Get the relevance of each document for each query
def calculate_recall_at_k_with_feedback(k=50):
    """
    Calculate recall values for each query after refining the query with the
    Rocchio algorithm, given the number of top documents to be retrieved (k).

    The feedback comes from `relevant_docs`; the retrieved documents are
    scored against the ground truth in `query_relevance_lists`.

    Args:
        k (int): Number of top documents to be retrieved.

    Returns:
        list: List of recall values for each query (0 for a query without
        relevant documents).
    """
    recall_values = []

    for query_id in range(len(queries)):
        # ground truth used for scoring
        relevant_documents = query_relevance_lists[query_id]
        # feedback used to move the query vector
        feedback_documents = relevant_docs[query_id]

        new_query = calculate_new_query_vector(query_id, feedback_documents, alpha, beta, gamma)
        top_k_documents = ranking(k, new_query, TF_IDF)

        # Number of relevant documents retrieved
        intersect_count = len([value for value in top_k_documents if value in relevant_documents])

        # Total number of relevant documents
        total_relevant_documents = len(relevant_documents)

        # Same zero guard as calculate_recall_at_k: a query without relevant
        # documents gets recall 0 instead of raising ZeroDivisionError
        recall = intersect_count / total_relevant_documents if total_relevant_documents > 0 else 0
        recall_values.append(recall)

    return recall_values

# Recall at k=30 with the feedback-refined queries, rounded to 2 decimal places
recall_with_feedback = calculate_recall_at_k_with_feedback(30)
recall_results = list(map(lambda value: round(value, 2), recall_with_feedback))
print('Recall values with relevant feedback giveng by the solution:', recall_results)

#Calculating precision= number of relevant documents retrieved/total number of documents retrieved
def calculate_precision_at_k_with_feedback(k=50):
    """To generate list of precision values for each query for given value of k,
    after refining the query with the Rocchio algorithm

    The feedback comes from `relevant_docs`; the retrieved documents are
    scored against the ground truth in `query_relevance_lists`.

    Arguments:
        k {integer} -- number of top documents to be retrieved

    Returns:
        list -- list of precision values for each query
    """
    precision_values = []
    for query_id in range(len(queries)):
        # ground truth used for scoring
        relevant_documents = query_relevance_lists[query_id]
        # feedback used to move the query vector
        feedback_documents = relevant_docs[query_id]

        new_query = calculate_new_query_vector(query_id, feedback_documents, alpha, beta, gamma)
        top_k_documents = ranking(k, new_query, TF_IDF)

        # Number of relevant documents retrieved
        intersect_count = len([value for value in top_k_documents if value in relevant_documents])

        # Precision over the documents actually retrieved: dividing by k
        # fails for k=0, which `ranking` treats as "retrieve all"
        retrieved_count = len(top_k_documents)
        precision = intersect_count / retrieved_count if retrieved_count > 0 else 0
        precision_values.append(precision)

    return precision_values
# Precision at k=30 with the feedback-refined queries, rounded to 3 decimal places
precision_with_feedback = calculate_precision_at_k_with_feedback(30)
precision_results = list(map(lambda value: round(value, 3), precision_with_feedback))
print('Precision values with relevant feedback giveng by the solution:', precision_results)

Recall values with relevant feedback giveng by the solution: [1.0, 0.13, 0.0, 0.33, 0.16, 0.39, 0.11, 1.0, 1.0, 0.12]
Precision values with relevant feedback giveng by the solution: [0.033, 0.067, 0.0, 0.2, 0.1, 0.233, 0.033, 0.133, 0.267, 0.1]


## Pseudo Relevance Feedback


In the following snippet I implement pseudo relevance feedback: the idea is to treat the top $k$ documents retrieved by our vector model as the feedback information, and then update the query vector with this information as done before.

In [20]:
# Calculating recall= number of relevant documents retrieved/total number of relevant documents

# Get the relevance of each document for each query
def calculate_recall_at_k_with_pseudo_feedback(k=50):
    """
    Calculate recall values for each query with pseudo relevance feedback,
    given the number of top documents to be retrieved (k).

    The top k documents of a first ranking are used as feedback for the
    Rocchio update; the refined query is then ranked again and scored.

    Args:
        k (int): Number of top documents to be retrieved (also the number of
            documents used as pseudo feedback).

    Returns:
        list: List of recall values for each query (0 for a query without
        relevant documents).
    """
    recall_values = []
    for query_id in range(len(queries)):
        relevant_documents = query_relevance_lists[query_id] # Solution

        # first pass: its top k documents are assumed to be relevant
        pseudo_relevant_documents = ranking(k, queries[query_id], TF_IDF)

        new_query = calculate_new_query_vector(query_id, pseudo_relevant_documents, alpha, beta, gamma)

        # second pass with the refined query
        top_k_documents = ranking(k, new_query, TF_IDF)

        # Number of relevant documents retrieved
        intersect_count = len([value for value in top_k_documents if value in relevant_documents])

        # Total number of relevant documents
        total_relevant_documents = len(relevant_documents)

        # Same zero guard as calculate_recall_at_k: a query without relevant
        # documents gets recall 0 instead of raising ZeroDivisionError
        recall = intersect_count / total_relevant_documents if total_relevant_documents > 0 else 0
        recall_values.append(recall)

    return recall_values

recall_with_pseudo_feedback = calculate_recall_at_k_with_pseudo_feedback(50)
#rounding off to 2 decimal places (for display only)
recall_results = [round(num, 2) for num in recall_with_pseudo_feedback]
print('Recall values with pseudo relevant feedback:', recall_results)
# mean of the exact values, not of the already rounded ones
print('Mean recall with pseudo relevant feedback:', round(np.mean(recall_with_pseudo_feedback), 3))

Recall values with pseudo relevant feedback: [0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.11, 0.0, 0.0, 0.04]
Mean recall with pseudo relevant feedback: 0.026


In [21]:
# Calculating precision= number of relevant documents retrieved/total number of documents retrieved

# Get the relevance of each document for each query
def calculate_precision_at_k_with_pseudo_feedback(k=50):
    """
    Calculate precision values for each query with pseudo relevance feedback,
    given the number of top documents to be retrieved (k).

    The top k documents of a first ranking are used as feedback for the
    Rocchio update; the refined query is then ranked again and scored.

    Args:
        k (int): Number of top documents to be retrieved (also the number of
            documents used as pseudo feedback).

    Returns:
        list: List of precision values for each query.
    """
    precision_values = []
    for query_id in range(len(queries)):
        relevant_documents = query_relevance_lists[query_id] # Solution

        # first pass: its top k documents are assumed to be relevant
        pseudo_relevant_documents = ranking(k, queries[query_id], TF_IDF)

        new_query = calculate_new_query_vector(query_id, pseudo_relevant_documents, alpha, beta, gamma)

        # second pass with the refined query
        top_k_documents = ranking(k, new_query, TF_IDF)

        # Number of relevant documents retrieved
        intersect_count = len([value for value in top_k_documents if value in relevant_documents])

        # Precision over the documents actually retrieved: dividing by k
        # fails for k=0, which `ranking` treats as "retrieve all"
        retrieved_count = len(top_k_documents)
        precision = intersect_count / retrieved_count if retrieved_count > 0 else 0
        precision_values.append(precision)

    return precision_values

precision_with_pseudo_feedback = calculate_precision_at_k_with_pseudo_feedback(30)
#rounding off to 3 decimal places (for display only)
precision_results = [round(num, 3) for num in precision_with_pseudo_feedback]
# These values come from pseudo feedback, not from feedback given by the
# solution, so the label now says so.
print('Precision values with pseudo relevant feedback:', precision_results)
# mean of the exact values, not of the already rounded ones
print('Mean precision with pseudo relevant feedback:', round(np.mean(precision_with_pseudo_feedback), 3))

Precision values with relevant feedback giveng by the solution: [0.0, 0.0, 0.0, 0.0, 0.033, 0.0, 0.0, 0.0, 0.0, 0.0]
Mean precision with pseudo relevant feedback: 0.003
