In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict,Counter
import re
import nltk
import pickle
import numpy as np
nltk.download('stopwords')

from nltk.corpus import stopwords
from tqdm import tqdm
import operator
from itertools import islice,count
from contextlib import closing

import json
from io import StringIO
from pathlib import Path
from operator import itemgetter
import pickle
import matplotlib.pyplot as plt

import math
from itertools import chain
import time

Implementation remarks:
* TF-IDF: use [sklearn TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). To deal with stopwords use the argument `stop_words`

TfidfVectorizer suggests handling with additional parameters, as you can read in the documentation. Make sure you read about them.

* Cosine Similarity: use [sklearn implementation](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html).

* BM25: Implement a BM25 version according to the provided skeleton **without the use of packages**. 15 points as follows:
    * Prepare the data and filter stopwords. (5 points)
    * Implement two functions at BM25 class. (10 points)

* Ranking: implement topN functionality (10 points)

**Later in this assignment, we will write code for TF-IDF and BM25 that utilize the inverted index.**

When working with BM25 without any package, we need to filter and clean the data ourselves. We will do so using the NLTK stopwords.

In [None]:
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)
stopwords_frozen = frozenset(stopwords.words('english'))


def tokenize(text):
    """
    Split a text into lower-cased tokens and drop English stopwords.

    The text is lower-cased first; every match of RE_WORD is taken as a token,
    and tokens that appear in stopwords_frozen are discarded.

    Parameters:
    -----------
    text: string, the text to tokenize.

    Returns:
    -----------
    list of tokens (strings), in their order of appearance in the text.
    """
    kept_tokens = []
    for match in RE_WORD.finditer(text.lower()):
        word = match.group()
        if word in stopwords_frozen:
            continue
        kept_tokens.append(word)
    return kept_tokens


# clean_data = [tokenize(doc) for doc in data]
# clean_data

The class below is a BM25 class that works without any DL (document length) dictionary; this helps us with the calculations inside the BM25 class (for example, computing the AVGDL easily).

The second reason is that the other class (**BM25_from_index**) contains improvements: when we compare queries to documents, we can search only the documents that appear in the posting list of each term, instead of going over all the documents in the corpus every time (as the BM25 class in the cell below does).

In [None]:
class BM25:
    """
    Best Match 25, scored from in-memory corpus statistics.

    Every query is scored against every document of the corpus (documents are
    identified by their position in `doc_len` / `tf`).

    Parameters to tune
    ----------
    k1 : float, default 1.5

    b : float, default 0.75

    Attributes
    ----------
    tf_ : list[dict[str, float]]
        Term frequency per document. So [{'hi': 1}] means the first document
        maps the term 'hi' to the value 1. The values are used exactly as
        supplied by the caller (bm25_preprocess supplies frequencies divided
        by the document length).

    doc_len_ : list[int]
        Number of terms per document. So [3] means the first
        document contains 3 terms.

    df_ : dict[str, int]
        Document Frequency per term. i.e. Number of documents in the
        corpus that contains the term.

    N_ : int
        Number of documents in the corpus (len(doc_len)).

    avgdl_ : float
        Average number of terms for documents in the corpus
        (0.0 for an empty corpus).
    """

    def __init__(self, doc_len, df, tf=None, k1=1.5, b=0.75):
        self.b = b
        self.k1 = k1
        self.tf_ = tf
        self.doc_len_ = doc_len
        self.df_ = df
        self.N_ = len(doc_len)
        # An empty corpus has no meaningful average; avoid ZeroDivisionError.
        self.avgdl_ = sum(doc_len) / self.N_ if self.N_ else 0.0

    def calc_idf(self, query):
        """
        This function calculate the idf values according to the BM25 idf formula for each term in the query.

        idf(t) = ln((N - n(t) + 0.5) / (n(t) + 0.5) + 1), where N is the total
        number of documents in the collection and n(t) is the number of
        documents containing t (document frequency). Terms that are missing
        from df_ are treated as having a document frequency of 0.

        Parameters:
        -----------
        query: list of token representing the query. For example: ['look', 'blue', 'sky']

        Returns:
        -----------
        idf: dictionary of idf scores. As follows:
                                                    key: term
                                                    value: bm25 idf score
        """
        idf = {}
        for term in query:
            if term in idf:
                # A repeated query term has the same idf; do not accumulate it,
                # otherwise _score would weight duplicates quadratically.
                continue
            df_term = self.df_.get(term, 0)
            numerator = self.N_ - df_term + 0.5
            denominator = df_term + 0.5
            idf[term] = math.log((numerator / denominator) + 1)
        return idf

    def search(self, queries):
        """
        This function use the _score function to calculate the bm25 score for all queries provided.

        Parameters:
        -----------
        queries: list of lists. Each inner list is a list of tokens. For example:
                                                                                    [
                                                                                        ['look', 'blue', 'sky'],
                                                                                        ['likes', 'blue', 'sun'],
                                                                                        ['likes', 'diamonds']
                                                                                    ]

        Returns:
        -----------
        list of lists of bm25 scores: element [q][d] is the score of document d
        (its position in the corpus) for the q-th query.
        """
        scores = []
        for query in queries:
            # The idf values depend only on the query, so compute them once
            # instead of once per document.
            idf = self.calc_idf(query)
            scores.append([self._score(query, doc_id, idf) for doc_id in range(self.N_)])
        return scores

    def _score(self, query, doc_id, idf=None):
        """
        This function calculate the bm25 score for given query and document.

        Parameters:
        -----------
        query: list of token representing the query. For example: ['look', 'blue', 'sky']
        doc_id: integer, document id.
        idf: optional dictionary of precomputed idf scores for the query
             (as returned by calc_idf). Computed here when omitted.

        Returns:
        -----------
        score: float, bm25 score (0 when no query term appears in the document).
        """
        if idf is None:
            idf = self.calc_idf(query)
        # Guard the length ratio: avgdl_ is 0 when every document is empty.
        length_ratio = self.doc_len_[doc_id] / self.avgdl_ if self.avgdl_ else 0.0
        b_normalizer = 1 - self.b + self.b * length_ratio
        doc_tf = self.tf_[doc_id]
        score = 0
        for term in query:
            if term in doc_tf:
                term_tf = doc_tf[term]
                numerator = (self.k1 + 1) * term_tf
                denominator = (b_normalizer * self.k1) + term_tf
                score += (numerator / denominator) * idf[term]
        return score if score > 0 else 0

The cell below is for tests! Later on we will move it to the test class.

In [None]:
# # Remove tokenizing and remove stopwords from queries
# clean_queries = [tokenize(query) for query in queries]
# # now we can search for results to our queries
# BM25_res = bm25.search(clean_queries)
# # data frame that represent the queries (as rows) on documents (as columns) 
# BM25_df = pd.DataFrame(data = BM25_res,index = [query_id for query_id in range(len(clean_queries))] ,columns = [doc_id for doc_id in range(len(data))])

# **The class of BM25**

The cells below hold configuration that must run before the tests! Later on we need to move them.

**This is BM25 preprocessing data**


---


This means that here we take a list of strings (it could be the body_text, the title_text or the anchor_text) and **return three objects as follows**:

a) doc_len: list of integer. Each element represents the length of a document.

b) tf: list of dictionaries. Each dictionary corresponds to a document as follows:

- key: term
- value: normalized term frequency (by the length of document)

for example, tf -> [{Burekas: 3, Haagala:2}, {Mi:3, Shemekir:1}, {Mekir:3, Burekas: 1, Haagala:4}] (each index in the list represent a doc)

c) df: dictionary representing the document frequency as follows:
- key: term  
- value: document frequency

for example, df -> {Burekas: 4, Haagala:6, Mi:3, Shemekir:1, Mekir:3}

In [None]:
def bm25_preprocess(data):
    """
    Collect the corpus statistics needed for BM25 in a single pass over the data:
    the length of every document, the length-normalized term frequencies of every
    document, and the document frequency of every term.

    Parameters
    -----------
    data: list of lists. Each inner list is a list of tokens.
    Example of data:
    [
        ['sky', 'blue', 'see', 'blue', 'sun'],
        ['sun', 'bright', 'yellow'],
        ['comes', 'blue', 'sun'],
        ['lucy', 'sky', 'diamonds', 'see', 'sun', 'sky'],
        ['sun', 'sun', 'blue', 'sun'],
        ['lucy', 'likes', 'blue', 'bright', 'diamonds']
    ]

    Returns:
    -----------
    a tuple (doc_len, tf, df):
                doc_len: list of integers, the number of tokens of each document.
                tf: list of dictionaries, one per document:
                                                    key: term
                                                    value: occurrences of the term in the document
                                                           divided by the document length
                df: dictionary of document frequencies:
                                                    key: term
                                                    value: number of documents containing the term
    """
    doc_len, tf, df = [], [], {}
    for tokens in data:
        n_tokens = len(tokens)
        doc_len.append(n_tokens)
        counts = Counter(tokens)
        # An empty document yields an empty Counter, so no division happens for it.
        tf.append({term: occurrences / n_tokens for term, occurrences in counts.items()})
        # Every distinct term of this document adds one to its document frequency.
        for term in counts:
            df[term] = df.get(term, 0) + 1
    return doc_len, tf, df
# doc_len,tf,df = bm25_preprocess(clean_data)

# In this assignment, we are experimenting with two methods. ***TF-IDF*** and ***BM25***.  with a given query or queries we'll return a ranked list of documents to retrieve.

#### BM25 for Cranfield data (15 points)
As a reminder:

To use BM25 we will need to following parameters:

* $k1$ and $b$ - free parameters
* $f(t_i,D)$ - term frequency of term $t_i$ in document $D$
* |$D$|- is the length of the document $D$ in terms 
* $avgdl$ -  average document length
* $IDF$ - which is calculated as follows: $\ln\left(\frac{N-n(t_i)+0.5}{n(t_i)+0.5}+1\right)$, where $N$ is the total number of documents in the collection, and $n(t_i)$ is the number of documents containing $t_i$ (e.g., document frequency (df)).

Now, we will use the inverted index to fetch this information.

**We need to check only documents which are 'candidates' for a given query.**

In [None]:
import math
from itertools import chain
import time
# When preprocessing the data have a dictionary of document length for each document saved in a variable called `DL`.
class BM25_from_index:
    """
    Best Match 25 computed from an inverted index.

    Document lengths are read from the module-level dictionary `DL`
    (looked up with str(doc_id) keys); posting lists are read once from the
    index when the object is created.
    ----------
    k1 : float, default 1.5

    b : float, default 0.75

    index: inverted index
    """

    def __init__(self, index, k1=1.5, b=0.75):
        self.b = b
        self.k1 = k1
        self.index = index
        self.N = len(DL)
        self.AVGDL = sum(DL.values()) / self.N
        self.words, self.pls = zip(*self.index.posting_lists_iter())
        # Position of every term inside self.words / self.pls, so that _score
        # does not need a linear scan of self.words for each term. setdefault
        # keeps the first position, exactly like tuple.index would.
        self._term_position = {}
        for position, word in enumerate(self.words):
            self._term_position.setdefault(word, position)

    def calc_idf(self, list_of_tokens):
        """
        This function calculate the idf values according to the BM25 idf formula for each term in the query.
        Terms without a document frequency in the index are left out of the result.

        Parameters:
        -----------
        list_of_tokens: list of token representing the query. For example: ['look', 'blue', 'sky']

        Returns:
        -----------
        idf: dictionary of idf scores. As follows:
                                                    key: term
                                                    value: bm25 idf score
        """
        idf = {}
        doc_freqs = self.index.df
        for term in list_of_tokens:
            if term in doc_freqs:
                n_ti = doc_freqs[term]
                idf[term] = math.log(1 + (self.N - n_ti + 0.5) / (n_ti + 0.5))
        return idf

    def search(self, queries, N=3):
        """
        Rank documents for every query with BM25.

        The documents that are scored are the top-N documents returned by
        get_topN_score_for_queries (cosine similarity over tf-idf) that are also
        candidates for the query, i.e. appear in the posting list of at least
        one query term. Those documents are re-scored with BM25 and sorted by
        that score in descending order.

        Parameters:
        -----------
        queries: a dictionary of queries as follows:
                                                    key: query_id
                                                    value: list of tokens. For example: ['look', 'blue', 'sky']
        N: Integer. How many documents to retrieve per query. By default N = 3.

        Returns:
        -----------
        a dictionary of scores as the following:
                                                    key: query_id
                                                    value: a ranked list of pairs (doc_id, score) of at most N elements.
        """
        ranked = {}
        top_n_by_cosine = get_topN_score_for_queries(queries, self.index, N)
        for query_id, cosine_hits in top_n_by_cosine.items():
            query_tokens = queries[query_id]
            candidates = get_candidate_documents_and_scores(query_tokens, self.index, self.words, self.pls)
            # A set gives O(1) membership tests for the filter below.
            candidate_ids = {doc_id for doc_id, _term in candidates}
            rescored = [
                (doc_id, self._score(query_tokens, doc_id))
                for doc_id, _cosine_score in cosine_hits
                if doc_id in candidate_ids
            ]
            rescored.sort(key=lambda pair: pair[1], reverse=True)
            ranked[query_id] = rescored
        return ranked

    def _score(self, query, doc_id):
        """
        This function calculate the bm25 score for given query and document.
        As a side effect, the idf values of the query are stored in self.idf.

        Parameters:
        -----------
        query: list of token representing the query. For example: ['look', 'blue', 'sky']
        doc_id: integer, document id.

        Returns:
        -----------
        score: float, bm25 score.
        """
        score = 0.0
        doc_len = DL[str(doc_id)]

        self.idf = self.calc_idf(query)
        # Document-length normalization; identical for every term of the query.
        length_norm = self.k1 * (1 - self.b + self.b * doc_len / self.AVGDL)

        for term in query:
            if term not in self.index.term_total:
                continue
            position = self._term_position.get(term)
            term_idf = self.idf.get(term)
            if position is None or term_idf is None:
                # The term has no posting list or no document frequency:
                # it cannot contribute to the score.
                continue
            freq = dict(self.pls[position]).get(doc_id)
            if freq is None:
                continue
            score += (term_idf * freq * (self.k1 + 1)) / (freq + length_norm)
        return score

The cells below are tests! Later on we need to move them.

In [None]:
# idx_title = InvertedIndex.read_index('title_index', 'title')
# idx_body = InvertedIndex.read_index('body_index', 'body')
# # read posting lists from disk
# words, pls = zip(*idx_title.posting_lists_iter())

In [None]:
# bm25_title = BM25_from_index(idx_title)
# bm25_queries_score_train = bm25_title.search(cran_txt_query_text_train)

#### TF-IDF for Cranfield data 



The cells below contain the following functions: 

*   *generate_query_tfidf_vector* - Generate a vector representing the query
*   *get_candidate_documents_and_scores* - Generate a dictionary representing a pool of candidate documents for a given query.
*   *generate_document_tfidf_matrix* - Generate a DataFrame of tfidf scores for a given query.
*   *cosine_similarity* - Calculate the cosine similarity for each candidate document in D and a given query (e.g., Q).

*   *get_top_n* - Sort and return the highest N documents according to the cosine similarity score.

*   *get_topN_score_for_queries* - Generate a dictionary that gather for every query its topN score.

In [None]:
def generate_query_tfidf_vector(query_to_search, index):
    """
    Generate a vector representing the query. Each entry within this vector represents a tfidf score.
    The terms representing the query will be the unique terms in the index
    (one entry per key of index.term_total, in the iteration order of that mapping).

    We will use tfidf on the query as well.
    For calculation of IDF, use log with base 10.
    tf will be normalized based on the length of the query.
    The number of documents is taken from the module-level dictionary `DL`.

    Parameters:
    -----------
    query_to_search: list of tokens (str). This list will be preprocessed in advance (e.g., lower case, filtering stopwords, etc.').
                     Example: 'Hello, I love information retrival' --->  ['hello','love','information','retrieval']

    index:           inverted index loaded from the corresponding files.

    Returns:
    -----------
    vectorized query with tfidf scores (numpy array; entries of terms that are
    not in the query stay 0).
    """
    epsilon = .0000001
    vocabulary = index.term_total
    Q = np.zeros(len(vocabulary))
    # One pass over the vocabulary instead of a linear list.index() per token.
    term_position = {term: position for position, term in enumerate(vocabulary)}
    query_length = len(query_to_search)
    corpus_size = len(DL)
    for token, occurrences in Counter(query_to_search).items():
        position = term_position.get(token)
        if position is None:
            # avoid terms that do not appear in the index.
            continue
        tf = occurrences / query_length  # term frequency divided by the length of the query
        idf = math.log(corpus_size / (index.df[token] + epsilon), 10)  # smoothing
        Q[position] = tf * idf
    return Q

In [None]:
def get_posting_iter(index):
    """
    Read every (term, posting list) pair produced by index.posting_lists_iter()
    and split the pairs into two parallel tuples.

    Parameters:
    ----------
    index: inverted index

    Returns:
    ----------
    (words, pls): tuple of terms and tuple of posting lists, where pls[i] is the
                  posting list of words[i].
    """
    term_posting_pairs = index.posting_lists_iter()
    words, pls = zip(*term_posting_pairs)
    return words, pls

In [None]:
def get_candidate_documents_and_scores(query_to_search, index, words, pls):
    """
    Generate a dictionary representing a pool of candidate documents for a given query. This function will go through every
    unique token in query_to_search and fetch the corresponding information (term frequency and document frequency)
    needed to calculate TF-IDF from the posting list. Then it will populate the dictionary 'candidates.'
    For calculation of IDF, use log with base 10.
    tf will be normalized based on the length of the document, which is read from the
    module-level dictionary `DL` (looked up with str(doc_id) keys).

    Parameters:
    -----------
    query_to_search: list of tokens (str). This list will be preprocessed in advance (e.g., lower case, filtering stopwords, etc.').
                     Example: 'Hello, I love information retrival' --->  ['hello','love','information','retrieval']

    index:           inverted index loaded from the corresponding files.

    words,pls:       parallel sequences of terms and posting lists (pls[i] is the posting list of words[i]);
                     every posting list is a sequence of (doc_id, frequency) pairs.

    Returns:
    -----------
    dictionary of candidates. In the following format:
                                                               key: pair (doc_id,term)
                                                               value: tfidf score.
    """
    candidates = {}
    corpus_size = len(DL)
    for term in np.unique(query_to_search):
        try:
            # A single scan of `words` both checks membership and finds the position.
            postings = pls[words.index(term)]
        except ValueError:
            # The term has no posting list, so it contributes no candidates.
            continue
        if not postings:
            continue
        # The IDF depends only on the term, so compute it once per term.
        idf = math.log(corpus_size / index.df[term], 10)
        for doc_id, freq in postings:
            key = (doc_id, term)
            candidates[key] = candidates.get(key, 0) + (freq / DL[str(doc_id)]) * idf

    return candidates

In [None]:
def generate_document_tfidf_matrix(query_to_search, index, words, pls):
    """
    Generate a DataFrame `D` of tfidf scores for a given query.
    Rows will be the documents candidates for a given query
    Columns will be the unique terms in the index.
    The value for a given document and term will be its tfidf score.

    Parameters:
    -----------
    query_to_search: list of tokens (str). This list will be preprocessed in advance (e.g., lower case, filtering stopwords, etc.').
                     Example: 'Hello, I love information retrival' --->  ['hello','love','information','retrieval']

    index:           inverted index loaded from the corresponding files.

    words,pls:       parallel sequences of terms and posting lists, passed on to
                     get_candidate_documents_and_scores.

    Returns:
    -----------
    DataFrame of tfidf scores (index: sorted unique candidate doc ids, columns: terms of index.term_total).
    """
    vocabulary = list(index.term_total.keys())
    # We do not need to utilize all documents, only the documents which share terms with the query.
    candidates_scores = get_candidate_documents_and_scores(query_to_search, index, words, pls)
    unique_candidates = np.unique([doc_id for doc_id, _term in candidates_scores])

    # Fill a plain numpy matrix by position and wrap it once. The previous
    # chained assignment `D.loc[doc_id][term] = value` writes into an
    # intermediate Series, which pandas does not guarantee to propagate back
    # to the DataFrame (and never does under Copy-on-Write).
    row_of = {doc_id: row for row, doc_id in enumerate(unique_candidates)}
    column_of = {term: column for column, term in enumerate(vocabulary)}
    matrix = np.zeros((len(unique_candidates), len(vocabulary)))
    for (doc_id, term), tfidf in candidates_scores.items():
        column = column_of.get(term)
        if column is None:
            # The term is not part of the index vocabulary: it has no column in D.
            continue
        matrix[row_of[doc_id], column] = tfidf

    return pd.DataFrame(matrix, index=unique_candidates, columns=vocabulary)

`cosine_similarity` -  This function calculates the cosine similarity between each candidate document in D and a given query (e.g., Q) and returns a dictionary of cosine similarity scores.

In [None]:
def cosine_similarity(D, Q):
    """
    Calculate the cosine similarity for each candidate document in D and a given query (e.g., Q).
    Generate a dictionary of cosine similarity scores
    key: doc_id
    value: cosine similarity score

    When the document vector or the query vector has zero norm the similarity
    is undefined; such documents get a score of 0.0 instead of NaN.

    NOTE: this definition shadows the `cosine_similarity` imported from
    sklearn.metrics.pairwise at the top of the file.

    Parameters:
    -----------
    D: DataFrame of tfidf scores (one row per candidate document, one column per term).

    Q: vectorized query with tfidf scores (one entry per column of D).

    Returns:
    -----------
    dictionary of cosine similarity score as follows:
                                                                key: document id (e.g., doc_id)
                                                                value: cosine similarty score.
    """
    doc_matrix = D.to_numpy(dtype=float)
    query = np.asarray(Q, dtype=float)

    # The query norm is the same for every document, so compute it once.
    query_norm = np.sqrt(np.sum(np.power(query, 2)))
    doc_norms = np.sqrt(np.sum(np.power(doc_matrix, 2), axis=1))

    numerators = doc_matrix.dot(query)
    denominators = doc_norms * query_norm

    # Divide only where the denominator is non-zero; the remaining scores stay 0.0.
    scores = np.zeros(len(denominators))
    np.divide(numerators, denominators, out=scores, where=denominators > 0)

    return dict(zip(D.index, scores))

In [None]:
def get_top_n(sim_dict, N=3):
    """
    Rank the documents by similarity score, highest first, and keep the first N.
    Scores are rounded to 5 digits after the decimal point before ranking.

    Parameters:
    -----------
    sim_dict: a dictionary of similarity score as follows:
                                                                key: document id (e.g., doc_id)
                                                                value: similarity score.

    N: Integer (how many documents to retrieve). By default N = 3

    Returns:
    -----------
    a ranked list of pairs (doc_id, score) of at most N elements.
    """
    rounded_pairs = []
    for doc_id, score in sim_dict.items():
        rounded_pairs.append((doc_id, round(score, 5)))
    # The sort is stable: documents with equal scores keep their order in sim_dict.
    rounded_pairs.sort(key=itemgetter(1), reverse=True)
    return rounded_pairs[:N]

`get_topN_score_for_queries` - This function generate a dictionary that gather for every query its topN score, based on cosine similarity.

In [None]:
def get_topN_score_for_queries(queries_to_search, index, N=3):
    """
    Generate a dictionary that gathers for every query its topN score.

    For every query: build the tfidf query vector, build the tfidf matrix of its
    candidate documents, compute the cosine similarity of every candidate with
    the query and keep the N best documents.

    Parameters:
    -----------
    queries_to_search: a dictionary of queries as follows:
                                                        key: query_id
                                                        value: list of tokens.
    index:           inverted index loaded from the corresponding files.
    N: Integer. How many documents to retrieve. This argument is passed to the topN function. By default N = 3, for the topN function.

    Returns:
    -----------
    return: a dictionary of queries and topN pairs as follows:
                                                        key: query_id
                                                        value: list of pairs in the following format:(doc_id, score).
                                                               A query with fewer than N candidates gets a shorter list.
    """
    queries_topN_candidates_docs = {}
    if not queries_to_search:
        return queries_topN_candidates_docs

    # The posting lists do not depend on the query: read them once, not once per query.
    words, pls = get_posting_iter(index)

    for query_id, tokens_of_query in queries_to_search.items():
        vectorized_query_tfidf = generate_query_tfidf_vector(tokens_of_query, index)
        tfidf_docs_per_query_matrix = generate_document_tfidf_matrix(tokens_of_query, index, words, pls)
        similarity_per_doc = cosine_similarity(tfidf_docs_per_query_matrix, vectorized_query_tfidf)

        # N is passed through unchanged. get_top_n slices the ranked list, so a
        # query with fewer than N candidates simply gets a shorter list, and it
        # must not shrink the cutoff used for the following queries.
        queries_topN_candidates_docs[query_id] = get_top_n(similarity_per_doc, N)

    return queries_topN_candidates_docs

The cell below is for tests! Later on we will move it to the test class.

In [None]:
# # this is for us to understand better how to do testings over our cosine similarity
# tfidf_queries_score_train = get_topN_score_for_queries(cran_txt_query_text_train,idx_title)

The cell below is for tests! Later on we will move it to the test class.

For query 172 we can observe two document with cosine similarity score of 1. Let's have a glance on this query and documents for making sure it makes sense

In [None]:
# print('relevnt documents and tfidf score for query number 172 :',tfidf_queries_score_train[172])
# print('query: ' ,cran_txt_query_text_train[172])
# print('docuemnt 320: ', cran_txt_data_titles['320'])
# print('docuemnt 320: ', cran_txt_data_titles['321'])
# print('docuemnt 322: ', cran_txt_data_titles['322'])

relevnt documents and tfidf score for query number 172 : [(320, 1.0), (321, 1.0), (322, 1.0)]
query:  ['solution', 'blasius', 'problem', 'three-point', 'boundary', 'conditions']
docuemnt 320:  ['comment', 'improved', 'numerical', 'solution', 'blasius', 'problem', 'three-point', 'boundary', 'conditions']
docuemnt 320:  ['improved', 'numerical', 'solution', 'blasius', 'problem', 'three-point', 'boundary', 'conditions']
docuemnt 322:  ['numerical', 'solution', 'blasius', 'problem', 'three-point', 'boundary', 'conditions']


## Task 3: Using weights of title and body scores

Now we will experiment with two sets of results. 
The first corresponds to results from the title index. 
The second corresponds to the results from the body index.

We need to merge them into a single result set.

`merge_results` - 
This function merges and sorts the retrieved documents by their weighted score (e.g., title and body).


In [None]:
def merge_results(title_scores, body_scores, title_weight=0.5, text_weight=0.5, N=3):
    """
    This function merge and sort documents retrieved by its weighte score (e.g., title and body).

    The title and body results are matched by query_id (not by their position in
    the dictionaries), merged with merge_tuples and truncated to N pairs.

    Parameters:
    -----------
    title_scores: a dictionary build upon the title index of queries and tuples representing scores as follows:
                                                                            key: query_id
                                                                            value: list of pairs in the following format:(doc_id,score)

    body_scores: a dictionary build upon the body/text index of queries and tuples representing scores as follows:
                                                                            key: query_id
                                                                            value: list of pairs in the following format:(doc_id,score)
    title_weight: float, for weigted average utilizing title and body scores
    text_weight: float, for weigted average utilizing title and body scores
    N: Integer. How many document to retrieve. By default N = 3.

    Returns:
    -----------
    dictionary of querires and topN pairs as follows:
                                                        key: query_id
                                                        value: list of pairs in the following format:(doc_id,score).
    """
    merged = {}

    # Queries of the title results first (in their order), then the queries that
    # only have body results.
    query_ids = list(title_scores)
    query_ids.extend(query_id for query_id in body_scores if query_id not in title_scores)

    for query_id in query_ids:
        # Look both result lists up by query id: zipping the two dictionaries
        # pairs them by position and mixes up queries whenever the dictionaries
        # differ in key order or in size.
        # NOTE(review): a query missing from one side is merged against an
        # empty list; confirm merge_tuples handles an empty input list.
        title_hits = title_scores.get(query_id, [])
        body_hits = body_scores.get(query_id, [])
        merged[query_id] = merge_tuples(
            (query_id, title_hits), (query_id, body_hits), title_weight, text_weight
        )[:N]

    return merged




def merge_tuples(tuple_title, tuple_body, t_weight, b_weight):
    """
    Combine the title hits and body hits of one query into a single ranking.

    Every (doc_id, score) pair contributes score * weight to its doc_id
    (t_weight for title pairs, b_weight for body pairs); contributions for
    the same doc_id are summed.

    Parameters:
    -----------
    tuple_title: pair whose second element is the list of (doc_id, score)
                 pairs from the title index. The first element is not read.
    tuple_body: pair whose second element is the list of (doc_id, score)
                pairs from the body index. The first element is not read.
    t_weight: float, multiplier for title scores.
    b_weight: float, multiplier for body scores.

    Returns:
    -----------
    list of (doc_id, combined_score) tuples sorted by combined_score in
    descending order. Documents with equal combined scores keep the order in
    which they were first met while walking both lists.
    """
    title_pairs = tuple_title[1]
    body_pairs = tuple_body[1]

    # doc_id -> accumulated weighted score. Dict insertion order records the
    # order in which documents are first seen, which decides ties after the
    # (stable) sort below.
    combined = {}

    def _accumulate(pair, weight):
        # Add one weighted hit to the running total of its document.
        doc_id = pair[0]
        weighted = pair[1] * weight
        if doc_id in combined:
            combined[doc_id] += weighted
        else:
            combined[doc_id] = weighted

    # Walk both lists together, always consuming the hit with the larger raw
    # score; on equal scores the body hit is consumed first.
    i, j = 0, 0
    while i < len(title_pairs) and j < len(body_pairs):
        if title_pairs[i][1] > body_pairs[j][1]:
            _accumulate(title_pairs[i], t_weight)
            i += 1
        else:
            _accumulate(body_pairs[j], b_weight)
            j += 1

    # At most one of the two lists still has unconsumed hits.
    for pair in title_pairs[i:]:
        _accumulate(pair, t_weight)
    for pair in body_pairs[j:]:
        _accumulate(pair, b_weight)

    return sorted(combined.items(), key=lambda item: item[1], reverse=True)

The cells below hold the configuration that must run before the tests. We will need to relocate them later on.

In [None]:
# #Do Not run this cell for nothing!
# bm25_title = BM25_from_index(idx_title)
# bm25_body = BM25_from_index(idx_body)

# bm25_queries_score_train_title = bm25_title.search(cran_txt_query_text_train)
# bm25_queries_score_train_body = bm25_body.search(cran_txt_query_text_train)

The cells below hold the tests. We will need to relocate them later on.

In [None]:
# #tests
# w1,w2 = 0.5, 0.5
# w3,w4 = 0.25,0.75

# half_and_half = merge_results(bm25_queries_score_train_title,bm25_queries_score_train_body,w1,w2)        
# assert len(half_and_half[2]) == 3
# assert type(half_and_half) == dict
# assert type(half_and_half[2]) == list
# assert len(half_and_half) == 180
# assert half_and_half[2][0][1] == 0.5 * (bm25_queries_score_train_title[2][-1][1]+ bm25_queries_score_train_body[2][0][1])

# quarter_and_three_quarters = merge_results(bm25_queries_score_train_title,bm25_queries_score_train_body,0.25,0.75)        

# assert quarter_and_three_quarters[2][0][1] == (w3 * bm25_queries_score_train_title[2][-1][1] + w4 * bm25_queries_score_train_body[2][0][1])
# assert {k for k,v in half_and_half[16]} != {k for k,v in quarter_and_three_quarters[16]}
# assert len({k for k,v in half_and_half[16]}.union({k for k,v in quarter_and_three_quarters[16]})) < (len({k for k,v in half_and_half[16]}) + len({k for k,v in quarter_and_three_quarters[16]}))

Here we provide three examples of mistakes that the model makes, explain why they occur, and describe how we would change the model based on these observations.

In [None]:
# YOUR CODE HERE
# 1. Relevance: One mistake that the model might make is returning documents that are not relevant to the query.
#    This could be due to the model not accurately understanding the meaning of the query or the content of the documents.
#    To improve the model's performance, we can consider using techniques such as query expansion or document summarization to better capture the context and meaning of the query and documents.
# 2. Precision: Another mistake that the model might make is returning too many documents that are not relevant to the query,
#    which reduces the precision of the search results.
#    To improve precision, we can use techniques such as relevance feedback or query refinement to help the model better understand the user's intent.
# 3. Recall: Our model might also fail to return all of the relevant documents for a given query,
#    which reduces the recall of the search results.
#    To improve recall, we can consider using techniques such as pseudo-relevance feedback or expanding the search to include additional sources of information.