In [1]:
from IPython.display import HTML

# Injects a script and a button that hide/show every notebook input cell
# (`div.input`). `code_show` starts as true and `code_toggle` is run once when
# the document is ready, so the inputs are hidden as soon as the page loads;
# each button click then flips the state. The script relies on `$` (jQuery)
# being available in the page that renders the notebook.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [2]:
from ast import literal_eval
from math import log

from IPython.display import Markdown, display, HTML
import pandas as pd

## Load Data

In [3]:
# The postings column is stored in the CSV as the text of a Python literal;
# parse every cell back into a list of (doc_id, freq) tuples.
inv_index = pd.read_csv("../output/inverted_index.csv").set_index('word')
inv_index["doc_id:freq"] = inv_index["doc_id:freq"].map(literal_eval)

display(Markdown("## Inverted Index"))
display(HTML(inv_index.head().to_html()))

## Inverted Index

Unnamed: 0_level_0,doc_id:freq
word,Unnamed: 1_level_1
juíza,"[(0, 2), (1, 1)]"
federal,"[(0, 1), (1, 1), (2, 1), (6, 2), (14, 1), (36,..."
Ivani,"[(0, 1), (1, 1)]"
Silva,"[(0, 3), (1, 1), (5, 1), (13, 2), (25, 1), (72..."
Luz,"[(0, 3), (1, 1), (124, 1)]"


### Adding Inverse Document Frequency  (IDF)

In [4]:
# number of documents in the collection
# Number of documents in the collection: count the distinct doc ids found in
# any posting list. A set comprehension visits each posting once, whereas
# summing a Series of lists re-copies the growing list on every addition.
n_docs = len({posting[0] for postings in inv_index["doc_id:freq"] for posting in postings})
display(Markdown("* Let's employ the number  of documents in this particular \
                  collection (N={}) in our calculations".format(n_docs)))

* Let's employ the number  of documents in this particular                   collection (N=249) in our calculations

In [5]:
# IDF(t) = ln((N + 1) / df(t)), where df(t) is the length of t's posting list.
inv_index["IDF"] = inv_index["doc_id:freq"].map(
    lambda postings: log((n_docs + 1) / len(postings))
)
inv_index.sample(5)

Unnamed: 0_level_0,doc_id:freq,IDF
word,Unnamed: 1_level_1,Unnamed: 2_level_1
Gente,"[(188, 1)]",5.521461
satisfação,"[(62, 1)]",5.521461
paralelos,"[(24, 1), (162, 1)]",4.828314
serviria,"[(55, 1)]",5.521461
PSG,"[(53, 1), (117, 1)]",4.828314


<br>

***

<br>

## Strategies Implementation

### Binary Vector Space Model

In [6]:
def binary_vsm(**kwargs):
    """Scores a document against a query with the binary vsm

    Counts how many distinct query terms occur at least once in the
    document (repeated query terms are counted a single time).

    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.

    :return: relevance score, how relevant the document is
             for that query

    :rtype: number
    """

    inverted = kwargs["index"]
    terms = kwargs["query"]
    target = kwargs["doc_id"]

    # Keep only the query terms that exist in the index; unknown terms come
    # back from reindex as NaN rows and are dropped.
    by_word = inverted.reset_index().set_index('word')
    postings = by_word.reindex(labels=set(terms)).dropna()["doc_id:freq"]

    def occurs_in_target(entries):
        # 1 when any (doc_id, freq) posting belongs to the target document
        return 1 if any(entry[0] == target for entry in entries) else 0

    return postings.apply(occurs_in_target).sum()

### Term Frequency Vector Space Model

In [7]:
def tf_vsm(**kwargs):
    """Scores a document against a query with the tf vsm

    Term frequency vector space model: for every distinct query term,
    multiplies its number of occurrences in the query by its frequency
    in the document, and adds the products up.

    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.

    :return: relevance score, how relevant the document is
             for that query

    :rtype: number
    """

    inverted = kwargs["index"]
    terms = kwargs["query"]
    target = kwargs["doc_id"]

    # How many times each term appears in the query
    query_counts = {}
    for word in terms:
        query_counts[word] = query_counts.get(word, 0) + 1

    def freq_in_target(entries):
        # Frequency stored in the first posting of the target doc, 0 if none
        hits = [entry for entry in entries if entry[0] == target]
        return hits[0][1] if hits else 0

    # Per-term frequency inside the target document, for indexed terms only
    by_word = inverted.reset_index().set_index('word')
    postings = by_word.reindex(labels=set(terms)).dropna()["doc_id:freq"]
    doc_counts = postings.apply(freq_in_target)

    total = 0
    for word, count in query_counts.items():
        try:
            in_doc = doc_counts.xs(word)
        except KeyError:
            # the term is not in the index at all
            in_doc = 0

        total += count * in_doc

    return total

### Term Frequency - Inverse Document Frequency Vector Space Model

In [8]:
def tf_idf_vsm(**kwargs):
    """Scores a document against a query with the tf-idf vsm

    Term frequency - inverse document frequency vector space model:
    for every distinct query term, multiplies its number of occurrences
    in the query, its frequency in the document and its IDF, and adds
    the products up.

    :param pandas.core.frame.DataFrame index: inverted index
                                              (with an 'IDF' column).
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.

    :return: relevance score, how relevant the document is
             for that query

    :rtype: number
    """

    inverted = kwargs["index"]
    terms = kwargs["query"]
    target = kwargs["doc_id"]

    # How many times each term appears in the query
    query_counts = {}
    for word in terms:
        query_counts[word] = query_counts.get(word, 0) + 1

    def freq_in_target(entries):
        # Frequency stored in the first posting of the target doc, 0 if none
        hits = [entry for entry in entries if entry[0] == target]
        return hits[0][1] if hits else 0

    # Rows of the indexed query terms, with the postings column replaced by
    # the term's frequency inside the target document
    by_word = inverted.reset_index().set_index('word')
    matched = by_word.reindex(labels=set(terms)).dropna()
    matched["doc_id:freq"] = matched["doc_id:freq"].apply(freq_in_target)

    def cell(word, column):
        # Value of `column` for `word`; 0 when the lookup raises KeyError
        try:
            return matched.xs(word)[column]
        except KeyError:
            return 0

    total = 0
    for word, count in query_counts.items():
        in_doc = cell(word, 'doc_id:freq')
        idf = cell(word, 'IDF')
        total += count * in_doc * idf

    return total

### Best Matching 25 (BM25) Vector Space Model

In [9]:
def bm25_vsm(**kwargs):
    """Scores a document against a query with the bm25 vsm

    'Best Matching 25' model as implemented here: for every distinct
    query term, the frequency in the document (tf) is saturated as
    ``tf * (k + 1) / (tf + k)`` and multiplied by the term's number of
    occurrences in the query and by its IDF; the products are added up.

    :param pandas.core.frame.DataFrame index: inverted index
                                              (with an 'IDF' column).
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.
    :param number k: term frequency saturation factor

    :return: relevance score, how relevant the document is
             for that query

    :rtype: number
    """

    inverted = kwargs["index"]
    terms = kwargs["query"]
    target = kwargs["doc_id"]
    k = kwargs["k"]

    # How many times each term appears in the query
    query_counts = {}
    for word in terms:
        query_counts[word] = query_counts.get(word, 0) + 1

    def freq_in_target(entries):
        # Frequency stored in the first posting of the target doc, 0 if none
        hits = [entry for entry in entries if entry[0] == target]
        return hits[0][1] if hits else 0

    # Rows of the indexed query terms, with the postings column replaced by
    # the term's frequency inside the target document
    by_word = inverted.reset_index().set_index('word')
    matched = by_word.reindex(labels=set(terms)).dropna()
    matched["doc_id:freq"] = matched["doc_id:freq"].apply(freq_in_target)

    def cell(word, column):
        # Value of `column` for `word`; 0 when the lookup raises KeyError
        try:
            return matched.xs(word)[column]
        except KeyError:
            return 0

    total = 0
    for word, count in query_counts.items():
        raw_tf = cell(word, 'doc_id:freq')
        # The explicit zero case also avoids 0 / 0 when k == 0
        saturated_tf = 0 if raw_tf == 0 else (raw_tf * (k + 1)) / (raw_tf + k)
        idf = cell(word, 'IDF')

        total += count * saturated_tf * idf

    return total

### Strategy Wrapper

In [10]:
from bisect import insort_left


def get_n_best_docs(**kwargs):
    """Retrieves n most relevant docs according to a strategy

    Scores every candidate document of a collection with the given
    strategy and keeps the n highest-scoring ones.

    :param int n: maximum number of documents to return.
    :param function strategy: vsm strategy; it is called as
                              ``strategy(doc_id=doc, **other_kwargs)``,
                              so its own params must be supplied as well.
    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param list(int) docs: All documents to be evaluated
                           (extracted from index if not specified).

    :return: up to n ``(score, doc_id)`` pairs in ascending order, i.e.
             the most relevant document is the LAST element; equal
             scores are ordered by doc id. Empty list when n <= 0.

    :rtype: list((number,number))
    """

    n = kwargs.pop("n")
    strategy = kwargs.pop("strategy")

    if "docs" in kwargs:
        docs = kwargs.pop("docs")
    else:
        # Read the index from kwargs (there is no local/global `index`) and
        # leave it there: the strategy needs it as well.
        postings_col = kwargs["index"]["doc_id:freq"]
        docs = sorted({posting[0] for postings in postings_col for posting in postings})

    # `some_list[-0:]` is the whole list, so "no documents" is handled apart.
    if n <= 0:
        return []

    ranked = sorted((strategy(doc_id=doc, **kwargs), doc) for doc in docs)
    return ranked[-n:]

## Strategy Evaluation

In [11]:
# Article table behind the inverted index (presumably one row per document,
# with the row label acting as the doc id -- confirm against the indexer).
source = pd.read_csv("../output/results.csv")
display(Markdown("##### Inverted Index's source data"))
source.loc[:, "title"].head(5)

##### Inverted Index's source data

0    “A sociedade foi Rubens Paiva  não os facínora...
1    Justiça suspende decisão que proibia Forças Ar...
2    Governo Bolsonaro prega “negacionismo históric...
3    Quando os pais de Gabo perceberam que tinham u...
4    Rádios canadenses banem músicas de Michael Jac...
Name: title, dtype: object

Let's gather the ids of all documents to supply to the algorithms:

In [12]:
# Distinct doc ids across every posting list, in ascending order. The set
# comprehension visits each posting once instead of concatenating lists
# through Series.sum(), and sorting makes the order deterministic.
all_docs = sorted({posting[0] for postings in inv_index["doc_id:freq"] for posting in postings})
pd.DataFrame(all_docs, columns=["docs"]).describe()

Unnamed: 0,docs
count,249.0
mean,124.0
std,72.024301
min,0.0
25%,62.0
50%,124.0
75%,186.0
max,248.0


### Reciprocal Rank

In [40]:
def reciprocal_rank(tgt_doc, retrieved_docs):
    """Computes the reciprocal rank of a target document

    :param tgt_doc: id of the document that should have been retrieved.
    :param iterable retrieved_docs: retrieved document ids, best first.

    :return: 1 / (1-based position of the first occurrence of tgt_doc
             in retrieved_docs), or 0 if it was not retrieved.

    :rtype: number
    """
    for rank, doc in enumerate(retrieved_docs, 1):
        if doc == tgt_doc:
            # Return at the first hit: a later duplicate of the target must
            # not replace the best (lowest) rank.
            return 1 / rank

    return 0

In [13]:
# Document the query below is expected to retrieve
doc_id = 111

display(Markdown("##### Target Document (ID={})".format(doc_id)))
source.loc[doc_id]

##### Target Document (ID=111)

title       Idoso é preso por ajudar esposa a fazer eutaná...
subtitle    O espanhol Ángel Hernández atendeu ao desejo d...
author                                       Emilio de Benito
date                                      04/04/2019 14:39:50
section                                         Internacional
text        Ángel Hernández e María José Carrasco estavam ...
url         https://brasil.elpais.com/brasil/2019/04/04/in...
Name: 111, dtype: object

In [14]:
query = ['suicídio','eutanásia','Espanha']
# The heading has no placeholder, so it is displayed as-is (no .format call).
display(Markdown("##### The query used to retrieve the target document:"))
display(Markdown(" * {}".format(query)))

##### The query used to retrieve the target document:

 * ['suicídio', 'eutanásia', 'Espanha']

> We shall use the classic value of k=1.2 for the BM25 algorithm.

In [23]:
strategies = [binary_vsm, tf_vsm, tf_idf_vsm, bm25_vsm]
columns = ["Binary", "TF", "TF-IDF", "BM25(k=1.2)"]
# k is only read by bm25_vsm; the other strategies ignore the extra kwarg
k_values = [None, None, None, 1.2]

rows = []
for idx, strategy_fn in enumerate(strategies):
    # n=n_docs requests the complete ranking, so despite its name `top_10`
    # holds every document, not just ten.
    top_10 = get_n_best_docs(strategy=strategy_fn, index=inv_index, query=query,
                             docs=all_docs, n=n_docs, k=k_values[idx])

    # get_n_best_docs yields ascending (score, doc) pairs: put the best
    # first and keep only the doc ids
    top_10 = [pair[1] for pair in top_10[::-1]]

    rows.append(top_10)

# One column per strategy, one row per rank position
docs_q1 = pd.DataFrame(list(zip(*rows)), columns=columns)
display(Markdown("##### Top 5 best docs by strategy for given query"))
display(HTML(docs_q1.head().to_html(index=False)))

##### Top 5 best docs by strategy for given query

Binary,TF,TF-IDF,BM25(k=1.2)
111,248,248,111
248,68,111,222
247,122,222,79
244,128,79,237
242,111,68,195


In [62]:
# One single-element list per strategy (same order as `columns`), scoring the
# target `doc_id` defined earlier instead of repeating the literal 111.
recip_rank_scores = [[reciprocal_rank(doc_id, ranked_docs)] for ranked_docs in rows]

# NOTE: this rebinds `docs_q1`, replacing the per-strategy ranking table.
docs_q1 = pd.DataFrame(list(zip(*recip_rank_scores)),columns=columns)
display(Markdown("##### Reciprocal Rank score by strategy for query"))
display(HTML(docs_q1.head().to_html(index=False)))

##### Reciprocal Rank score by strategy for query

Binary,TF,TF-IDF,BM25(k=1.2)
1.0,0.2,0.5,1.0


In [70]:
import json


# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('../output/results_final.json', encoding='utf-8') as json_file:
    answer_key = json.load(json_file)