In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [2]:
from functools import reduce
from ast import literal_eval
from math import log

import pandas as pd
from numpy import mean
from nltk import word_tokenize
from nltk.corpus import stopwords
from IPython.display import Markdown, display, HTML

default_stopwords = set(stopwords.words('portuguese'))

## Load Data

In [3]:
inv_index = pd.read_csv("../output/inverted_index.csv").set_index(['word'])
inv_index["doc_id:freq"] = inv_index["doc_id:freq"].apply(lambda x: literal_eval(x))

display(Markdown("## Inverted Index"))
display(HTML(inv_index.head(5).to_html()))

## Inverted Index

Unnamed: 0_level_0,doc_id:freq
word,Unnamed: 1_level_1
juíza,"[(0, 2), (1, 1)]"
federal,"[(0, 1), (1, 1), (2, 1), (6, 2), (14, 1), (36,..."
Ivani,"[(0, 1), (1, 1)]"
Silva,"[(0, 3), (1, 1), (5, 1), (13, 2), (25, 1), (72..."
Luz,"[(0, 3), (1, 1), (124, 1)]"


### Adding Inverse Document Frequency  (IDF)

In [4]:
# number of documents in the collection
n_docs = inv_index["doc_id:freq"].apply(lambda x: list(i[0] for i in x)).sum()
n_docs = len(set(n_docs))
display(Markdown("* Let's employ the number  of documents in this particular \
                  collection (N={}) in our calculations".format(n_docs)))

* Let's employ the number  of documents in this particular                   collection (N=249) in our calculations

In [5]:
inv_index["IDF"] = inv_index["doc_id:freq"].apply(lambda x: log((n_docs + 1)/len(x)))
inv_index.sample(5)

Unnamed: 0_level_0,doc_id:freq,IDF
word,Unnamed: 1_level_1,Unnamed: 2_level_1
Beija-Flor,"[(6, 3)]",5.521461
provocativo,"[(107, 1)]",5.521461
petrolíferos,"[(248, 1)]",5.521461
tumba,"[(153, 1), (183, 1)]",4.828314
Fabiana,"[(82, 1)]",5.521461


<br>

***

<br>

## Strategies Implementation

### Binary Vector Space Model

In [6]:
def binary_vsm(**kwargs):
    """Applies the binary vsm to a query and document

    Applies the 'binary vector space model' to a query 
    and document, calculating a score.

    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.
    
    :return: relevance score, how relevant the document is 
             for that query

    :rtype: number
    """
    
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]

    result = index.reset_index().set_index('word').reindex(labels=set(query)).dropna()["doc_id:freq"]
    result = result.apply(lambda x: 0 if not list(w for w in x if w[0] == doc_id) else 1)
    result = result.sum()
    
    return result

### Term Frequency Vector Space Model

In [7]:
def tf_vsm(**kwargs):
    """Applies the tf vsm to a query and document

    Applies the 'term frequency vector space model' to a query 
    and document, calculating a score.

    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.
    
    :return: relevance score, how relevant the document is 
             for that query

    :rtype: number
    """
    
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]
    
    # Calculate frequency in query
    query_fd = {}
    for term in query:
        if term not in query_fd:
            query_fd[term] = 0

        query_fd[term] += 1

    
    # Get document frequency from inverted index
    term_f_in_doc = index.reset_index().set_index('word').reindex(labels=set(query)).dropna()["doc_id:freq"]\
                      .apply(lambda x: list(w for w in x if w[0] == doc_id))\
                      .apply(lambda x: x[0][1] if not (len(x) == 0) else 0) # Extract freq 

    # Combine doc freq and query frequency to make score
    score = 0
    for term, freq in query_fd.items():
        query_score = freq 
        doc_score = None

        # try catch for when a term doesn't match at all
        try:
            doc_score = term_f_in_doc.xs(term)
        except KeyError as exc:
            doc_score = 0

        score += query_score * doc_score

    return score

### Term Frequency - Inverse Document Frequency Vector Space Model

In [8]:
def tf_idf_vsm(**kwargs):
    """Applies the tf-idf vsm to a query and document

    Applies the 'term frequency vector inverse document frequency
    space model' to a query and document, calculating a score.

    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.
    
    :return: relevance score, how relevant the document is 
             for that query

    :rtype: number
    """    
    
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]
    
    # Calculate frequency in query
    query_fd = {}
    for term in query:
        if term not in query_fd:
            query_fd[term] = 0

        query_fd[term] += 1

    # Get document frequency from inverted index
    term_f_in_doc = index.reset_index().set_index('word').reindex(labels=set(query)).dropna()
    term_f_in_doc["doc_id:freq"] = term_f_in_doc["doc_id:freq"].\
                                   apply(lambda x: list(w for w in x if w[0] == doc_id))
    term_f_in_doc["doc_id:freq"] = term_f_in_doc["doc_id:freq"].\
                                   apply(lambda x: x[0][1] if not (len(x) == 0) else 0)

    # Combine doc freq and query frequency to make score
    score = 0
    for term, freq in query_fd.items():
        query_score = freq 
        doc_score = None
        idf_score = None

        # try catch for when a term doesn't match at all
        try:
            doc_score = term_f_in_doc.xs(term)['doc_id:freq']
        except KeyError as exc:
            doc_score = 0

        try:
            idf_score = term_f_in_doc.xs(term)['IDF']
        except KeyError as exc:
            idf_score = 0

        score += query_score * doc_score * idf_score
        
    return score

### Best Match 25 Vector Space Model

In [9]:
def bm25_vsm(**kwargs):
    """Applies the bm25 vsm to a query and document

    Applies the 'Best Matching 25 space model' to a query 
    and document, calculating a score.

    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param int doc_id: id of the document.
    :param int k: term frequency saturation factor
    
    :return: relevance score, how relevant the document is 
             for that query

    :rtype: number
    """    
    
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]
    k = kwargs["k"]
    
    # Calculate frequency in query
    query_fd = {}
    for term in query:
        if term not in query_fd:
            query_fd[term] = 0

        query_fd[term] += 1

    # Get document frequency from inverted index
    term_f_in_doc = index.reset_index().set_index('word').reindex(labels=set(query)).dropna()
    term_f_in_doc["doc_id:freq"] = term_f_in_doc["doc_id:freq"].\
                                   apply(lambda x: list(w for w in x if w[0] == doc_id))
    term_f_in_doc["doc_id:freq"] = term_f_in_doc["doc_id:freq"].\
                                   apply(lambda x: x[0][1] if not (len(x) == 0) else 0)

    # Combine doc freq and query frequency to make score
    score = 0
    for term, freq in query_fd.items():
        query_score = freq 
        doc_score = None
        idf_score = None

        # try catch for when a term doesn't match at all
        try:
            doc_count = term_f_in_doc.xs(term)['doc_id:freq']
            doc_score = 0 if doc_count == 0 else (doc_count * (k + 1)) / (doc_count + k)
        except KeyError as exc:
            doc_score = 0
            
        try:
            idf_score = term_f_in_doc.xs(term)['IDF']
        except KeyError as exc:
            idf_score = 0
        
        score += query_score * doc_score * idf_score

    return score

### Strategy Wrapper

In [10]:
from bisect import insort_left


def get_n_best_docs(**kwargs):
    """Retrieves n most relevant docs according to a strategy

    Retrieves the n most relevant documents in a given collection
    according to a given strategy.

    :param pandas.core.frame.DataFrame index: inverted index.
    :param list(str) query: list of terms that form a query.
    :param list(int) docs: All documents to be evaluated 
                           (extracted from index if not specified).
    :param function strategy: vsm strategy (its params must be 
                              supplied as well)
    
    :return: list of document ids by rank in decrescent order

    :rtype: list((number,number))
    """   

    n = kwargs.pop("n")
    strategy = kwargs.pop("strategy")
    
    if "docs" in kwargs:
        docs = kwargs.pop("docs")
    else:        
        docs = index["doc_id:freq"].apply(lambda x: list(w[0] for w in x)).sum()
        docs = list(set(docs))
        
    rank = []
    for doc in docs:
        score = strategy(doc_id=doc,**kwargs)
        insort_left(rank, (score,doc))
        rank = rank[-n:]

    return rank

## Strategy Evaluation

In [11]:
source = pd.read_csv("../output/results.csv")
display(Markdown("##### Inverted Index's source data"))
source["title"].head()

##### Inverted Index's source data

0    “A sociedade foi Rubens Paiva  não os facínora...
1    Justiça suspende decisão que proibia Forças Ar...
2    Governo Bolsonaro prega “negacionismo históric...
3    Quando os pais de Gabo perceberam que tinham u...
4    Rádios canadenses banem músicas de Michael Jac...
Name: title, dtype: object

Let's gather the id of all documents to supply the algorithms:

In [12]:
all_docs = inv_index["doc_id:freq"].apply(lambda x: list(w[0] for w in x)).sum()
all_docs = list(set(all_docs))
pd.DataFrame(all_docs, columns=["docs"]).describe()

Unnamed: 0,docs
count,249.0
mean,124.0
std,72.024301
min,0.0
25%,62.0
50%,124.0
75%,186.0
max,248.0


### Reciprocal Rank

In [13]:
def reciprocal_rank(tgt_doc, retrieved_docs):
    found = False
    recip_rank = None

    for rank, doc in enumerate(retrieved_docs,1): 
        if doc == tgt_doc:
            found = True
            recip_rank = 1 / rank 

    if not found:
        recip_rank = 0

    return recip_rank

In [14]:
doc_id = 111

display(Markdown("##### Target Document (ID={})".format(doc_id)))
source.xs(doc_id)

##### Target Document (ID=111)

title       Idoso é preso por ajudar esposa a fazer eutaná...
subtitle    O espanhol Ángel Hernández atendeu ao desejo d...
author                                       Emilio de Benito
date                                      04/04/2019 14:39:50
section                                         Internacional
text        Ángel Hernández e María José Carrasco estavam ...
url         https://brasil.elpais.com/brasil/2019/04/04/in...
Name: 111, dtype: object

In [15]:
query1 = ['suicídio','eutanásia','Espanha']
display(Markdown("##### Query used to retrieve the target document (Query 1):".format(doc_id)))
display(Markdown(" * {}".format(query1)))

##### Query used to retrieve the target document (Query 1):

 * ['suicídio', 'eutanásia', 'Espanha']

> We shall use the classic value of k=1.2 for the algorithm BM25 

In [16]:
strategies = [binary_vsm,tf_vsm,tf_idf_vsm,bm25_vsm]
columns = ["Binary","TF","TF-IDF","BM25(k=1.2)"]
k_values =[None,None,None,1.2]
rows = []

for idx in range(0,len(strategies)):
    top_10 = get_n_best_docs(n=n_docs, k=k_values[idx], docs=all_docs,index=inv_index, query=query1,
                             strategy=strategies[idx])
    
    top_10 = list(reversed(top_10))
    top_10 = [i[1] for i in top_10]
    
    rows.append(top_10)

docs_q1 = pd.DataFrame(list(zip(*rows)),columns=columns)
display(Markdown("##### Top 5 best docs by strategy for query1"))
display(HTML(docs_q1.head().to_html(index=False)))

##### Top 5 best docs by strategy for query1

Binary,TF,TF-IDF,BM25(k=1.2)
111,248,248,111
248,68,111,222
247,122,222,79
244,128,79,237
242,111,68,195


In [17]:
recip_rank_scores = list(map(lambda score: [score],
                         map(lambda r_docs: reciprocal_rank(111, r_docs),rows)))

docs_q1 = pd.DataFrame(list(zip(*recip_rank_scores)),columns=columns)
display(Markdown("##### Reciprocal Rank score by strategy for query1"))
display(HTML(docs_q1.head().to_html(index=False)))

##### Reciprocal Rank score by strategy for query1

Binary,TF,TF-IDF,BM25(k=1.2)
1.0,0.2,0.5,1.0


## Query Processement and Document Retrieval

#### Answer Key

In [18]:
import json


answer_key = None

with open('../output/results_final.json') as json_file:  
    answer_key = json.load(json_file)

In [19]:
def split_spread_words(corpus, delim):

    """ Split then spread alpha word with certain delimiters.

    Split words with alphabetical characters that have certain 
    delimiters then spread the resulting words across the corpus.

    :param list corpus: list of words.
    :param str delim: target delimiter.

    :return: updated list of words 

    :rtype: list
    """
    
    new_words = []
    for word in corpus:
        if any(c.isalnum() for c in word):
            new_words.extend(word.split(delim))
        else:
            new_words.append(word)

    return new_words

def word_processing(text):
    words = word_tokenize(text)

    # Remove words that don't have at least one alphabetical character 
    words = [word for word in words if any(c.isalnum() for c in word)]

    # Remove hyphen at end of word
    words = [word[:-1] if word[-1] == '-' else word for word in words]

    # Remove hyphen at beggining of word
    words = [word[1:] if word[0] == '-' else word for word in words]

    # Split words joined by en dash
    words = [word for line in words for word in line.split('–')] 
    words = [word for line in words for word in line.split('—')] # different encoding 

    # Split words joined by dot if they are alphabetical
    words = split_spread_words(words, '.')

    # Remove lone punctuation from the splits
    words = [word for word in words if any(c.isalnum() for c in word)]

    # Remove stopwords
    words = [word for word in words if word.lower() not in default_stopwords]
    
    return words

In [20]:
results = []

queries = answer_key['query']
queries = list(map(lambda q: word_processing(q), queries))
gnd_truth = list(map(lambda answ_k: list(el['URL'] for el in answ_k), answer_key['docs']))
gnd_rel = list(map(lambda docs: list({el['URL'] : el['level']} for el in docs), answer_key['docs']))
gnd_rel = list(map(lambda docs: reduce(lambda a, b: dict(a, **b), docs), gnd_rel))

display(Markdown("##### Queries used for MAP driven evaluation:"))
for q in queries:
    display(Markdown("* {}".format(', '.join(q))))

##### Queries used for MAP driven evaluation:

* território, palestino

* recessão, mundial

* ditadura, militar

* muro, lamentações

* brasil, argentina

* golpe, militar

* governo, bolsonaro

* ministro, economia

* prisão, Temer

* Congresso, Nacional

In [21]:
strat = [binary_vsm, tf_vsm, tf_idf_vsm, bm25_vsm]
strat_nm = ["Binary","TF","TF-IDF","BM25(k=1.2)"]
q_rst = {}

# Add results of 10 queries for each strategy  
for idx, strat_impl in enumerate(strat):
    q_rst[strat_nm[idx]] = []
    for q in queries:
        ranked_docs = get_n_best_docs(n=10, docs=all_docs, index=inv_index,
                                      query=q, strategy=strat_impl, k=1.2)
        ranked_docs = list(reversed(ranked_docs))
        ranked_docs = [i[1] for i in ranked_docs]
        ranked_docs = list(map(lambda doc_id: source.xs(doc_id)['url'], ranked_docs))
        q_rst[strat_nm[idx]].append(ranked_docs)

In [22]:
pd.DataFrame.from_dict(q_rst)

Unnamed: 0,Binary,TF,TF-IDF,BM25(k=1.2)
0,[https://brasil.elpais.com/brasil/2019/03/26/i...,[https://brasil.elpais.com/brasil/2019/03/26/i...,[https://brasil.elpais.com/brasil/2019/03/26/i...,[https://brasil.elpais.com/brasil/2019/03/26/i...
1,[https://brasil.elpais.com/brasil/2019/04/02/e...,[https://brasil.elpais.com/brasil/2018/12/28/e...,[https://brasil.elpais.com/brasil/2018/12/28/e...,[https://brasil.elpais.com/brasil/2019/04/02/e...
2,[https://brasil.elpais.com/brasil/2019/04/01/p...,[https://brasil.elpais.com/brasil/2019/03/29/p...,[https://brasil.elpais.com/brasil/2019/03/29/p...,[https://brasil.elpais.com/brasil/2019/03/29/p...
3,[https://brasil.elpais.com/brasil/2019/03/15/o...,[https://brasil.elpais.com/brasil/2019/03/26/i...,[https://brasil.elpais.com/brasil/2019/03/26/i...,[https://brasil.elpais.com/brasil/2019/03/26/i...
4,[https://brasil.elpais.com/brasil/2019/03/29/p...,[https://brasil.elpais.com/brasil/2019/03/28/i...,[https://brasil.elpais.com/brasil/2019/03/28/i...,[https://brasil.elpais.com/brasil/2019/03/28/i...
5,[https://brasil.elpais.com/brasil/2019/04/04/p...,[https://brasil.elpais.com/brasil/2019/03/30/p...,[https://brasil.elpais.com/brasil/2019/03/30/p...,[https://brasil.elpais.com/brasil/2019/03/30/p...
6,[https://brasil.elpais.com/brasil/2019/03/20/p...,[https://brasil.elpais.com/brasil/2019/04/03/o...,[https://brasil.elpais.com/brasil/2019/04/03/o...,[https://brasil.elpais.com/brasil/2019/04/03/o...
7,[https://brasil.elpais.com/brasil/2019/04/03/p...,[https://brasil.elpais.com/brasil/2019/04/03/p...,[https://brasil.elpais.com/brasil/2019/03/21/e...,[https://brasil.elpais.com/brasil/2019/04/03/p...
8,[https://brasil.elpais.com/brasil/2019/04/03/p...,[https://brasil.elpais.com/brasil/2019/03/24/p...,[https://brasil.elpais.com/brasil/2019/03/24/p...,[https://brasil.elpais.com/brasil/2019/03/24/p...
9,[https://brasil.elpais.com/brasil/2019/03/18/d...,[https://brasil.elpais.com/brasil/2019/03/28/p...,[https://brasil.elpais.com/brasil/2019/03/28/p...,[https://brasil.elpais.com/brasil/2019/04/03/p...


### Mean Average Precision 

In [23]:
def avg_prc(gnd_truth, ranked_docs):
    avg_prc = 0
    seen = 0
    for idx, doc in enumerate(ranked_docs,1):
        if doc in gnd_truth:
            seen += 1
            avg_prc += seen/idx

    avg_prc = avg_prc / len(gnd_truth)
    return avg_prc

In [24]:
results = {}
for strategy in strat_nm:
    scores = []
    for i in range(0,10):
        ranked_docs = q_rst[strategy][i]
        score = avg_prc(gnd_truth[i], ranked_docs)
        scores.append(score)
    results[strategy] = scores

In [25]:
display(Markdown("##### Average precision scores by strategy"))
pd.DataFrame.from_dict(results)

##### Average precision scores by strategy

Unnamed: 0,Binary,TF,TF-IDF,BM25(k=1.2)
0,0.0,0.0,0.22619,0.416667
1,0.5,0.25,0.25,0.5
2,0.0,0.0,0.0,0.05
3,0.0,0.0,0.0,0.0
4,0.833333,0.5,0.5,0.5
5,0.047619,0.111111,0.185185,0.206349
6,0.0,0.0,0.0,0.0
7,0.1,0.055556,0.0,0.208333
8,0.325,0.583333,0.583333,0.583333
9,0.0,0.0,0.0,0.0


In [26]:
display(Markdown("### Mean Average Precision (MAP) by strategy"))
for strategy, score in dict((k, mean(v)) for k, v in results.items()).items():
    display(Markdown("* `{}`: {}".format(strategy, score)))

### Mean Average Precision (MAP) by strategy

* `Binary`: 0.1805952380952381

* `TF`: 0.15

* `TF-IDF`: 0.17447089947089947

* `BM25(k=1.2)`: 0.24646825396825395

## Discounted cumulative gain (DCG)

In [27]:
def dcg_score(relvt_docs, retrv_docs):
    score =  relvt_docs[retrv_docs[0]] if retrv_docs[0] in relvt_docs else 0

    for idx, doc in enumerate(retrv_docs[1:],2):
        if doc in relvt_docs:
            score += relvt_docs[doc] / log(idx,2)
    
    return score

In [28]:
results = {}
for strategy in strat_nm:
    scores = []
    for i in range(0,10):
        ranked_docs = q_rst[strategy][i]
        score = dcg_score(gnd_rel[i], ranked_docs)
        scores.append(score)
    results[strategy] = scores

In [29]:
display(Markdown("##### Discounted cumulative gain by strategy"))
pd.DataFrame.from_dict(results)

##### Discounted cumulative gain by strategy

Unnamed: 0,Binary,TF,TF-IDF,BM25(k=1.2)
0,0.0,0.0,4.366423,6.392789
1,9.0,9.0,9.0,9.0
2,0.0,0.0,0.0,2.40824
3,0.0,0.0,0.0,0.0
4,12.678368,11.5,11.5,11.5
5,1.781036,3.154649,5.047438,5.291892
6,0.0,0.0,0.0,0.0
7,3.014736,2.208254,0.0,5.428156
8,5.653383,9.416508,9.416508,9.416508
9,0.0,0.0,0.0,0.0


## Ideal Discounted cumulative gain (IDCG) 

In [73]:
def ideal_dcg_score(relvt_docs, retrv_docs):
    sorted_scores = sorted([relvt_docs[w] for w in relvt_docs if w in retrv_docs],reverse=True)

    score =  sorted_scores[0] if len(sorted_scores) > 0 else 0

    for idx, relvc in enumerate(sorted_scores[1:],2):
        score += relvc / log(idx,2)

    return score

In [74]:
results = {}
for strategy in strat_nm:
    scores = []
    for i in range(0,10):
        ranked_docs = q_rst[strategy][i]
        score = ideal_dcg_score(gnd_rel[i], ranked_docs)
        scores.append(score)
    results[strategy] = scores

In [75]:
display(Markdown("##### Ideal discounted cumulative gain by strategy"))
pd.DataFrame.from_dict(results)

##### Ideal discounted cumulative gain by strategy

Unnamed: 0,Binary,TF,TF-IDF,BM25(k=1.2)
0,0.0,0.0,12.0,12.0
1,9.0,9.0,9.0,9.0
2,0.0,0.0,0.0,8.0
3,0.0,0.0,0.0,0.0
4,16.0,16.0,16.0,16.0
5,5.0,5.0,11.0,11.0
6,0.0,0.0,0.0,0.0
7,7.0,7.0,0.0,15.0
8,12.0,12.0,12.0,12.0
9,0.0,0.0,0.0,0.0
