In [1]:
from IPython.display import HTML

# Render a small HTML/JS snippet as this cell's output: a button that toggles
# the visibility of the `div.input` elements (the raw code cells). The script
# registers `code_toggle` to run once the document is ready, which hides the
# code because `code_show` starts out true.
# NOTE(review): the script relies on `$` (presumably jQuery) being available
# in the page that renders the notebook — confirm for the target front end.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Vector Model

In [2]:
from ast import literal_eval
from math import log

from IPython.display import Markdown, display, HTML
import pandas as pd

## Load Data

In [3]:
# Read the inverted index keyed by word and discard two tokens that are
# NLTK tokenization mistakes.
inv_index = (
    pd.read_csv("../output/inverted_index.csv")
    .set_index(['word'])
    .drop(['vivoÉ', 'ônibusFoi'])
)
# The posting lists are stored as text in the CSV; parse each one back into
# a Python list of (doc_id, freq) tuples.
inv_index["doc_id:freq"] = inv_index["doc_id:freq"].apply(literal_eval)
display(Markdown("## Inverted Index"))

inv_index.head(5)

## Inverted Index

Unnamed: 0_level_0,doc_id:freq
word,Unnamed: 1_level_1
juíza,"[(0, 2), (1, 1)]"
federal,"[(0, 1), (1, 1), (2, 1), (6, 2), (14, 1), (36,..."
Ivani,"[(0, 1), (1, 1)]"
Silva,"[(0, 3), (1, 1), (5, 1), (13, 2), (25, 1), (72..."
Luz,"[(0, 3), (1, 1), (124, 1)]"


## Adding Inverse Document Frequency (IDF)

In [4]:
# Number of documents in the collection: count the distinct doc ids that occur
# in any posting list. A single set comprehension replaces Series.sum(), which
# concatenates the per-word lists one at a time, and keeps `n_docs` an int
# from the start instead of rebinding it from a list to a number.
n_docs = len({entry[0] for postings in inv_index["doc_id:freq"] for entry in postings})
display(Markdown("* Let's employ the number  of documents in this particular \
                  collection (N={}) in our calculations".format(n_docs)))

* Let's employ the number  of documents in this particular                   collection (N=249) in our calculations

In [5]:
# IDF per word: log((N + 1) / number of entries in the word's posting list),
# with N being the collection size computed above.
inv_index["IDF"] = [
    log((n_docs + 1) / len(postings)) for postings in inv_index["doc_id:freq"]
]
# Show a few random rows to eyeball the new column.
inv_index.sample(5)

Unnamed: 0_level_0,doc_id:freq,IDF
word,Unnamed: 1_level_1,Unnamed: 2_level_1
eventualmente,"[(110, 1)]",5.521461
levá-lo,"[(87, 1)]",5.521461
federados,"[(101, 1)]",5.521461
disparam,"[(30, 1)]",5.521461
Beto,"[(68, 1), (102, 1)]",4.828314


## Strategy Implementations

In [99]:
# Sample query terms.
# NOTE(review): the scoring functions below read `query` from their keyword
# arguments, so this module-level list looks unused in the visible cells.
query = ["1964","golpe","ditadura"]

def binary_vm(**kwargs):
    """Binary vector-model score: how many distinct query terms occur in a document.

    Keyword Args:
        index: DataFrame indexed by word with a "doc_id:freq" column holding
            lists of (doc_id, freq) tuples.
        query: iterable of query terms; repeated terms count once.
        doc_id: id of the document to score.

    Returns:
        The number of distinct query terms whose posting list mentions
        ``doc_id``. Terms that are absent from ``index`` contribute 0.
    """
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]

    # De-duplicate while keeping a list: pandas no longer accepts a set as a
    # .loc indexer. Skipping unknown terms avoids a KeyError for
    # out-of-vocabulary words.
    terms = [term for term in dict.fromkeys(query) if term in index.index]
    if not terms:
        return 0

    # Score against the index that was passed in, not the notebook-level global.
    postings = index.loc[terms, "doc_id:freq"]
    hits = postings.apply(lambda entries: 1 if any(w[0] == doc_id for w in entries) else 0)

    return hits.sum()

def tf_vm(**kwargs):
    """Term-frequency vector-model score of a document for a query.

    Keyword Args:
        index: DataFrame indexed by word with a "doc_id:freq" column holding
            lists of (doc_id, freq) tuples.
        query: iterable of query terms; repetitions raise the term's weight.
        doc_id: id of the document to score.

    Returns:
        Sum over distinct query terms of
        (occurrences in the query) * (frequency of the term in ``doc_id``).
        Terms that are absent from ``index`` contribute 0.
    """
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]

    # Calculate frequency in query
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    # Keep only terms the index knows. A list is used because pandas no longer
    # accepts a set as a .loc indexer, and unknown labels would raise KeyError.
    terms = [term for term in query_fd if term in index.index]
    if not terms:
        return 0

    # Get document frequency from inverted index: the freq of the first
    # posting that matches doc_id, or 0 when the document lacks the term.
    term_f_in_doc = index.loc[terms, "doc_id:freq"].apply(
        lambda entries: next((w[1] for w in entries if w[0] == doc_id), 0)
    )

    # Combine doc freq and query frequency to make score
    score = 0
    for term in terms:
        score += query_fd[term] * term_f_in_doc.xs(term)

    return score

def tf_idf_vm(**kwargs):
    """TF-IDF vector-model score of a document for a query.

    Keyword Args:
        index: DataFrame indexed by word with a "doc_id:freq" column (lists of
            (doc_id, freq) tuples) and an "IDF" column.
        query: iterable of query terms; repetitions raise the term's weight.
        doc_id: id of the document to score.

    Returns:
        Sum over distinct query terms of
        (occurrences in the query) * (frequency in ``doc_id``) * (IDF of the term).
        Terms that are absent from ``index`` contribute 0.
    """
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]

    # Calculate frequency in query
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    # Keep only terms the index knows. A list is used because pandas no longer
    # accepts a set as a .loc indexer, and unknown labels would raise KeyError.
    terms = [term for term in query_fd if term in index.index]
    if not terms:
        return 0

    # Derive new Series instead of assigning columns on a .loc slice, which
    # triggers SettingWithCopyWarning.
    doc_tf = index.loc[terms, "doc_id:freq"].apply(
        lambda entries: next((w[1] for w in entries if w[0] == doc_id), 0)
    )
    idf = index.loc[terms, "IDF"]

    # Combine doc freq, query frequency and IDF to make score
    score = 0
    for term in terms:
        score += query_fd[term] * doc_tf.xs(term) * idf.xs(term)

    return score

def bm25_vm(**kwargs):
    """BM25-style score of a document for a query (saturating term frequency).

    Keyword Args:
        index: DataFrame with a "doc_id:freq" column (lists of (doc_id, freq)
            tuples) and an "IDF" column, either indexed by word or carrying a
            "word" column.
        query: iterable of query terms; repetitions raise the term's weight.
        doc_id: id of the document to score.
        k: saturation parameter; the per-term factor is
            ``count * (k + 1) / (count + k)`` and 0 when the count is 0.

    Returns:
        Sum over distinct query terms of
        (occurrences in the query) * (saturated term frequency) * (IDF).
        Terms that are absent from ``index`` contribute 0.
    """
    index = kwargs["index"]
    query = kwargs["query"]
    doc_id = kwargs["doc_id"]
    k = kwargs["k"]

    # Calculate frequency in query
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    # Re-key by word only when needed; doing it unconditionally copies the
    # whole index on every call.
    by_word = index if index.index.name == "word" else index.reset_index().set_index("word")

    # Keep only terms the index knows. A list is used because pandas no longer
    # accepts a set as a .loc indexer, and unknown labels would raise KeyError.
    terms = [term for term in query_fd if term in by_word.index]
    if not terms:
        return 0

    # Derive new Series instead of assigning columns on a .loc slice.
    doc_tf = by_word.loc[terms, "doc_id:freq"].apply(
        lambda entries: next((w[1] for w in entries if w[0] == doc_id), 0)
    )
    idf = by_word.loc[terms, "IDF"]

    # Combine doc freq, query frequency and IDF to make score
    score = 0
    for term in terms:
        doc_count = doc_tf.xs(term)
        # The explicit zero guard also avoids 0/0 when k == 0.
        doc_score = 0 if doc_count == 0 else (doc_count * (k + 1)) / (doc_count + k)
        score += query_fd[term] * doc_score * idf.xs(term)

    return score

## Strategy Wrapper

In [86]:
from bisect import insort_left

# Example query terms and a result-list size.
# NOTE(review): the module-level `n` looks unused in the visible cells —
# get_n_best_docs takes `n` from its keyword arguments.
query1 = ["1964","golpe","ditadura","Bolsonaro","militar"]
n = 10

def get_n_best_docs(**kwargs):
    """Score documents with a metric and keep the ``n`` best ones.

    Keyword Args:
        n: maximum number of results; a value <= 0 yields an empty list.
        metric: callable invoked as ``metric(doc_id=doc, **remaining_kwargs)``
            that returns a comparable score.
        docs: optional iterable of document ids to rank. When omitted, every
            doc id found in the "doc_id:freq" posting lists of ``index`` is
            ranked (falling back to the notebook-level ``inv_index`` when no
            ``index`` keyword was given).
        Any other keyword (e.g. ``index``, ``query``, ``k``) is forwarded
        unchanged to ``metric``.

    Returns:
        A list of ``(score, doc_id)`` tuples in ascending order, so the best
        match is the last element. Ties on score are ordered by doc id.
    """
    from heapq import nlargest

    n = kwargs.pop("n")
    metric = kwargs.pop("metric")

    if "docs" in kwargs:
        docs = kwargs.pop("docs")
    else:
        # Prefer the index the caller passed so the ranked ids match the index
        # that the metric scores against.
        source_index = kwargs["index"] if "index" in kwargs else inv_index
        # A set comprehension avoids the repeated list concatenation that
        # Series.sum() performs on list values.
        docs = list({entry[0] for postings in source_index["doc_id:freq"] for entry in postings})

    scored = ((metric(doc_id=doc, **kwargs), doc) for doc in docs)

    # nlargest keeps at most n items and returns [] for n <= 0; the previous
    # `rank[-n:]` slice returned the whole ranking when n == 0.
    return sorted(nlargest(n, scored))

In [90]:
# Collect every distinct doc id mentioned in the index. A set comprehension
# replaces Series.sum(), which concatenates the per-word lists one at a time.
all_docs = list({entry[0] for postings in inv_index["doc_id:freq"] for entry in postings})
all_docs[1:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [100]:
# Quick check of the wrapper: the ten best documents for query1 under BM25 with k=0.
get_n_best_docs(
    metric=bm25_vm,
    query=query1,
    index=inv_index,
    docs=all_docs,
    k=0,
    n=10,
)

[(10.225260532190216, 0),
 (10.225260532190216, 24),
 (10.225260532190216, 150),
 (10.225260532190216, 164),
 (10.225260532190216, 165),
 (10.225260532190216, 171),
 (10.225260532190216, 207),
 (10.225260532190216, 215),
 (10.225260532190216, 222),
 (10.225260532190216, 229)]

In [None]:
# Evaluation queries.
# NOTE(review): query1 repeats the list defined in the strategy-wrapper cell,
# and query2/query3 are not used by any of the visible cells.
query1 = ["1964","golpe","ditadura","Bolsonaro","militar"]
query2 = ["dólar","China","economia"]
query3 = ["tragédia","morte","coragem"]

In [101]:
# One table column per ranking strategy; BM25 appears once per k value.
metrics = [binary_vm, tf_vm, tf_idf_vm, bm25_vm, bm25_vm, bm25_vm, bm25_vm]
columns = ["Binary","TF","TF-IDF","BM25(k=0)","BM25(k=1)","BM25(k=10)","BM25(k=100)"]
# k is only read by bm25_vm; the other metrics ignore the extra keyword.
k_values = [None, None, None, 0, 1, 10, 100]
rows = []

for metric_fn, k_value in zip(metrics, k_values):
    ranking = get_n_best_docs(
        n=10,
        k=k_value,
        docs=all_docs,
        index=inv_index,
        query=query1,
        metric=metric_fn,
    )

    # The wrapper returns ascending (score, doc) pairs: flip to best-first
    # and keep just the doc ids.
    top_10 = [entry[1] for entry in reversed(ranking)]

    rows.append(top_10)

display(Markdown("### Top 10 best docs by metric for query1"))
# Transpose so every strategy becomes a column and every rank position a row.
display(HTML(pd.DataFrame(list(zip(*rows)), columns=columns).to_html(index=False)))

### Top 10 best docs by metric for query1

Binary,TF,TF-IDF,BM25(k=0),BM25(k=1),BM25(k=10),BM25(k=100)
229,150,165,229,24,24,24
222,165,24,222,207,165,165
215,24,150,215,165,207,150
207,206,207,207,0,215,207
171,207,215,171,215,2,215
165,215,6,165,164,164,2
164,18,2,164,171,150,6
150,6,206,150,2,0,164
24,164,164,24,222,171,237
0,2,237,0,150,222,171


In [60]:
# Load the results table and preview its first rows.
source = pd.read_csv("../output/results.csv")
source.head(5)

Unnamed: 0,title,subtitle,author,date,section,text,url
0,“A sociedade foi Rubens Paiva não os facínora...,A decisão da juíza que proíbe as Forças Armada...,F. M.,30/03/2019 00:11:08,Brasil,A juíza federal Ivani Silva da Luz de Brasíli...,https://brasil.elpais.com/brasil/2019/03/26/po...
1,Justiça suspende decisão que proibia Forças Ar...,Liminar havia sido concedida na sexta-feira a ...,Marina Rossi,30/03/2019 16:17:59,Brasil,Menos de 24 horas depois de a juíza federal Iv...,https://brasil.elpais.com/brasil/2019/03/30/po...
2,Governo Bolsonaro prega “negacionismo históric...,Marcos Napolitano professor da USP diz que o...,Regiane Oliveira,04/04/2019 22:37:48,Brasil,Quando determinou que de 31 de março 1964 u...,https://brasil.elpais.com/brasil/2019/04/05/po...
3,Quando os pais de Gabo perceberam que tinham u...,Gustavo Tatis percorre o universo de García Má...,Jesús Ruiz Mantilla,07/03/2019 16:38:56,Cultura,Quando era pequeno Luisa e Gabriel se preo...,https://brasil.elpais.com/brasil/2019/03/06/cu...
4,Rádios canadenses banem músicas de Michael Jac...,Quebec Cogeco Media toma a decisão após queixa...,Jaime Porras Ferreyra,07/03/2019 16:12:37,Cultura,Desde a manhã da última segunda-feira e ...,https://brasil.elpais.com/brasil/2019/03/06/cu...
