In [1]:
from IPython.display import HTML

# Render a button that hides or shows the notebook's input cells.
# The embedded script keeps a `code_show` flag, toggles every `div.input`
# element through the page's `$` global, and runs once on document ready so
# the inputs start hidden.
# NOTE(review): this relies on the front end exposing `$` and using the
# `div.input` class for code cells — confirm it works in the target viewer.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Vector Model

In [2]:
from ast import literal_eval
from math import log

from IPython.display import Markdown, display, HTML
import pandas as pd

## Load Data

In [3]:
# Read the inverted index keyed by word and discard two entries that came
# from a tokenizer error.
inv_index = (
    pd.read_csv("../output/inverted_index.csv")
    .set_index(["word"])
    .drop(["vivoÉ", "ônibusFoi"])  # remove nltk tokenization mistake
)
# The postings column is stored as text in the CSV; parse each cell back into
# a Python list of (doc_id, freq) tuples.
inv_index["doc_id:freq"] = inv_index["doc_id:freq"].apply(literal_eval)
display(Markdown("## Inverted Index"))

inv_index.head(5)

## Inverted Index

Unnamed: 0_level_0,doc_id:freq
word,Unnamed: 1_level_1
juíza,"[(0, 2), (1, 1)]"
federal,"[(0, 1), (1, 1), (2, 1), (6, 2), (14, 1), (36,..."
Ivani,"[(0, 1), (1, 1)]"
Silva,"[(0, 3), (1, 1), (5, 1), (13, 2), (25, 1), (72..."
Luz,"[(0, 3), (1, 1), (124, 1)]"


## Adding Inverse Document Frequency  (IDF)

In [4]:
# number of documents in the collection
# Collection size N: the number of distinct document ids that appear in any
# postings list of the inverted index.
n_docs = len(
    {entry[0] for postings in inv_index["doc_id:freq"] for entry in postings}
)
display(Markdown("* Let's employ the number  of documents in this particular \
                  collection (N={}) in our calculations".format(n_docs)))

* Let's employ the number  of documents in this particular                   collection (N=249) in our calculations

In [5]:
# Smoothed inverse document frequency per word: ln((N + 1) / len(postings)),
# where `postings` is the word's list of (doc_id, freq) entries.
inv_index["IDF"] = inv_index["doc_id:freq"].map(
    lambda postings: log((n_docs + 1) / len(postings))
)
inv_index.sample(5)

Unnamed: 0_level_0,doc_id:freq,IDF
word,Unnamed: 1_level_1,Unnamed: 2_level_1
eventualmente,"[(110, 1)]",5.521461
levá-lo,"[(87, 1)]",5.521461
federados,"[(101, 1)]",5.521461
disparam,"[(30, 1)]",5.521461
Beto,"[(68, 1), (102, 1)]",4.828314


## Strategy Implementations

### Binary, Term-Frequency, TF-IDF and BM25 Vector Models

In [75]:
# Three-term sample query; this is the `query` passed to the scoring
# functions in the smoke-test cell further down.
query = ["1964","golpe","ditadura"]

def binary_vm(index, query, doc_id):
    """Score a document with the binary vector model.

    The score is the number of distinct query terms that occur at least once
    in the document.

    Parameters
    ----------
    index : pandas.DataFrame
        Inverted index whose row labels are words and whose "doc_id:freq"
        column holds, per word, a list of (doc_id, freq) entries.
        Assumes each word labels a single row.
    query : iterable of str
        Query terms; repeated terms are counted once.
    doc_id : int
        Identifier of the document to score.

    Returns
    -------
    int
        Count of distinct query terms present in the document. Terms missing
        from the index contribute 0.
    """
    vocabulary = index.index
    matches = 0
    # dict.fromkeys de-duplicates the terms while keeping their order.
    for term in dict.fromkeys(query):
        if term not in vocabulary:
            # Out-of-vocabulary term: it cannot occur in any document.
            continue
        postings = index.at[term, "doc_id:freq"]
        if any(entry[0] == doc_id for entry in postings):
            matches += 1
    return matches

def term_frequency_vm(index, query, doc_id):
    """Score a document with the raw term-frequency vector model.

    The score is the sum, over distinct query terms, of
    (occurrences of the term in the query) * (occurrences in the document).

    Parameters
    ----------
    index : pandas.DataFrame
        Inverted index whose row labels are words and whose "doc_id:freq"
        column holds, per word, a list of (doc_id, freq) entries.
        Assumes each word labels a single row.
    query : iterable of str
        Query terms; repetitions increase the term's weight.
    doc_id : int
        Identifier of the document to score.

    Returns
    -------
    int
        The dot product of query and document term frequencies. Terms that
        are missing from the index or from the document contribute 0.
    """
    # Calculate frequency in query
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    vocabulary = index.index
    score = 0
    for term, query_freq in query_fd.items():
        if term not in vocabulary:
            continue
        postings = index.at[term, "doc_id:freq"]
        # Frequency of the term in this document; 0 when the document does not
        # contain it, matching how tf_idf_vm and bm25_vm treat absent terms.
        doc_freq = next((freq for doc, freq in postings if doc == doc_id), 0)
        score += query_freq * doc_freq

    return score

def tf_idf_vm(index, query, doc_id):
    """Score a document with the TF-IDF vector model.

    The score is the sum, over distinct query terms, of
    (query frequency) * (document frequency) * (the term's IDF).

    Parameters
    ----------
    index : pandas.DataFrame
        Inverted index whose row labels are words, with a "doc_id:freq"
        column (list of (doc_id, freq) entries per word) and an "IDF" column.
        Assumes each word labels a single row.
    query : iterable of str
        Query terms; repetitions increase the term's weight.
    doc_id : int
        Identifier of the document to score.

    Returns
    -------
    float
        The TF-IDF score (the integer 0 when no term contributes). Terms
        missing from the index or from the document contribute 0.
    """
    # Calculate frequency in query
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    # Look each term up directly rather than slicing the frame and assigning
    # into the slice, which avoids pandas' set-indexer and chained-assignment
    # pitfalls and leaves `index` untouched.
    vocabulary = index.index
    score = 0
    for term, query_freq in query_fd.items():
        if term not in vocabulary:
            continue
        postings = index.at[term, "doc_id:freq"]
        doc_freq = next((freq for doc, freq in postings if doc == doc_id), 0)
        score += query_freq * doc_freq * index.at[term, "IDF"]

    return score

def bm25_vm(index, query, doc_id, k=1):
    """Score a document with a BM25-style saturated term-frequency model.

    Each distinct query term contributes
    (query frequency) * (tf * (k + 1) / (tf + k)) * IDF,
    where tf is the term's frequency in the document.

    Parameters
    ----------
    index : pandas.DataFrame
        Inverted index with a "doc_id:freq" column (list of (doc_id, freq)
        entries per word) and an "IDF" column. Words may be the row labels
        (index named "word") or a regular "word" column.
        Assumes each word appears once.
    query : iterable of str
        Query terms; repetitions increase the term's weight.
    doc_id : int
        Identifier of the document to score.
    k : number, optional
        Saturation parameter of the term-frequency transform (default 1).

    Returns
    -------
    float
        The BM25 score (the integer 0 when no term contributes). Terms
        missing from the index or from the document contribute 0.
    """
    # Calculate frequency in query
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    # Only re-key the frame when it is not already keyed by word; doing so
    # unconditionally copies the whole index on every call.
    if index.index.name == "word":
        by_word = index
    else:
        by_word = index.reset_index().set_index("word")
    vocabulary = by_word.index

    # Combine doc freq and query frequency to make score
    score = 0
    for term, query_freq in query_fd.items():
        if term not in vocabulary:
            continue
        postings = by_word.at[term, "doc_id:freq"]
        doc_count = next((freq for doc, freq in postings if doc == doc_id), 0)
        if doc_count == 0:
            # Absent term contributes nothing; skipping also avoids 0/0 when k == 0.
            continue
        doc_score = (doc_count * (k + 1)) / (doc_count + k)
        score += query_freq * doc_score * by_word.at[term, "IDF"]

    return score

In [63]:
# Gather every distinct document id mentioned in any postings list.
all_docs = list(
    {entry[0] for postings in inv_index["doc_id:freq"] for entry in postings}
)
all_docs

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [72]:
from bisect import insort_left

# Five-term sample query passed to get_n_best_docs in the ranking call below.
query1 = ["1964","golpe","ditadura","Bolsonaro","militar"]
# NOTE(review): this module-level `n` is not read by any code visible in the
# notebook — get_n_best_docs has its own `n=10` default and the call below
# does not pass `n`. Confirm whether it is still needed.
n = 10

def get_n_best_docs(index, query, metric, docs=None, n=10):
    """Return the `n` highest-scoring documents for a query.

    Parameters
    ----------
    index : pandas.DataFrame
        Inverted index, forwarded unchanged to `metric`. When `docs` is None
        its "doc_id:freq" column (lists of (doc_id, freq) entries) is also
        used to enumerate the documents.
    query : iterable of str
        Query terms, forwarded unchanged to `metric`.
    metric : callable
        Scoring function called as ``metric(index, query, doc_id)``.
    docs : iterable, optional
        Document ids to rank. Defaults to every id found in `index`.
    n : int, optional
        Maximum number of results to keep (default 10). A value of 0 or less
        yields an empty list.

    Returns
    -------
    list of (score, doc_id)
        At most `n` tuples in ascending order, so the best match is last.
    """
    if docs is None:
        # Enumerate documents from the index that was passed in, not from a
        # notebook global, so the function also works on other indexes.
        docs = sorted(
            {entry[0] for postings in index["doc_id:freq"] for entry in postings}
        )

    rank = []
    for doc in docs:
        insort_left(rank, (metric(index, query, doc), doc))
        # Drop the current worst entry once the list exceeds n. Unlike
        # slicing with rank[-n:], this also works when n == 0.
        if len(rank) > n:
            del rank[0]

    return rank

In [76]:
# Rank all documents for query1 with BM25, using the defaults k=1 and n=10.
# The list comes back in ascending score order, so the best match is last.
get_n_best_docs(inv_index,query1,bm25_vm, docs=all_docs)

[(13.691625028629515, 150),
 (14.005126170608131, 222),
 (14.361083266181875, 2),
 (14.518135987616619, 171),
 (15.293126871827551, 164),
 (15.3159671009208, 215),
 (15.401073920284263, 0),
 (16.40301573682297, 165),
 (16.89266271505585, 207),
 (17.84101288898244, 24)]

In [10]:
# Sample queries for trying out the scoring functions.
query1 = ["1964","golpe","ditadura","Bolsonaro","militar"]
query2 = ["dólar","China","economia"]
query3 = ["tragédia","morte","coragem"]

# NOTE(review): the calls below pass `query` (assigned in the strategy cell),
# not query1/query2/query3 defined above — confirm which query was intended.
k = 1  # same value as bm25_vm's default
print(bm25_vm(inv_index,query,0,k))
print(tf_idf_vm(inv_index, query, 1))
print(binary_vm(inv_index, query, 1))

15.401073920284263
7.870424748852292
3


In [60]:
# Load the article table and preview it. Per the output below, the columns
# are title, subtitle, author, date, section, text and url.
# NOTE(review): presumably row positions correspond to the doc ids used in
# the inverted index — verify before joining ranks to articles.
source = pd.read_csv("../output/results.csv")
source.head()

Unnamed: 0,title,subtitle,author,date,section,text,url
0,“A sociedade foi Rubens Paiva não os facínora...,A decisão da juíza que proíbe as Forças Armada...,F. M.,30/03/2019 00:11:08,Brasil,A juíza federal Ivani Silva da Luz de Brasíli...,https://brasil.elpais.com/brasil/2019/03/26/po...
1,Justiça suspende decisão que proibia Forças Ar...,Liminar havia sido concedida na sexta-feira a ...,Marina Rossi,30/03/2019 16:17:59,Brasil,Menos de 24 horas depois de a juíza federal Iv...,https://brasil.elpais.com/brasil/2019/03/30/po...
2,Governo Bolsonaro prega “negacionismo históric...,Marcos Napolitano professor da USP diz que o...,Regiane Oliveira,04/04/2019 22:37:48,Brasil,Quando determinou que de 31 de março 1964 u...,https://brasil.elpais.com/brasil/2019/04/05/po...
3,Quando os pais de Gabo perceberam que tinham u...,Gustavo Tatis percorre o universo de García Má...,Jesús Ruiz Mantilla,07/03/2019 16:38:56,Cultura,Quando era pequeno Luisa e Gabriel se preo...,https://brasil.elpais.com/brasil/2019/03/06/cu...
4,Rádios canadenses banem músicas de Michael Jac...,Quebec Cogeco Media toma a decisão após queixa...,Jaime Porras Ferreyra,07/03/2019 16:12:37,Cultura,Desde a manhã da última segunda-feira e ...,https://brasil.elpais.com/brasil/2019/03/06/cu...
