In [None]:
from IPython.display import HTML

# Render a button that shows/hides the notebook's code input cells.
# The embedded script uses the `$` (jQuery) global to select `div.input`
# elements, and registers code_toggle to run on document ready; since
# code_show starts as true, that first call hides the inputs.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Vector Model

In [1]:
from ast import literal_eval
from math import log

from IPython.display import Markdown, display, HTML
import pandas as pd

## Load Data

In [2]:
# Read the inverted index CSV and key it by word.
inv_index = (
    pd.read_csv("../output/inverted_index.csv")
    .set_index(['word'])
    .drop(['vivoÉ', 'ônibusFoi'])  # remove nltk tokenization mistakes
)
# The postings column is stored as text; parse each cell back into a Python
# list of (doc_id, freq) tuples.
inv_index["doc_id:freq"] = inv_index["doc_id:freq"].apply(literal_eval)
display(Markdown("## Inverted Index"))

inv_index.head(5)

## Inverted Index

Unnamed: 0_level_0,doc_id:freq
word,Unnamed: 1_level_1
juíza,"[(0, 2), (1, 1)]"
federal,"[(0, 1), (1, 1), (2, 1), (6, 2), (14, 1), (36,..."
Ivani,"[(0, 1), (1, 1)]"
Silva,"[(0, 3), (1, 1), (5, 1), (13, 2), (25, 1), (72..."
Luz,"[(0, 3), (1, 1), (124, 1)]"


## Adding Inverse Document Frequency (IDF)

In [3]:
# Number of documents in the collection: count the distinct doc ids that
# appear in any postings list. A single set comprehension is used instead of
# summing the per-row lists (which concatenates lists over and over), and
# n_docs is only ever bound to the final integer.
n_docs = len({
    posting[0]
    for postings in inv_index["doc_id:freq"]
    for posting in postings
})
display(Markdown("* Let's employ the number  of documents in this particular \
                  collection (N={}) in our calculations".format(n_docs)))

* Let's employ the number  of documents in this particular                   collection (N=249) in our calculations

In [5]:
# IDF of a word = ln((N + 1) / df), where df is the length of the word's
# postings list (one entry per document containing the word).
inv_index["IDF"] = inv_index["doc_id:freq"].map(
    lambda postings: log((n_docs + 1) / len(postings))
)
inv_index.sample(5)

Unnamed: 0_level_0,doc_id:freq,IDF
word,Unnamed: 1_level_1,Unnamed: 2_level_1
discutem,"[(93, 1)]",5.521461
gastos,"[(10, 1), (27, 2), (36, 7), (46, 1), (114, 1),...",3.123566
digamos,"[(12, 1), (24, 2)]",4.828314
projetou,"[(143, 1)]",5.521461
extraditado,"[(62, 1), (72, 1)]",4.828314


## Strategy Implementations

### Binary Vector Model

In [10]:
# Sample query terms.
query = [
    "1964",
    "golpe",
    "ditadura",
]

def binary_vector_model(index, query, doc_id):
    """Score a document by how many distinct query terms it contains.

    Args:
        index: DataFrame indexed by word, with a "doc_id:freq" column holding
            for each word a list of (doc_id, frequency) tuples.
        query: Iterable of query terms; repeated terms are counted once.
        doc_id: Identifier of the document to score.

    Returns:
        int: Number of distinct query terms whose postings list contains
        ``doc_id``. Terms that are not in ``index`` contribute 0.
    """
    # De-duplicate into a list: recent pandas versions reject a set as a
    # ``.loc`` indexer. Unknown terms are skipped instead of raising KeyError.
    terms = [term for term in dict.fromkeys(query) if term in index.index]
    # Look terms up in the ``index`` argument (not a notebook-level global),
    # so the function scores against whatever index the caller supplies.
    postings = index.loc[terms, "doc_id:freq"]
    return sum(
        1
        for entries in postings
        if any(entry[0] == doc_id for entry in entries)
    )

def term_frequency(index, query, doc_id):
    """Score a document as the dot product of query and document term counts.

    Args:
        index: DataFrame indexed by word, with a "doc_id:freq" column holding
            for each word a list of (doc_id, frequency) tuples.
        query: Iterable of query terms; a repeated term weighs more.
        doc_id: Identifier of the document to score.

    Returns:
        Sum over distinct query terms of (occurrences in the query) *
        (frequency of the term in ``doc_id``). A term that does not occur in
        the document, or is not in ``index`` at all, contributes 0.
    """
    # Frequency of each term in the query.
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    # Select with a list (pandas rejects set indexers) and skip terms that are
    # not in the vocabulary instead of raising KeyError.
    known_terms = [term for term in query_fd if term in index.index]
    postings = index.loc[known_terms, "doc_id:freq"]

    score = 0
    for term, entries in postings.items():
        # Frequency of the term in this document; 0 when the document is not
        # in the postings list (indexing the empty match would raise IndexError).
        doc_freq = next((entry[1] for entry in entries if entry[0] == doc_id), 0)
        score += query_fd[term] * doc_freq

    return score

def tf_idf(index, query, doc_id):
    """Score a document against a query using TF-IDF weighting.

    Args:
        index: DataFrame indexed by word, with a "doc_id:freq" column holding
            for each word a list of (doc_id, frequency) tuples, and an "IDF"
            column holding the word's inverse document frequency.
        query: Iterable of query terms; a repeated term weighs more.
        doc_id: Identifier of the document to score.

    Returns:
        Sum over distinct query terms of (occurrences in the query) *
        (frequency of the term in ``doc_id``) * (IDF of the term). A term that
        does not occur in the document, or is not in ``index``, contributes 0.
    """
    # Frequency of each term in the query.
    query_fd = {}
    for term in query:
        query_fd[term] = query_fd.get(term, 0) + 1

    # Select with a list (pandas rejects set indexers) and skip terms that are
    # not in the vocabulary instead of raising KeyError.
    known_terms = [term for term in query_fd if term in index.index]
    rows = index.loc[known_terms, ["doc_id:freq", "IDF"]]

    # Read the values directly instead of assigning columns onto the selected
    # slice, which triggers SettingWithCopyWarning.
    score = 0
    for term, entries, idf in zip(rows.index, rows["doc_id:freq"], rows["IDF"]):
        doc_freq = next((entry[1] for entry in entries if entry[0] == doc_id), 0)
        score += query_fd[term] * doc_freq * idf

    return score

In [11]:
# Score document 1 against a sample query with TF-IDF weighting.
query = [
    "1964",
    "golpe",
    "ditadura",
    "Bolsonaro",
    "militar",
]

tf_idf(inv_index, query, doc_id=1)

7.870424748852292