# Building an index

In [1]:
from IPython.display import HTML

# Inject a small script plus a button into the notebook page. The script hides
# every `div.input` element once the document is ready, and the button flips
# them between hidden and shown on each click.
# NOTE(review): the script relies on a `$` global (presumably jQuery supplied by
# the notebook front end) — confirm it exists where this notebook is rendered.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [2]:
from functools import reduce
from math import log10
import re

#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

import pandas as pd
from numpy import arange
from nltk.util import ngrams    
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize, stem
from IPython.display import Markdown, display, HTML

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# NLTK's Portuguese stopword list, stored as a set so the membership test used
# when filtering tokens is a constant-time lookup.
default_stopwords = set(stopwords.words('portuguese'))

In [3]:
def split_spread_words(words, delim):
    """ Split alphanumeric tokens on a delimiter and flatten the pieces.

    Every token that contains at least one letter or digit is split on
    ``delim`` and its pieces take the token's place in the output, in
    order. Tokens without any letter or digit (e.g. lone punctuation)
    are passed through unchanged. Splitting may produce empty strings
    (leading, trailing or repeated delimiters); they are kept.

    :param list words: list of tokens.
    :param str delim: target delimiter.

    :return: new flat list of tokens; ``words`` itself is not modified.

    :rtype: list
    """

    def _pieces(token):
        # Only tokens carrying a letter or digit are worth splitting.
        has_alnum = any(ch.isalnum() for ch in token)
        return token.split(delim) if has_alnum else [token]

    return [piece for token in words for piece in _pieces(token)]


def melt_merge_zipf_dfs(df_orig, df_stem, data_grain):
    """ Stack the long-form Zipf statistics of the raw and stemmed corpora.

    Each input frame is melted so that its 'ln(r)' and 'ln(pred_r)'
    columns become rows (the source column name goes to "ranking", the
    number to the default "value" column). A "stemming" column tags the
    origin of every row and the two melted frames are concatenated, the
    unstemmed one first. Row labels are not reset by the concatenation.

    :param pandas.core.frame.DataFrame df_orig: zipf df without stemming.
    :param pandas.core.frame.DataFrame df_stem: zipf df with stemming.
    :param str data_grain: name of the n-gram column (e.g bigram).

    :return: resulting df containing both given dfs.

    :rtype: pandas.core.frame.DataFrame
    """
    id_columns = [data_grain, 'Freq.', 'ln(Pr)']
    rank_columns = ['ln(r)', 'ln(pred_r)']

    long_frames = []
    for frame, label in ((df_orig, "no_stemming"), (df_stem, "stemming")):
        melted = frame.melt(id_vars=id_columns, value_vars=rank_columns,
                            var_name="ranking")
        melted["stemming"] = label
        long_frames.append(melted)

    return pd.concat(long_frames)

## Load Data

In [4]:
# Load the articles table. The previewed columns are title, subtitle, author,
# date, section, text and url, plus an "Unnamed: 0" column that appears because
# the CSV carries a saved row index.
data = pd.read_csv("../output/results.csv")
data.head()

Unnamed: 0,title,subtitle,author,date,section,text,url
0,“A sociedade foi Rubens Paiva não os facínora...,A decisão da juíza que proíbe as Forças Armada...,F. M.,30/03/2019 00:11:08,Brasil,A juíza federal Ivani Silva da Luz de Brasíli...,https://brasil.elpais.com/brasil/2019/03/26/po...
1,Justiça suspende decisão que proibia Forças Ar...,Liminar havia sido concedida na sexta-feira a ...,Marina Rossi,30/03/2019 16:17:59,Brasil,Menos de 24 horas depois de a juíza federal Iv...,https://brasil.elpais.com/brasil/2019/03/30/po...
2,Governo Bolsonaro prega “negacionismo históric...,Marcos Napolitano professor da USP diz que o...,Regiane Oliveira,04/04/2019 22:37:48,Brasil,Quando determinou que de 31 de março 1964 u...,https://brasil.elpais.com/brasil/2019/04/05/po...
3,Quando os pais de Gabo perceberam que tinham u...,Gustavo Tatis percorre o universo de García Má...,Jesús Ruiz Mantilla,07/03/2019 16:38:56,Cultura,Quando era pequeno Luisa e Gabriel se preo...,https://brasil.elpais.com/brasil/2019/03/06/cu...
4,Rádios canadenses banem músicas de Michael Jac...,Quebec Cogeco Media toma a decisão após queixa...,Jaime Porras Ferreyra,07/03/2019 16:12:37,Cultura,Desde a manhã da última segunda-feira e ...,https://brasil.elpais.com/brasil/2019/03/06/cu...


### Tokenize and Filter text

In [5]:
def tokenize_and_filter(text):
    """ Turn one article body into its list of index terms.

    The steps run in a fixed order because later ones clean up after
    earlier ones (splitting can create empty or punctuation-only pieces).

    :param str text: raw article text.

    :return: filtered tokens, original casing preserved.

    :rtype: list
    """
    # The corpus is Portuguese, so use the Portuguese Punkt sentence model;
    # word_tokenize otherwise defaults to the English one.
    tokens = word_tokenize(text, language='portuguese')

    # Keep only tokens with at least one letter or digit (drops pure punctuation)
    tokens = [tok for tok in tokens if any(c.isalnum() for c in tok)]

    # Remove a single hyphen at the end of a word
    tokens = [tok[:-1] if tok.endswith('-') else tok for tok in tokens]

    # Split words joined by an en dash or an em dash
    for dash in ('–', '—'):
        tokens = [part for tok in tokens for part in tok.split(dash)]

    # Split alphanumeric tokens joined by a dot
    tokens = split_spread_words(tokens, '.')

    # Remove the empty / punctuation-only pieces left behind by the splits
    tokens = [tok for tok in tokens if any(c.isalnum() for c in tok)]

    # Remove stopwords (the comparison is case-insensitive)
    return [tok for tok in tokens if tok.lower() not in default_stopwords]


# Derived only from the untouched "text" column, so re-running this cell
# always yields the same "words" column.
data["words"] = data["text"].apply(tokenize_and_filter)

In [6]:
# Preview the frame again to inspect the new "words" column of filtered tokens.
data.head()

Unnamed: 0,title,subtitle,author,date,section,text,url,words
0,“A sociedade foi Rubens Paiva não os facínora...,A decisão da juíza que proíbe as Forças Armada...,F. M.,30/03/2019 00:11:08,Brasil,A juíza federal Ivani Silva da Luz de Brasíli...,https://brasil.elpais.com/brasil/2019/03/26/po...,"[juíza, federal, Ivani, Silva, Luz, Brasília, ..."
1,Justiça suspende decisão que proibia Forças Ar...,Liminar havia sido concedida na sexta-feira a ...,Marina Rossi,30/03/2019 16:17:59,Brasil,Menos de 24 horas depois de a juíza federal Iv...,https://brasil.elpais.com/brasil/2019/03/30/po...,"[Menos, 24, horas, juíza, federal, Ivani, Silv..."
2,Governo Bolsonaro prega “negacionismo históric...,Marcos Napolitano professor da USP diz que o...,Regiane Oliveira,04/04/2019 22:37:48,Brasil,Quando determinou que de 31 de março 1964 u...,https://brasil.elpais.com/brasil/2019/04/05/po...,"[determinou, 31, março, 1964, estratégia, polê..."
3,Quando os pais de Gabo perceberam que tinham u...,Gustavo Tatis percorre o universo de García Má...,Jesús Ruiz Mantilla,07/03/2019 16:38:56,Cultura,Quando era pequeno Luisa e Gabriel se preo...,https://brasil.elpais.com/brasil/2019/03/06/cu...,"[pequeno, Luisa, Gabriel, preocupavam, menino,..."
4,Rádios canadenses banem músicas de Michael Jac...,Quebec Cogeco Media toma a decisão após queixa...,Jaime Porras Ferreyra,07/03/2019 16:12:37,Cultura,Desde a manhã da última segunda-feira e ...,https://brasil.elpais.com/brasil/2019/03/06/cu...,"[Desde, manhã, última, segunda-feira, sucessos..."


## Inverted Index with Frequency

In [7]:
# Inverted index: word -> list of (doc_id, freq) postings, where doc_id is the
# position of the document in data["words"] and freq is how many times the word
# occurs in that document. Postings are appended in increasing doc_id order.
inverted_index = {}
for doc_id, tokens in enumerate(data["words"]):
    for word, freq in FreqDist(tokens).items():
        inverted_index.setdefault(word, []).append((doc_id, freq))

In [8]:
# One row per indexed word; the second column holds the word's full posting
# list of (doc_id, freq) tuples.
columns = ["word", "doc_id:freq"]
rows = [[word, postings] for word, postings in inverted_index.items()]
index_df = pd.DataFrame(rows, columns=columns)

# Fixed seed so the preview is identical on every Restart & Run All, and a
# capped sample size so a small index does not make sample() raise.
SAMPLE_SEED = 42
preview = index_df.sample(min(15, len(index_df)), random_state=SAMPLE_SEED)

display(Markdown("***"))
display(Markdown("### Resulting Inverted Index"))
display(HTML(preview.to_html(index=False)))
display(Markdown("***"))

***

### Resulting Inverted Index

word,doc_id:freq
exponencial,"[(21, 1)]"
descobrir,"[(12, 1), (81, 1), (150, 3), (176, 1)]"
487,"[(132, 1)]"
chamamos,"[(9, 1), (150, 1)]"
deflação,"[(83, 2)]"
prontidão,"[(75, 1)]"
silencie,"[(32, 1)]"
premiação,"[(222, 1)]"
mackenzista,"[(215, 1)]"
estabelecendo,"[(142, 1), (165, 1)]"


***

In [9]:
# Persist the index without the row labels. The posting lists are written as
# text, so they must be parsed back into lists after reading this file.
index_df.to_csv("../output/inverted_index.csv", index=False)

# Retrieval Approaches

## Document At A Time Retrieval

In [10]:
def retrieve_by_doc(index, query, k):
    """ Document-at-a-time retrieval of the top ``k`` documents.

    Every document that appears anywhere in the index is scored, one
    document at a time, with the summed frequency of the query terms it
    contains (0 when it contains none). Results are ordered by score,
    highest first; ties are broken by ascending doc id.

    :param pandas.core.frame.DataFrame index: inverted index with a "word"
        column and a "doc_id:freq" column of ``(doc_id, freq)`` posting lists.
    :param list query: query terms.
    :param int k: maximum number of results.

    :return: up to ``k`` ``(score, doc_id)`` pairs. Fewer are returned when
        the index holds fewer than ``k`` documents.

    :rtype: list
    """
    import heapq

    # Universe of documents: every doc id found in any posting list.
    all_docs = set()
    for postings in index["doc_id:freq"]:
        all_docs.update(doc_id for doc_id, _ in postings)

    # One {doc_id: freq} lookup per matched index row, so scoring a document
    # costs one dict probe per term instead of a scan over all postings.
    term_lookups = []
    for postings in index.loc[index["word"].isin(query), "doc_id:freq"]:
        lookup = {}
        for doc_id, freq in postings:
            lookup[doc_id] = lookup.get(doc_id, 0) + freq
        term_lookups.append(lookup)

    # Negated scores turn "k largest scores" into "k smallest tuples" while
    # keeping the ascending doc-id tie-break. nsmallest simply returns
    # everything when k exceeds the number of documents.
    negated = (
        (-sum(lookup.get(doc, 0) for lookup in term_lookups), doc)
        for doc in all_docs
    )
    return [(-neg_score, doc) for neg_score, doc in heapq.nsmallest(k, negated)]

In [11]:
# Top 5 documents for a three-term query. Each result is a (score, doc_id)
# pair, where score is the summed frequency of the query terms in the document.
retrieve_by_doc(index_df, ["juíza","outorga","Instagram"], 5)

[(3, 219), (2, 0), (2, 218), (1, 1), (1, 85)]

## Term At A Time Retrieval

In [12]:
def retrieve_by_term(index, query, k):
    """ Term-at-a-time retrieval of the top ``k`` documents.

    The posting list of each query term is processed in turn, adding the
    term's frequency to a per-document accumulator. Only documents that
    contain at least one query term receive a score. Results are ordered
    by score, highest first; ties are broken by ascending doc id.

    :param pandas.core.frame.DataFrame index: inverted index with a "word"
        column and a "doc_id:freq" column of ``(doc_id, freq)`` posting lists.
    :param list query: query terms.
    :param int k: maximum number of results.

    :return: up to ``k`` ``(score, doc_id)`` pairs. Fewer are returned when
        fewer than ``k`` documents match; the list is empty when no query
        term is in the index.

    :rtype: list
    """
    import heapq

    # doc_id -> accumulated score over all matched query terms
    accumulators = {}
    for postings in index.loc[index["word"].isin(query), "doc_id:freq"]:
        for doc_id, freq in postings:
            accumulators[doc_id] = accumulators.get(doc_id, 0) + freq

    # Negated scores turn "k largest scores" into "k smallest tuples" while
    # keeping the ascending doc-id tie-break. nsmallest never blocks when
    # fewer than k documents matched.
    negated = ((-score, doc_id) for doc_id, score in accumulators.items())
    return [(-neg_score, doc_id) for neg_score, doc_id in heapq.nsmallest(k, negated)]

In [13]:
# Same query and k as the document-at-a-time call, for a side-by-side comparison
# of the two approaches. Each result is a (score, doc_id) pair.
retrieve_by_term(index_df, ["juíza","outorga","Instagram"], 5)

[(3, 219), (2, 0), (2, 218), (1, 1), (1, 85)]