In [94]:
import pandas as pd
import nltk

from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [95]:
#Função para carregar o csv, preencher os espaços NaN e formatar o contéudo que ira ser utilizado
def getCSVFormattedAsDF():
    df = pd.read_csv('estadao_noticias_eleicao.csv')
    df = df.fillna('')
    df["docs"] = df["titulo"] + ' ' +  df["subTitulo"] + ' ' + df["conteudo"]
    return df

#Função para remover as pontuações do documentos
def removePunctuationDocs(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens_lists = docs.apply(lambda text: tokenizer.tokenize(text.lower()))
    return tokens_lists

#Função para remover stopwords. 
# stopwords: https://pt.wikipedia.org/wiki/Palavra_vazia
def removeStopWords(tokens_lists):
    stopword = stopwords.words('portuguese')
    filtered_tokens = tokens_lists.apply(lambda tokens: [token for token in tokens if token not in stopword])
    return filtered_tokens

#Função para remover stopwords. 
# stopwords: https://pt.wikipedia.org/wiki/Palavra_vazia                               
def getTokens(docs):
    tokens = removePunctuationDocs(df["docs"])
    filtered_tokens = removeStopWords(tokens)
    return filtered_tokens

In [96]:
def co_occurrence_matrix(docs_tokens):
    docs_bi_grams = docs_tokens.apply(lambda tokens: list(bigrams(tokens)))
    collection_big_grams = [bi_gram for doc_big_grams in docs_bi_grams for bi_gram in doc_big_grams]
    bigram_freq = nltk.FreqDist(collection_big_grams).most_common(len(collection_big_grams))
    matrix = {}
    
    for bigram in bigram_freq:
        postWord = bigram[0][1]
        word = bigram[0][0]
        count = bigram[1]
        co_occurrence_info = (postWord, count)
        
        if word not in matrix:
            matrix[word] = []
        matrix[word].append(co_occurrence_info)

        
    return matrix

In [115]:
def top3Consultas(word, matrix):
    if (word in matrix):
        co_occurrence_info = matrix[word]
        info = [(), (), (), ()]
        for index in range(len(co_occurrence_info)):
            info[index] = co_occurrence_info[index]
            if (index == 3):
                break
        return info

In [119]:
def top3ConsultasOR(words, matrix):
    dict_words = {}
    for word in words:
        info = [()] * 3
        if (word in matrix):
            co_occurrence_info = matrix[word]
            for index in range(len(co_occurrence_info)):
                info[index] = co_occurrence_info[index]
                if (index == 2):
                    break
            dict_words[word] = info
    return dict_words

In [97]:
df = getCSVFormattedAsDF()
docs_tokens = getTokens(df["docs"])

In [98]:
matrix_co_occurence = co_occurrence_matrix(docs_tokens)

In [117]:
top3Consultas("petrobrás", matrix_co_occurence)

[('paulo', 240), ('é', 90), ('graça', 51), ('presidente', 41)]

In [120]:
top3ConsultasOR(["paulo", "presidente", "graça"], matrix_co_occurence)

{'graça': [('foster', 204), ('disse', 5), ('diz', 4)],
 'paulo': [('roberto', 1110), ('skaf', 213), ('câmara', 180)],
 'presidente': [('dilma', 3350), ('luiz', 711), ('república', 536)]}

In [None]:
Escreva uma função que receba um certo termo de consulta e a matriz construída no passo 1 acima e retorneas top-3 palavras 
em ordem decrescente de frequencia.
Expanda a consulta original com os termos retornados no passo 2 acima.
Faça uma busca disjuntiva (OR) considerando a nova consulta.