In [1]:
import pandas as pd

import numpy as np

from scipy import sparse

import nltk
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from collections import Counter

In [2]:
news = pd.read_csv("estadao_noticias_eleicao.csv", encoding="utf-8")
news = news.replace(np.nan, '', regex = True)
news.head()

Unnamed: 0,timestamp,titulo,subTitulo,conteudo,url,idNoticia
0,2014-12-31T00:00:00Z,PT espera 30 mil pessoas em festa na Esplanada,Objetivo é demonstrar apoio popular a Dilma e ...,BRASÍLIA - Após o desgaste provocado com o lan...,"http://politica.estadao.com.br/noticias/geral,...",1
1,2014-12-31T00:00:00Z,Alckmin toma posse de olho no Planalto,Governador reeleito tenta amarrar tucanos paul...,"Reeleito em outubro, o governador tucano Geral...","http://politica.estadao.com.br/noticias/geral,...",2
2,2014-12-31T00:00:00Z,Seis obstáculos e desafios do segundo mandato ...,"Em meio a escândalo de corrupção, presidente t...",1. Rearranjo das contas A nova equipe econôm...,"http://politica.estadao.com.br/noticias/geral,...",3
3,2014-12-31T00:00:00Z,,Veja as principais fotos do dia e dos eventos ...,,"http://fotos.estadao.com.br/fotos/politica,dil...",4
4,2014-12-31T00:00:00Z,,Veja as principais fotos do dia e dos eventos ...,,"http://fotos.estadao.com.br/fotos/politica,dil...",5


In [3]:
content = news.titulo + " " + news.subTitulo + " " + news.conteudo
content = content.fillna("")

In [4]:
def co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    n = len(vocab)
   
    vocab_to_index = {word:i for i, word in enumerate(vocab)}
    
    bi_grams = list(bigrams(corpus))

    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    I=list()
    J=list()
    V=list()
    
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]

        I.append(vocab_to_index[previous])
        J.append(vocab_to_index[current])
        V.append(count)
        
    co_occurrence_matrix = sparse.coo_matrix((V,(I,J)), shape=(n,n))

    return co_occurrence_matrix, vocab_to_index

In [5]:
tokenizer = RegexpTokenizer(r'\w+')
tokens_lists = content.apply(lambda text: tokenizer.tokenize(text.lower()))

In [6]:
stopword_ = stopwords.words('portuguese')
filtered_tokens = tokens_lists.apply(lambda tokens: [token for token in tokens if token not in stopword_])

In [7]:
tokens = [token for tokens_list in filtered_tokens for token in tokens_list]

In [8]:
matrix, vocab = co_occurrence_matrix(tokens)

In [9]:
consultable_matrix = matrix.tocsr()

In [10]:
def consult_frequency(w1, w2):
    return(consultable_matrix[vocab[w1],vocab[w2]])

In [11]:
w1 = 'poucos'
w2 = 'recursos'
consult_frequency(w1, w2)

3

In [12]:
inverted_vocab = {vocab[key]: key for key in vocab}

In [52]:
def top3(word):
    co_ocorr_words = np.reshape(consultable_matrix[vocab[word]].toarray(), -1)
    
    dic = {}
    
    for i in range(len(co_ocorr_words)):
        dic[inverted_vocab[i]] = co_ocorr_words[i]
        
    return sorted(dic.items(), key=lambda x: x[1], reverse = True)[0:3]

In [53]:
print(top3('dilma'))
print(top3('petrobrás'))

[('rousseff', 3852), ('disse', 312), ('é', 308)]
[('paulo', 240), ('é', 90), ('graça', 51)]


In [19]:
def expand(word):
    query = []
    query.append(word)
    query.extend(i[0] for i in top3(word))
    
    return " ".join(i for i in query)

In [48]:
print(expand('dilma'))
print(expand('lava'))
print(expand('michel'))
print(expand('aécio'))
print(expand('petrobrás'))

dilma rousseff disse é
lava jato lato porque
michel temer saliba louis
aécio neves disse é
petrobrás paulo é graça


In [21]:
def gera_tokens(df):
    indexes = defaultdict(set)
    for index,row in df.iterrows():
        if isinstance(row['titulo'], str):
            titulo_tokens = (word.lower() for word in (nltk.word_tokenize(row['titulo'])))
        if isinstance(row['subTitulo'], str):
            subTitulo_tokens = (word.lower() for word in (nltk.word_tokenize(row['subTitulo'])))
        if isinstance(row['conteudo'], str):
            conteudo_tokens = (word.lower() for word in (nltk.word_tokenize(row['conteudo'])))
            a = Counter(word.lower() for word in (nltk.word_tokenize(row['conteudo']))) #TF (term frequency)
        #if isinstance(row['url'], str):
            #url_tokens = (word.lower() for word in (nl.word_tokenize(row['url'])))
        
        
        for token in titulo_tokens:
            indexes[token].add( (int(row['idNoticia']), a[token]) )
                
        for token in subTitulo_tokens:
            indexes[token].add( (int(row['idNoticia']), a[token]) )

        for token in conteudo_tokens:
            indexes[token].add( ( int(row['idNoticia']), a[token] ) )
            
        #for token in url_tokens:
            #indexes[token].add( (int(row['idNoticia']), a[token]) )
        
    return indexes

In [22]:
dic = gera_tokens(news)

In [23]:
for key in dic.keys():
    try:
        dic[key] = (dic[key],  np.log( (len(dic)+1) / len(dic[key]) ) )
    except:
        raise

In [33]:
def tfidf(query):
    tfidf_dict = {}
    for term in query:
        for doc_tf in dic[term][0]:
            if (doc_tf[0] in  tfidf_dict):               
                 tfidf_dict[doc_tf[0]] += doc_tf[1] * dic[term][1]
            else:
                tfidf_dict[doc_tf[0]] = doc_tf[1] * dic[term][1]
    
    return sorted(tfidf_dict.items(), key= lambda x: x[1], reverse = True)

In [44]:
def search(docs):
      
    tops = sorted(docs, key= lambda x: x[1], reverse = True)[0:10]    
        
    return [doc[0] for doc in tops[0:10]]

In [61]:
print(search(tfidf('dilma')))
q1 = tfidf('dilma') + tfidf('rousseff') + tfidf('disse') + tfidf('é')
print(search(q1)) 

[155, 6554, 3942, 7017, 5129, 7, 6244, 5683, 7161, 5018]
[7, 6554, 7017, 155, 5683, 7158, 5129, 3942, 7161, 6554]
