# Gerador de modelo Word2vec

In [2]:
import os, glob, pickle
import spacy
from spacy import displacy
from tqdm import tqdm

import string
from string import punctuation
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec, word2vec

import pandas as pd
import time
import re
import json
import os
from pprint import pprint
from IPython.display import clear_output

In [36]:
outputs = '../outputs/'

In [2]:
nlp = spacy.load("pt_core_news_sm")

# retrieve data from MongoDB

In [3]:
import pymongo
from pymongo import MongoClient
import dns
client = MongoClient()

In [4]:
db = client.twitter
collection_ufmg = db.ufmg_filtered

In [5]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [6]:
file_len = 7503436
objects = collection_ufmg.find({})
tweets_list = []
data = pd.DataFrame()
count = 0
for obj in objects:
    date = obj['date']
    if isinstance(date, int):
        date = date/1000
    else: date = time.mktime(time.strptime(date[:10], '%Y-%m-%d'))
    #if date >= 1451617260: # 1451617260 = 2016-01-01
    obj['date'] = time.strftime('%Y-%m-%d', time.localtime(date))
    obj['text'] = re.sub(r'\\', '', obj['text'])
    if 'extended_tweet' in obj: 
        obj['extended_tweet'] = re.sub(r'\\', '', obj['extended_tweet']['full_text'])
    tweets_list.append(obj)

    count += 1
    if count % (int(file_len/1000)) == 0:
        
        #here I reset the list to save memory usage
        if tweets_list: data = data.append(tweets_list, ignore_index=True)
        tweets_list = []
        
        clear_output()
        
        frac = count/file_len*100
        print("%.1f" % frac, "% done", sep="")

100.0% done


In [41]:
data = data[['text']]

# clear text

In [None]:
#testing regex
text = 'estou com dengue, estou mesmo, nada restou'
text = re.sub('(^|\W)estou', r'\1to', text)
text = re.sub('(\W)com(\W)', r'\1cm\2', text)

text

In [7]:
def clear_text(text):
    text = re.sub('a+', 'a', text)
    text = re.sub('e+', 'e', text)
    text = re.sub('i+', 'i', text)
    text = re.sub('o+', 'o', text)
    text = re.sub('u+', 'u', text)
    text = re.sub('(^|\W)estou', r'\1to', text)
    text = re.sub('(\W)com(\W)', r'\1cm\2', text)
    text = re.sub('(\W)muito(\W)', r'\1mto\2', text)
    #text = re.sub('https?\:\/\/[\w\.\/\?\\]+', '', text) # remove URLs, omitted to avoid errors
    return text

In [8]:
def remove_special_char(text):
    text = re.sub('[áàãâ]', 'a', text)
    text = re.sub('[óòõô]', 'o', text)
    text = re.sub('[éèê]', 'e', text)
    text = re.sub('[íì]', 'i', text)
    text = re.sub('[úù]', 'u', text)
    text = re.sub('ç', 'c', text)
    return text

# build tokens

In [13]:
def build_tokens(text):
    if text is not None: tokens = tknzr.tokenize(text.translate(str.maketrans('', '', string.punctuation)))
    else: tokens = []
    return tokens

In [42]:
data['text_original'] = data['text']
data['text'] = data['text'].apply(lambda x: clear_text(x.lower()))
data['tokens'] = data['text'].apply(lambda x: build_tokens(x))

#### build tokens for encoded text (just removed special characters)

In [43]:
data['text_encoded'] = data['text'].apply(lambda x: remove_special_char(x))
data['tokens_encoded'] = data['text_encoded'].apply(lambda x: build_tokens(x))

#### texts to variables

In [34]:
VT = list(data['tokens'])
VT_encoded = list(data['tokens_encoded'])

#### build tokens and variable with bigrams
* source: https://radimrehurek.com/gensim/models/phrases.html

In [19]:
from gensim.models.phrases import Phrases, Phraser

In [44]:
sentences = list(data['tokens_encoded'])
phrases = Phrases(sentences, min_count=10, threshold=1)

VT_encoded_bigrams = []
bigram = Phraser(phrases)
for sent in bigram[sentences]:
    VT_encoded_bigrams.append(sent)

In [46]:
print(VT_encoded_bigrams[:5])

[['musica', 'dos_vizinhos', 'zika', 'aq', 'da_comunidade'], ['sei_que', 'estou', 'atrasada', 'mas', 'feliz_ano', 'novo', 'que', 'esse_ano', 'seja', 'zika', 'para', 'todos_nos', 'ask_5sosfam'], ['o', 'ano_ja', 'comeca_assim', 'eu', 'em_casa', 'com_suspeita', 'de', 'dengue'], ['vitoriawg', 'imagina_qnd', 'a', 'lua', 'entrar_em', 'aquario', 'eu_vo', 'vira', 'de', 'humanas', 'vo', 'aplaudi_o', 'sol', 'faze_uma', 'tatto', 'iludi', 'os_boy', 'vai', 'se', 'zika_virus'], ['o', 'mais', 'zika', 'e', 'quem', 'tem_palavra', 'e', 'nao', 'que_vende', 'po']]


In [47]:
#Save collocation model.
bigram.save(os.path.join(outputs, "bigram_model.pkl"))

## Word2vec 

In [None]:
#v1 = virus_tweets, v2 = virus_tweets_encoded, v3 = virus_tweets_encoded_bigrams
for file, var_list in zip(['virus_tweets.w2v', 'virus_tweets_encoded.w2v', 'virus_tweets_encoded_bigrams.w2v'], 
                          [VT, VT_encoded, VT_encoded_bigrams]):
    model = Word2Vec(sentences=var_list, workers=32)
    model.save(file)

### Exploring the first model

In [None]:
file = 'virus_tweets.w2v'
model = Word2Vec.load(file)

In [40]:
for i, word in enumerate(model.wv.vocab):
    if i == 10:
        break
    print(word)

musica
dos
vizinhos
zika
aq
da
comunidade
sei
que
estou


In [42]:
print(list(model.wv['dengue']))

[2.3886912,
 0.28217658,
 -1.9874773,
 -0.51558894,
 2.1879008,
 -0.19042288,
 0.6908539,
 -1.0632348,
 -0.67359257,
 3.0175622,
 2.2130032,
 -1.4433424,
 2.760831,
 -2.1444695,
 -0.2821184,
 0.58656615,
 1.0061878,
 1.1807309,
 0.54593456,
 -2.3770607,
 0.43160978,
 0.050366312,
 0.8009387,
 -1.0630683,
 -0.29855117,
 1.0814375,
 0.6338707,
 -0.58640933,
 -3.4926078,
 2.4008555,
 -0.5238663,
 -0.7037424,
 0.7415925,
 -0.30719492,
 0.9929472,
 5.1566324,
 0.24720924,
 -3.5077336,
 -2.0185099,
 -1.5160787,
 -0.6066206,
 1.5647931,
 2.6941617,
 -2.4217505,
 3.4705436,
 -0.01701793,
 0.87891155,
 1.8238404,
 -1.3465565,
 -1.3153664,
 2.1167681,
 -0.6209916,
 -3.9595716,
 -1.1521416,
 -2.2909615,
 -3.2971458,
 2.292861,
 0.5738925,
 -2.6859908,
 -0.5351569,
 2.9647572,
 -0.8235569,
 1.3786242,
 -2.7816575,
 1.4543685,
 -2.0188134,
 1.541914,
 -1.7891498,
 0.09042452,
 1.3875531,
 3.1705196,
 1.4111038,
 0.6391565,
 1.4406295,
 4.193468,
 -0.047331657,
 -2.1729064,
 -0.7105255,
 0.08725463,

In [43]:
len(model.wv.vocab)

179911

In [46]:
model.wv.most_similar(positive=['dengue'], topn=20)

[('aedes', 0.6367322206497192),
 ('denguezika', 0.6295568943023682),
 ('de', 0.6234614849090576),
 ('virus', 0.609860360622406),
 ('zika', 0.5731625556945801),
 ('que', 0.5616250038146973),
 ('a', 0.5592842102050781),
 ('rt', 0.5592179298400879),
 ('e', 0.5446224212646484),
 ('o', 0.5232881307601929),
 ('zikadengue', 0.5191036462783813),
 ('no', 0.5125505924224854),
 ('nao', 0.5100154876708984),
 ('doenca', 0.5021892189979553),
 ('se', 0.49738809466362),
 ('ja', 0.4926435947418213),
 ('mas', 0.48960384726524353),
 ('aegypti', 0.47452616691589355),
 ('httpstcobvxuoriibu', 0.4638242721557617),
 ('la', 0.4629664421081543)]

In [47]:
model.corpus_total_words

60306788