In [1]:
import spacy
import logging
import pandas as pd
from gensim.models import Word2Vec

# Download the spaCy pipeline for Portuguese (small news model)
!python -m spacy download pt_core_news_sm

Collecting pt_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2 MB)
[K     |████████████████████████████████| 21.2 MB 1.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')


In [2]:
# Load the train/test splits. The paths point into /content/drive/MyDrive, so this
# presumably needs Google Drive mounted in Colab first — no mount call is visible here.
dados_treino = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/curso word2vec/treino.csv')
dados_teste = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/curso word2vec/teste.csv')

In [3]:
# Peek at 5 random rows; no random_state is passed, so the rows shown change between runs
dados_treino.sample(5)

Unnamed: 0,title,text,date,category,subcategory,link
70690,Obama diz que líderes internacionais estão 'su...,"Líderes internacionais estão ""surpresos"" com o...",2016-05-26,mundo,,http://www1.folha.uol.com.br/mundo/2016/05/177...
81775,Coleção reúne tiras de Flash Gordon de jornais...,"Ao lado de Super-Homem, Batman e Tarzan, també...",2016-02-01,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2016/01...
47988,Partido das Farc se chamará Força Alternativa ...,Força Alternativa Revolucionária do Comum foi ...,2017-08-31,mundo,,http://www1.folha.uol.com.br/mundo/2017/08/191...
69946,Rivais desafiam domínio de Trump em debate rep...,Pela primeira vez desde a abertura da campanha...,2016-01-15,mundo,,http://www1.folha.uol.com.br/mundo/2016/01/172...
77797,Ator Odin Biron assume risco e revela ser gay ...,"Em fevereiro, Odin Biron, ator americano que i...",2015-04-25,ilustrada,,http://www1.folha.uol.com.br/ilustrada/2015/04...


#### Carregando modelo do Português

In [4]:
# `nlp` is the conventional variable name for the loaded spaCy pipeline
nlp = spacy.load('pt_core_news_sm')

###### Testes gerais

In [5]:
# Small example sentence used to explore what the pipeline returns
texto_exemplo = 'Fortaleza é uma cidade cheia de belezas'

# Calling the pipeline on a string builds spaCy's special container, the Doc
doc = nlp(texto_exemplo)
print(doc)

Fortaleza é uma cidade cheia de belezas


In [6]:
# Confirm the type of the object returned by the pipeline
type(doc)

spacy.tokens.doc.Doc

In [7]:
# Indexing a Doc yields tokens; `ents` holds the named entities found in the text
print(doc[0])
print(doc.ents)
print(doc[1])
# `is_stop` flags whether the token is a stop word
print(doc[1].is_stop)

Fortaleza
(Fortaleza,)
é
True


### Pré-Processamento dos Dados

In [8]:
# Lazy generator of lower-cased titles, so the strings are produced on demand.
# NOTE: a generator can be consumed only once — re-run this cell before re-running
# any cell that iterates over `textos_tratamento`, otherwise it yields nothing.
textos_tratamento = (titulos.lower() for titulos in dados_treino.title)
textos_tratamento

<generator object <genexpr> at 0x7f57b86f2750>

In [9]:
def trata_textos(doc):
  """Build a cleaned title from the tokens of `doc`.

  Keeps the text of every token that is not a stop word and is purely
  alphabetic, joined by single spaces.

  Returns:
    The cleaned string when more than two tokens survive the filter,
    otherwise None.
  """
  palavras = [
      token.text
      for token in doc
      if (not token.is_stop) and token.is_alpha
  ]

  # Titles with two or fewer useful words are discarded
  if len(palavras) <= 2:
    return None

  return " ".join(palavras)

In [10]:
# Noisy example (punctuation, digits, symbols) to check what trata_textos filters out
texto_exemplo = 'Fortaleza, é uma "cidade"! 1235 %7 && cheia de belezas'

doc = nlp(texto_exemplo)
trata_textos(doc)

'Fortaleza cidade cheia belezas'

In [11]:
# Build the list of cleaned titles from the Docs streamed by nlp.pipe.
# Entries are None for titles that trata_textos discards (two or fewer valid tokens).
# n_process=-1 presumably asks spaCy to use every available CPU core — see Language.pipe docs.
textos_tratados = [trata_textos(doc) for doc in nlp.pipe(textos_tratamento,
                                                        batch_size=1000,
                                                        n_process=-1)]

In [12]:
# Inspect the first five cleaned titles
textos_tratados[0:5]

['polêmica marine le pen abomina negacionistas holocausto',
 'macron e le pen a o turno frança revés siglas tradicionais',
 'apesar larga vitória legislativas macron terá desafios frente',
 'governo antecipa balanço e alckmin anuncia queda homicídios sp',
 'queda maio a atividade econômica sobe junho bc']

In [13]:
# Wrap the cleaned titles in a single-column DataFrame so pandas can handle the clean-up
titulos_tratados = pd.DataFrame({"titulo": textos_tratados})
titulos_tratados

Unnamed: 0,titulo
0,polêmica marine le pen abomina negacionistas h...
1,macron e le pen a o turno frança revés siglas ...
2,apesar larga vitória legislativas macron terá ...
3,governo antecipa balanço e alckmin anuncia que...
4,queda maio a atividade econômica sobe junho bc
...,...
89995,mural há anos aeroporto recebido moradores gua...
89996,notícias schumacher boas ferrari
89997,olho bilhões governo conceder áreas petróleo
89998,moro deu a lula o papel coitadinho


In [14]:
# Count the missing titles, i.e. those for which trata_textos returned None
titulos_tratados.isna().sum()

titulo    3690
dtype: int64

In [15]:
# Remove missing titles and exact duplicates. NOTE: this rebinds the same name, so the
# DataFrame displayed by the earlier cell no longer matches the variable's contents.
titulos_tratados = titulos_tratados.dropna().drop_duplicates()

In [17]:
# One list of tokens per title: the corpus later passed to build_vocab and train
lista_lista_tokens = [titulo.split(' ') for titulo in titulos_tratados.titulo]

In [None]:
# Enable INFO-level logging with timestamps; the gensim progress messages recorded in
# the outputs below use this format
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

# Word2Vec Builder

## Treinamento do CBOW

In [18]:
# CBOW model (sg=0): context window of 2 words, 300-dimensional vectors, words seen fewer
# than 5 times are ignored, learning rate decays from alpha to min_alpha.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim 4 renamed it to
# `vector_size` — confirm the installed version before re-running.
w2v_modelo_cbow = Word2Vec(sg=0, window=2, 
                           size=300, min_count=5, 
                           alpha=0.03, min_alpha=0.007)

# Scan the corpus once to build the vocabulary, logging progress every 5000 sentences
w2v_modelo_cbow.build_vocab(lista_lista_tokens, progress_per=5000)

2021-07-26 23:13:45,756 : collecting all words and their counts
2021-07-26 23:13:45,758 : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-07-26 23:13:45,775 : PROGRESS: at sentence #5000, processed 34716 words, keeping 10129 word types
2021-07-26 23:13:45,788 : PROGRESS: at sentence #10000, processed 69298 words, keeping 14909 word types
2021-07-26 23:13:45,803 : PROGRESS: at sentence #15000, processed 103841 words, keeping 18223 word types
2021-07-26 23:13:45,819 : PROGRESS: at sentence #20000, processed 138620 words, keeping 20969 word types
2021-07-26 23:13:45,839 : PROGRESS: at sentence #25000, processed 173257 words, keeping 23410 word types
2021-07-26 23:13:45,852 : PROGRESS: at sentence #30000, processed 207976 words, keeping 25453 word types
2021-07-26 23:13:45,867 : PROGRESS: at sentence #35000, processed 242567 words, keeping 27263 word types
2021-07-26 23:13:45,885 : PROGRESS: at sentence #40000, processed 277254 words, keeping 28992 word types
2021-07

In [22]:
# Train the CBOW model for 30 epochs over the tokenised titles; total_examples reuses
# the model's own corpus_count (presumably populated by build_vocab above)
w2v_modelo_cbow.train(lista_lista_tokens, 
                      total_examples=w2v_modelo_cbow.corpus_count, epochs=30)

2021-07-26 23:22:59,318 : training model with 3 workers on 13006 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2
2021-07-26 23:23:00,333 : EPOCH 1 - PROGRESS: at 68.61% examples, 344519 words/s, in_qsize 6, out_qsize 2
2021-07-26 23:23:00,653 : worker thread finished; awaiting finish of 2 more threads
2021-07-26 23:23:00,682 : worker thread finished; awaiting finish of 1 more threads
2021-07-26 23:23:00,686 : worker thread finished; awaiting finish of 0 more threads
2021-07-26 23:23:00,688 : EPOCH - 1 : training on 597929 raw words (502752 effective words) took 1.4s, 371220 effective words/s
2021-07-26 23:23:01,704 : EPOCH 2 - PROGRESS: at 70.26% examples, 350687 words/s, in_qsize 5, out_qsize 0
2021-07-26 23:23:02,051 : worker thread finished; awaiting finish of 2 more threads
2021-07-26 23:23:02,061 : worker thread finished; awaiting finish of 1 more threads
2021-07-26 23:23:02,074 : worker thread finished; awaiting finish of 0 more threads
2021-07-26 23

(15086309, 17937870)

In [37]:
# Qualitative check: words whose CBOW vectors are most similar to 'davi'
w2v_modelo_cbow.wv.most_similar(positive=['davi'])

[('valéria', 0.6324880123138428),
 ('ilustrado', 0.6137306690216064),
 ('poemas', 0.5965014696121216),
 ('bethânia', 0.5868477821350098),
 ('contos', 0.5865117907524109),
 ('dario', 0.5831807851791382),
 ('arnaldo', 0.5813361406326294),
 ('sidney', 0.5623471736907959),
 ('caymmi', 0.5616686344146729),
 ('debbie', 0.5605387091636658)]

## Treinamento Skip-Gram

In [42]:
# Skip-gram model (sg=1) with a wider context window (5) than the CBOW model; the other
# hyper-parameters are the same as the CBOW cell.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim 4 renamed it to
# `vector_size` — confirm the installed version before re-running.
w2v_modelo_skipgram = Word2Vec(sg=1, window=5, 
                               size=300, min_count=5, 
                               alpha=0.03, min_alpha=0.007)

# Scan the corpus once to build the vocabulary, logging progress every 5000 sentences
w2v_modelo_skipgram.build_vocab(lista_lista_tokens, progress_per=5000)

2021-07-26 23:35:01,905 : collecting all words and their counts
2021-07-26 23:35:01,908 : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-07-26 23:35:01,925 : PROGRESS: at sentence #5000, processed 34716 words, keeping 10129 word types
2021-07-26 23:35:01,940 : PROGRESS: at sentence #10000, processed 69298 words, keeping 14909 word types
2021-07-26 23:35:01,954 : PROGRESS: at sentence #15000, processed 103841 words, keeping 18223 word types
2021-07-26 23:35:01,967 : PROGRESS: at sentence #20000, processed 138620 words, keeping 20969 word types
2021-07-26 23:35:01,983 : PROGRESS: at sentence #25000, processed 173257 words, keeping 23410 word types
2021-07-26 23:35:01,997 : PROGRESS: at sentence #30000, processed 207976 words, keeping 25453 word types
2021-07-26 23:35:02,013 : PROGRESS: at sentence #35000, processed 242567 words, keeping 27263 word types
2021-07-26 23:35:02,028 : PROGRESS: at sentence #40000, processed 277254 words, keeping 28992 word types
2021-07

In [43]:
# Train the skip-gram model for 30 epochs over the same tokenised titles; total_examples
# reuses the model's own corpus_count (presumably populated by build_vocab above)
w2v_modelo_skipgram.train(lista_lista_tokens, 
                          total_examples=w2v_modelo_skipgram.corpus_count, epochs=30)

2021-07-26 23:35:05,092 : training model with 3 workers on 13006 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2021-07-26 23:35:06,183 : EPOCH 1 - PROGRESS: at 31.82% examples, 148335 words/s, in_qsize 5, out_qsize 0
2021-07-26 23:35:07,289 : EPOCH 1 - PROGRESS: at 65.26% examples, 150223 words/s, in_qsize 4, out_qsize 1
2021-07-26 23:35:08,224 : worker thread finished; awaiting finish of 2 more threads
2021-07-26 23:35:08,235 : worker thread finished; awaiting finish of 1 more threads
2021-07-26 23:35:08,270 : worker thread finished; awaiting finish of 0 more threads
2021-07-26 23:35:08,271 : EPOCH - 1 : training on 597929 raw words (502905 effective words) took 3.2s, 158906 effective words/s
2021-07-26 23:35:09,349 : EPOCH 2 - PROGRESS: at 30.15% examples, 141613 words/s, in_qsize 4, out_qsize 1
2021-07-26 23:35:10,432 : EPOCH 2 - PROGRESS: at 65.27% examples, 152405 words/s, in_qsize 6, out_qsize 0
2021-07-26 23:35:11,353 : worker thread finished; awa

(15087506, 17937870)

In [51]:
# Qualitative check: words whose skip-gram vectors are most similar to 'futebol'
w2v_modelo_skipgram.wv.most_similar('futebol')

[('confusões', 0.3390807807445526),
 ('cristiane', 0.3343549370765686),
 ('várzea', 0.3302125930786133),
 ('baixe', 0.322268545627594),
 ('espn', 0.31874409317970276),
 ('rúgbi', 0.3181766867637634),
 ('gramados', 0.31760266423225403),
 ('pênaltis', 0.31572258472442627),
 ('parma', 0.31116729974746704),
 ('seleções', 0.31063342094421387)]

In [52]:
# Export only the word vectors (`.wv`) of each model in the text word2vec format
# (binary=False), so they can be reloaded later as KeyedVectors
w2v_modelo_cbow.wv.save_word2vec_format('/content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_cbow.txt', binary=False)
w2v_modelo_skipgram.wv.save_word2vec_format('/content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_skipgram.txt', binary=False)

2021-07-26 23:44:30,751 : storing 13006x300 projection weights into /content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_cbow.txt
2021-07-26 23:44:33,723 : storing 13006x300 projection weights into /content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_skipgram.txt


# Classificação

In [101]:
import pickle
import numpy as np
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [63]:
# Second pipeline with parser, NER, tagger and textcat disabled — presumably to speed up
# processing, since the tokenizer function below only reads is_stop / is_alpha
nlp_c = spacy.load('pt_core_news_sm', disable=['parser', 'ner', 'tagger', 'textcat'])

# Reload the exported vectors from disk instead of reusing the in-memory trained models
w2v_modelo_cbow_carregado = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_cbow.txt')
w2v_modelo_skip_carregado = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_skipgram.txt')

2021-07-27 00:15:57,575 : loading projection weights from /content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_cbow.txt
2021-07-27 00:16:01,610 : loaded (13006, 300) matrix from /content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_cbow.txt
2021-07-27 00:16:01,615 : loading projection weights from /content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_skipgram.txt
2021-07-27 00:16:05,542 : loaded (13006, 300) matrix from /content/drive/MyDrive/Colab Notebooks/curso word2vec/modelo_skipgram.txt


In [74]:
def tokenizador(texto):
  """Split `texto` into lower-cased tokens using the `nlp_c` pipeline.

  Only tokens that are not stop words and are purely alphabetic are kept.

  Returns:
    A list of lower-cased token strings (possibly empty).
  """
  return [
      token.text.lower()
      for token in nlp_c(texto)
      if (not token.is_stop) and token.is_alpha
  ]

def combinacao_vetores_soma(palavras, modelo):
  """Sum the word vectors of `palavras` into a single vector.

  Args:
    palavras: iterable of token strings.
    modelo: object exposing `get_vector(word)` (e.g. the loaded KeyedVectors).
      When it has a `vector_size` attribute, that value sets the size of the
      result; otherwise 300 is used, as in the original implementation.

  Returns:
    A 1-D float numpy array. It is all zeros when `palavras` is empty or
    none of its words is known to the model.
  """
  # Take the dimensionality from the model instead of hard-coding it
  dimensao = getattr(modelo, 'vector_size', 300)
  vetor_resultante = np.zeros(dimensao)

  for palavra in palavras:
    try:
      vetor_resultante += modelo.get_vector(palavra)
    except KeyError:
      # Word missing from the model's vocabulary: skipped on purpose, so it
      # simply contributes nothing to the sum
      continue

  return vetor_resultante

def matriz_vetores(textos, modelo):
  """Build the feature matrix with one summed word vector per text.

  Args:
    textos: sized iterable of strings (a pandas Series or a plain list).
    modelo: word-vector model passed on to `combinacao_vetores_soma`. When it
      has a `vector_size` attribute, that value sets the number of columns;
      otherwise 300 is used, as in the original implementation.

  Returns:
    A float numpy array of shape (len(textos), dimension); row i is the
    vector of the i-th text in iteration order.
  """
  # Take the dimensionality from the model instead of hard-coding it
  dimensao = getattr(modelo, 'vector_size', 300)
  matriz = np.zeros((len(textos), dimensao))

  # Iterating by position avoids relying on `.iloc`, so lists work as well
  for i, texto in enumerate(textos):
    matriz[i] = combinacao_vetores_soma(tokenizador(texto), modelo)

  return matriz

In [69]:
# Quick check of the two helpers on a single sentence.
# NOTE(review): the recorded output lists the token 'dfsdfsdfsg', which is not in this
# input string, so the output appears to come from an earlier version of the cell — re-run.
palavras = tokenizador("texto exemplo carnaval é lindo")
print(palavras)
vetor_texto = combinacao_vetores_soma(palavras, w2v_modelo_cbow_carregado)
print(vetor_texto)

['texto', 'carnaval', 'lindo', 'dfsdfsdfsg']
[-1.46037885 -0.81958984  0.24828935 -2.08132233 -1.02650494 -0.12647618
 -1.4014217   1.19009664  0.46694312  0.40847444 -0.0280055   0.30072606
  0.27546617 -0.20799188  1.08978578  0.7655057  -1.16258472  0.14283474
 -0.32538996  1.76581847  0.5264964   1.29860345  0.97544291  0.27572726
 -0.97167648  0.69820934  0.62241239  0.91330803  0.48801127 -0.21948675
  0.63351974  0.07870368  1.35928601  0.88303885 -0.87579308  0.01355197
  0.05064841 -0.57912856  1.31571801 -0.72656734 -1.1046941  -0.78480443
 -0.54478377  0.20163465 -0.52918182  1.20637165  1.18871804 -0.34430226
 -0.77354899 -0.11728422 -0.06608269 -0.03157455 -1.13258262  1.31059946
 -0.48856129 -1.41943841 -0.17072408 -0.38371621  0.49916689 -1.52884786
 -1.55171576 -1.7597768   0.02994731 -0.61059779 -0.90203869  1.49907406
  1.52603914  2.39689646  0.32426833 -1.40826777 -0.99394915  0.0718447
  0.76289485 -0.49568949  0.79870636  1.43690552 -1.35399117  0.555282
 -0.67986

In [75]:
# Feature matrices from the titles of each split, using the CBOW vectors
matriz_vetores_treino_cbow = matriz_vetores(dados_treino.title, w2v_modelo_cbow_carregado)
matriz_vetores_teste_cbow = matriz_vetores(dados_teste.title, w2v_modelo_cbow_carregado)

# Same titles, using the skip-gram vectors
matriz_vetores_treino_skip = matriz_vetores(dados_treino.title, w2v_modelo_skip_carregado)
matriz_vetores_teste_skip = matriz_vetores(dados_teste.title, w2v_modelo_skip_carregado)

In [88]:
# Sanity check: expect one row per training title and one column per vector dimension
print(matriz_vetores_treino_cbow.shape)

(90000, 300)


In [95]:
# Multinomial logistic regression trained on the CBOW title vectors.
# NOTE(review): the recorded repr below reports solver='newton-cg', which this call does
# not set, while the skip-gram cell's repr reports solver='lbfgs'. The cell was presumably
# edited after it last ran — confirm which solver produced the reported scores.
lr = LogisticRegression(max_iter=300, multi_class='multinomial')
lr.fit(matriz_vetores_treino_cbow, dados_treino.category)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [96]:
# Keep the predicted labels for the classification report; the cell displays the
# classifier's score on the test split
label_prevista = lr.predict(matriz_vetores_teste_cbow)
lr.score(matriz_vetores_teste_cbow, dados_teste.category)

0.7865256179008434

In [97]:
# Per-category precision / recall / f1 of the CBOW-based classifier on the test split
cr = classification_report(dados_teste.category, label_prevista)
print(cr)

              precision    recall  f1-score   support

     colunas       0.81      0.71      0.76      6103
   cotidiano       0.64      0.81      0.71      1698
     esporte       0.93      0.86      0.89      4663
   ilustrada       0.13      0.85      0.22       131
     mercado       0.83      0.78      0.81      5867
       mundo       0.74      0.84      0.79      2051

    accuracy                           0.79     20513
   macro avg       0.68      0.81      0.70     20513
weighted avg       0.82      0.79      0.80     20513



In [98]:
# Same classifier configuration as the CBOW cell, trained on the skip-gram title vectors
lr_skip = LogisticRegression(max_iter=300, multi_class='multinomial')
lr_skip.fit(matriz_vetores_treino_skip, dados_treino.category)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [99]:
# Keep the predicted labels for the classification report; the cell displays the
# classifier's score on the test split
label_prevista_skip = lr_skip.predict(matriz_vetores_teste_skip)
lr_skip.score(matriz_vetores_teste_skip, dados_teste.category)

0.7946180470920879

In [100]:
# Per-category precision / recall / f1 of the skip-gram-based classifier on the test split
cr_skip = classification_report(dados_teste.category, label_prevista_skip)
print(cr_skip)

              precision    recall  f1-score   support

     colunas       0.81      0.72      0.76      6103
   cotidiano       0.64      0.81      0.71      1698
     esporte       0.93      0.87      0.90      4663
   ilustrada       0.14      0.87      0.24       131
     mercado       0.84      0.79      0.82      5867
       mundo       0.76      0.84      0.80      2051

    accuracy                           0.79     20513
   macro avg       0.69      0.82      0.71     20513
weighted avg       0.82      0.79      0.80     20513



In [102]:
# Persist both fitted classifiers with pickle.
# NOTE: only unpickle these files from a trusted location — loading a pickle can run
# arbitrary code.
with open('/content/drive/MyDrive/Colab Notebooks/curso word2vec/lr_cbow.pkl', 'wb') as f:
  pickle.dump(lr, f)

with open('/content/drive/MyDrive/Colab Notebooks/curso word2vec/lr_skip.pkl', 'wb') as f:
  pickle.dump(lr_skip, f)