In [38]:
import json
import pickle
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,GaussianNB


# --- Experiment configuration ---
# Hyperparameters
vector_len = 500    # embedding dimensionality (Word2Vec vector_size, zero-vector length in vectorize)
parte_teste = 0.1   # fraction of samples held out by train_test_split

# Input corpus and output artifact paths (relative paths)
BASE_PATH = "../base_completa/boatos_br_corpus.json"
filename_save = f"./word2VecModels_Multinomial/modelo_word2vec_{vector_len}.model"
model_filesave = f"./word2VecModels_Multinomial/modelo_w2v_{parte_teste}"

In [39]:
# Read the corpus JSON into a DataFrame, then show (rows, columns) as a quick sanity check.
database = pd.read_json(path_or_buf=BASE_PATH)
database.shape

(3351, 11)

In [40]:
# Model inputs: the 'texto-normalizado' column is the feature text, 'rotulo' is the target label.
textos = database['texto-normalizado']
rotulos = database['rotulo']

In [41]:
def obterTokens(texto):
    """Split one document into word tokens with NLTK.

    Args:
        texto: The document text as a single string.

    Returns:
        The token sequence produced by ``nltk.word_tokenize`` for ``texto``.

    NOTE(review): ``word_tokenize`` is called with its default language
    setting; the corpus looks like Portuguese -- confirm whether
    ``language='portuguese'`` should be passed.
    """
    return nltk.word_tokenize(texto)

# Encode the string labels as integers (0 = 'falso', 1 = 'verdade').
# Series.map leaves NaN for any label that is not a key of the mapping.
rotulos = rotulos.map({
    'falso': 0,
    'verdade': 1,
})

# Replace every document string with its token list.
# NOTE: this cell rebinds `textos` and `rotulos`, so it is not idempotent;
# run it once per kernel session (re-run the column-selection cell first otherwise).
textos = textos.apply(obterTokens)

In [42]:
# Train Word2Vec embeddings on the tokenized documents.
# min_count=1 keeps every token in the vocabulary, including words seen only once.
# NOTE(review): the embeddings are trained on all documents, before any train/test split.
# NOTE(review): no explicit seed and workers=4 -- vectors (and downstream scores)
# presumably vary between runs; confirm whether reproducibility is required.
w2v_model = Word2Vec(
    sentences=textos,
    vector_size=vector_len,
    window=5,
    min_count=1,
    workers=4,
)

In [43]:
# Persist the trained embeddings. The target directory is created first so the
# save does not fail when the output folder does not exist yet (e.g. on a
# fresh checkout).
Path(filename_save).parent.mkdir(parents=True, exist_ok=True)
w2v_model.save(filename_save)

In [44]:
# Reload the embeddings from the file written above, so the following cells
# use the persisted model.
w2v_model = Word2Vec.load(filename_save)

In [45]:
# Sanity check: nearest neighbours of a known word in the embedding space.
similares = w2v_model.wv.most_similar('bolsonaro', topn=10)
similares

[('orban', 0.9657760262489319),
 ('michelle', 0.9653307795524597),
 ('messias', 0.9593061208724976),
 ('jair', 0.9565527439117432),
 ('ajudante', 0.9517130255699158),
 ('enjoei', 0.9507244229316711),
 ('ex', 0.9482576847076416),
 ('explícito', 0.9475612640380859),
 ('benazir', 0.9445979595184326),
 ('administraçãojean', 0.9410309791564941)]

In [46]:
# Represent a document by the mean of the vectors of the words it contains.
def vectorize(text):
    """Average the Word2Vec vectors of the tokens in ``text``.

    Args:
        text: Iterable of tokens for one document.

    Returns:
        A 1-D array: the element-wise mean of the vectors of the tokens found
        in ``w2v_model.wv``, or ``np.zeros(vector_len)`` when none of the
        tokens is in the vocabulary.
    """
    embeddings = w2v_model.wv
    # Drop tokens that are missing from the Word2Vec vocabulary.
    known_tokens = [token for token in text if token in embeddings]
    if not known_tokens:
        # No known word in the document: fall back to an all-zero vector.
        return np.zeros(vector_len)
    return embeddings[known_tokens].mean(axis=0)

# One averaged embedding per document, in the same order as `textos`.
text_mean = list(map(vectorize, textos))

In [47]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale each embedding dimension so that every value is non-negative
# before the vectors are fed to MultinomialNB.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split below, so test-set minima/maxima influence the training features.
scaler = MinMaxScaler().fit(text_mean)
X_scaled = scaler.transform(text_mean)

# Check that all values are non-negative (prints the global min and max).
print(np.min(X_scaled), np.max(X_scaled))

0.0 1.0000000000000002


In [48]:
# Hold out `parte_teste` of the samples for evaluation; random_state pins the shuffle.
# NOTE(review): the split is not stratified on the labels -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    rotulos,
    test_size=parte_teste,
    random_state=42,
)

In [49]:
# Fit a multinomial Naive Bayes classifier on the scaled training vectors.
# NOTE(review): the features are dense, real-valued embeddings rather than
# counts; GaussianNB is imported but unused -- confirm MultinomialNB is the
# intended model.
model = MultinomialNB()
model.fit(X_train, y_train)

In [50]:
# Persist the fitted classifier. The context manager closes (and flushes) the
# file handle deterministically, so the pickle is complete on disk before the
# next cell reads it back.
with open(model_filesave, 'wb') as tm:
    pickle.dump(model, tm)

In [51]:
# Read the classifier back from the pickle written above.
# Security: pickle.load can execute arbitrary code -- only load files produced
# by this notebook or another trusted source.
with open(model_filesave, mode='rb') as saved_model:
    model = pickle.load(saved_model)

In [52]:
# Predict labels for the held-out test vectors with the reloaded classifier.
y_pred = model.predict(X_test)

In [53]:
# Evaluate the model on the held-out set: per-class text report plus overall accuracy.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [54]:
# Show the classification report; print() is used so the report's line breaks render.
print(report)

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       214
           1       0.71      0.67      0.69       122

    accuracy                           0.78       336
   macro avg       0.76      0.76      0.76       336
weighted avg       0.78      0.78      0.78       336



In [55]:
# Show the overall accuracy on the held-out test set.
print(accuracy)

0.7797619047619048
