# Bibliotecas e dados necessários

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install unidecode
from unidecode import unidecode

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import spacy
from spacy.lang.pt import stop_words
import random
import json
import os
import re
from itertools import chain
import sys
from __future__ import unicode_literals, print_function
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import base64
import pickle
import csv
csv.field_size_limit(sys.maxsize)

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import *
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
!pip install gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts



In [4]:
with open("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Dados/dataset_full.csv", 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    sentence_data = [row[1].split() for row in tqdm(csv_reader, desc="Carregando dataset", position=0, leave=True)]

Carregando dataset: 498219it [00:04, 109739.73it/s]


# Word2Vec

In [5]:
sentence_tokens = list(filter(lambda st: len(st) >= 3, sentence_data))

In [15]:
model = Word2Vec(sentences=sentence_tokens, vector_size=100, window=5, min_count=1, workers=4)

In [16]:
model_path = "/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Modelos/word2vec.model"
model.save(model_path)

In [21]:
print([x[0] for x in model.wv.most_similar("recif", topn=10)])

['caruaru', 'petrolin', 'nassau', 'atala', 'ssp', 'indianopol', 'sds', 'magan', 'iguarac', 'tuperatam']


# TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [7]:
tf_idf = TfidfVectorizer()
tf_idf_vec = tf_idf.fit_transform([" ".join(st) for st in sentence_tokens])

In [28]:
token_list = tf_idf.get_feature_names_out().tolist()
token_importance = np.array([np.average(tf_idf_vec.getcol(i).toarray()) for i in tqdm(range(len(token_list)))])
token_importance /= np.max(token_importance)

# Ordena os tokens por importância para facilitar a visualização
top_tokens = np.argsort(-token_importance)
token_importance_ord = [token_importance[i] for i in top_tokens]
token_list_ord = [token_list[i] for i in top_tokens]

100%|██████████| 40727/40727 [22:26<00:00, 30.24it/s]


In [29]:
with open("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Dados/tf_idf_weights.csv", 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    for i in range(len(token_importance_ord)):
        csv_writer.writerow([token_list_ord[i], token_importance_ord[i]])

In [32]:
print(token_list_ord[:10])

['municipal', 'contrat', 'public', 'secret', 'municipi', 'pernambuc', 'art', 'prefeit', 'port', 'valor']


# Combinando as técnicas

In [33]:
# Word2Vec
print("Carregando modelo Word2Vec...")
WORD2VEC = Word2Vec.load("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Modelos/word2vec.model")

# TF-IDF
tf_idf_weights = {}
with open("/content/drive/Shareddrives/IA 2023 - Projeto 1 Grupo 3/Dados/tf_idf_weights.csv", 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in tqdm(csv_reader, desc="Carregando tokens TF-IDF", position=0, leave=True):
        tf_idf_weights[row[0]] = float(row[1])

Carregando modelo Word2Vec...


Carregando tokens TF-IDF: 40727it [00:00, 625002.91it/s]


In [34]:
# Impede que algumas palavras muito repetitivas ainda sejam adicionadas
in_range = lambda v: v <= 1.0 and v > 0.001

# Remove acentos e passa para lowercase
formatar = lambda palavra: unidecode(str(palavra).lower())

STEMMER = SnowballStemmer("portuguese", ignore_stopwords=True)
STOPWORDS = stopwords.words('portuguese')

def extract_stem_tokens(sentence: str):
    tokens = [formatar(t) for t in word_tokenize(sentence)]
    important_tokens = list(filter(lambda t: not t in STOPWORDS and t.isalpha(), tokens))
    stems = [STEMMER.stem(t) for t in important_tokens]
    return stems

def vectorize_sentence(sentence: str):
    tokens = [t for t in extract_stem_tokens(sentence) if (t in WORD2VEC.wv) and (t in tf_idf_weights.keys())]
    vector = [WORD2VEC.wv[t] * tf_idf_weights[t] for t in tokens]
    if len(vector) > 0:
        return np.average(vector, axis=0)
    else:
        return None

def vectorize_tokens(text: str, size=20):
    return np.array(([WORD2VEC.wv[t] for t in extract_stem_tokens(text) if t in WORD2VEC.wv] + [np.zeros_like(WORD2VEC.wv[0])] * size)[:size])

In [35]:
vectorize_sentence("publicação importante do diário de pernambuco")

array([-1.04880214e-01, -3.10990423e-01,  3.69531065e-01,  4.81838912e-01,
        2.88921893e-01,  1.06111109e-01, -4.68635798e-01, -8.04245323e-02,
        9.60283577e-02,  1.04440376e-01, -4.37225401e-01,  6.00716114e-01,
       -4.25146192e-01,  2.08196983e-01,  9.75993872e-02,  1.65692002e-01,
        5.36801219e-01,  7.50557303e-01, -8.18136483e-02,  5.49748361e-01,
        4.26764280e-01, -8.65316838e-02,  8.90931487e-02, -1.06212175e+00,
       -4.88438070e-01,  7.09996223e-02,  7.43332386e-01,  5.06432652e-01,
       -4.70587522e-01,  2.60608017e-01, -2.94732898e-01,  2.44098395e-01,
        2.23716140e-01, -8.12888980e-01,  1.10117540e-01,  1.71697944e-01,
       -4.02891040e-01, -1.67427331e-01,  2.75570482e-01,  3.95593166e-01,
       -4.84801531e-02,  7.63033986e-01,  2.98607528e-01, -3.05602759e-01,
       -9.33259726e-05,  1.26543328e-01, -7.96566606e-02, -1.31846607e-01,
        3.36936742e-01, -1.05635069e-01, -1.24273017e-01,  3.56293529e-01,
        2.62537062e-01, -