In [26]:
import csv
import nltk
import pandas as pd

from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [27]:
# Read the semicolon-separated comments file into a DataFrame
dataset = pd.read_csv(
    'dataset_comentarios.csv',
    sep=';',
)

In [28]:
# Display the loaded DataFrame to inspect its columns and a sample of its rows
dataset

Unnamed: 0,ide_formulario_publicado,nom_titulo_formulario_publicado,des_conteudo,qtd_curtidas,qtd_descurtidas,dat_posicionamento,cod_autorizado
0,361682,PL 11098/2018,Não entendi muito bem a proposta.,0,0,2018-12-04 10:49:55,1
1,44036,PL 7180/2014,Acabar com doutrinação nas escolas,63,38,2018-12-04 10:51:56,1
2,44036,PL 7180/2014,Doutrinar crianças,17,35,2018-12-04 10:51:56,1
3,44036,PL 7180/2014,Professores têm que exercer a liberdade de Cát...,36,35,2018-12-04 10:55:44,1
4,360234,PL 10996/2018,privatiza de forma simplista o setor de saneam...,2,1,2018-12-04 11:05:08,1
...,...,...,...,...,...,...,...
381342,522936,PL 5411/2020,Vacinas experimentais. Sem consentimento ou an...,0,0,2021-07-08 11:01:16,0
381343,529652,INC 373/2021,Maior economia,0,0,2021-07-08 11:08:12,0
381344,395685,PL 533/2019,Enquanto universitário do curso de direito (UF...,0,0,2021-07-08 11:11:11,0
381345,529652,INC 373/2021,Muito bom pra carreira dos militares\n,0,0,2021-07-08 11:11:20,0


In [29]:
# Work only with the comment text ('des_conteudo') of the first 10000 rows
first_comments = dataset['des_conteudo'].head(10000)
sentences = list(first_comments)

# Creating a vocabulary

In [30]:
# NLTK Portuguese stopword set, minus "não" ("not") so that the negation word
# is never filtered out of the sentences.
# discard() is used instead of remove(): the result is the same when "não" is
# present, and no KeyError is raised if the installed stopword list lacks it.
stop_words = set(stopwords.words('portuguese'))
stop_words.discard("não")

In [31]:
# Vocabulary: every distinct lower-cased alphanumeric token that is not a stopword.
# NOTE(review): word_tokenize is called with its default language here; the
# corpus cell below tokenizes the same way, so keep both in sync if this changes.
words = set()

for sentence in tqdm(sentences):
    # Skip non-string entries (presumably NaN for missing comments — confirm).
    if isinstance(sentence, str):
        cur_words = [word.lower()
                     for word in
                     nltk.word_tokenize(sentence) if
                     (word.isalnum() and word.lower() not in stop_words)]
        # The update must live inside the guard: outside it, a non-string entry
        # would re-add the previous sentence's tokens, or raise NameError when
        # the very first entry is not a string.
        words.update(cur_words)

100%|██████████| 10000/10000 [00:03<00:00, 2637.69it/s]


In [32]:
# Persist the vocabulary, one word per line.
# - Explicit UTF-8 so accented Portuguese words are written identically on every
#   platform instead of depending on the locale's default encoding.
# - Sorted so the file is reproducible: set iteration order is not stable
#   across runs.
with open('vocabulary.txt', 'w', encoding='utf-8') as vocabulary:
    for word in sorted(words):
        vocabulary.write(word + '\n')

# Creating a .tsv

In [33]:
# Cleaned sentences: lower-cased alphanumeric tokens with stopwords removed,
# re-joined with single spaces. Sentences that end up empty are dropped.
ren_sentences = []

for raw_sentence in tqdm(sentences):
    # Only string entries are processed
    if not isinstance(raw_sentence, str):
        continue

    kept_tokens = []
    for token in nltk.word_tokenize(raw_sentence):
        lowered = token.lower()
        if token.isalnum() and lowered not in stop_words:
            kept_tokens.append(lowered)

    joined = ' '.join(kept_tokens)
    if joined.strip():
        ren_sentences.append(joined)

100%|██████████| 10000/10000 [00:03<00:00, 2696.94it/s]


In [34]:
# Split train, test and validation
X_train, X_rem = train_test_split(ren_sentences, train_size=0.7)
X_test, X_val = train_test_split(X_rem, test_size=0.5)

In [35]:
# Add labels
train_list = [[sentence, 'train'] for sentence in X_train]
test_list = [[sentence, 'test'] for sentence in X_test]
val_list = [[sentence, 'val'] for sentence in X_val]

# Concat lists
concat_list = train_list + test_list + val_list

In [36]:
# Create Dataframe
corpus_df = pd.DataFrame(concat_list, columns=['text', 'train']).replace('"','', regex=True)
corpus_df.to_csv('corpus.tsv', sep='\t', index=False, header=False)
corpus_df

Unnamed: 0,text,train
0,disparate total,train
1,pontos positivos proposta atinge direitos trab...,train
2,irá possibilitar profissionais despreparado ex...,train
3,exame importante requerer profissional direito...,train
4,todos deviam fazer prova valor justo nota fica...,train
...,...,...
9988,diminuir totalmente contrário aumento jornada ...,val
9989,colegas faculdade não condições passar graduaç...,val
9990,não ponto positivo,val
9991,destruição total fauna flora satisfazer ego ge...,val
