In [1]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [10]:
import os
import sys
import random
from datetime import datetime

import pandas as pd
import numpy as np
import re
from unidecode import unidecode

from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import spacy

from tqdm.notebook import tqdm

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader,Dataset, TensorDataset

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [77]:
! python -m spacy download pt_core_news_sm > /dev/null

In [4]:
df = pd.read_parquet("data.parquet.zstd")
df

Unnamed: 0,dataset,id,text,off_strict,off_relaxed
0,ToLD-Br,17617711318335429284,Meu nivel de amizade com isis é ela ter meu in...,True,True
1,ToLD-Br,7288264196393788121,"rt USER USER o cara adultera dados, que foram ...",True,False
2,ToLD-Br,1519599024025873488,USER USER USER o cara só é simplesmente o maio...,True,True
3,ToLD-Br,4211815168063456011,eu to chorando vei vsf e eu nem staneio izone ...,True,False
4,ToLD-Br,1231484732880705337,Eleitor do Bolsonaro é tão ignorante q não per...,True,True
...,...,...,...,...,...
27947,OLID-Br,c4eb91336485416b83d3d9de5585c74d,Essa thayse é falsa credo 😂 😂 😂 😂 😂,True,True
27948,OLID-Br,f9e864f275d14bed9cd9e2ae8aed259d,Cara que tirar de contexto USER USER falou uma...,False,False
27949,OLID-Br,574c84477a0243eba5814c4eef41aad3,USER Fala que era da era Obama!!!!! Jornalismo...,True,True
27950,OLID-Br,682e423fda2542e9a914bcf58cb44477,"Existe pessoas que tem fetiche por pessoas ""su...",False,False


In [5]:
nlp = spacy.load("pt_core_news_sm")
lemma = lambda x : x.lemma_

# Normalização dos textos
for i,row in tqdm(df.iterrows(), total=df.shape[0]):
    temp =  row['text'].lower()

    # Remoção de vírgulas, aspas, emojis, quebras de linha
    temp = re.sub(r'[^\w\s]|[\n]', '',temp)
    # Lematização
    temp = " ".join(list(map(lemma, nlp(temp))))
    # Removeção de acentos, cedilhas e semelhantes
    temp = unidecode(temp)

    df.loc[i,'text'] = temp

  0%|          | 0/27952 [00:00<?, ?it/s]

In [11]:
OCC_TRESHOLD = 10
OTHER = '_other'
# Conta quantas vezes cada palavra apareceu
occurences = {}
for t in df['text']:
    words = word_tokenize(t)
    for w in words:
        if w not in occurences:
            occurences[w] = 1
        else:
            occurences[w] += 1

# Mantem apenas as palavras que aparecem mais vezes que o treshold
vocab = []
for w,counter in occurences.items():
    if counter > OCC_TRESHOLD:
        vocab.append(w)

vocab = sorted(vocab)
vocab.append(OTHER)

# Map de palavra para posição no vetor
vocab_index = {}
for i in range(len(vocab)):
    vocab_index[vocab[i]] = i

len(vocab)

2904

In [12]:
def vectorize(text, vocab_index):
    vector = np.zeros(len(vocab_index))

    for w in text.split():
        if w not in vocab_index:
            vector[vocab_index[OTHER]] += 1
        else:
            vector[vocab_index[w]] += 1

    return vector

print(sum(vectorize(df['text'][0],vocab_index)))

34.0


In [37]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 1e-03

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [39]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [40]:
class Net(torch.nn.Module):
    def __init__(self,vocab_size):
        super(Net,self).__init__()
        self.fc1 = nn.Linear(vocab_size,512,dtype=torch.float32)
        self.fc2 = nn.Linear(512,256,dtype=torch.float32)
        self.fc3 = nn.Linear(256,32,dtype=torch.float32)
        self.fc4 = nn.Linear(32,1,dtype=torch.float32)

    def forward(self,X):
        X = X.to(torch.float32)
        temp = torch.relu(self.fc1(X))
        temp = torch.relu(self.fc2(temp))
        temp = torch.relu(self.fc3(temp))
        return self.fc4(temp)


In [73]:
class IHHAAAAA(Dataset):
    def __init__(self,X,Y):
        self.X = X.reset_index()
        self.Y = Y.reset_index()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return {
            "X":torch.tensor(vectorize(self.X.iloc[item]['text'], vocab_index), dtype=torch.float32),
            "Y":torch.tensor([self.Y.iloc[item]['off_relaxed']],dtype=torch.float32)
        }


In [74]:
network = Net(len(vocab))
network.to(PYTORCH_DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(network.parameters(),lr=LEARNING_RATE,momentum=TRAIN_RATIO)

In [75]:
X_train,X_test, y_train, y_test = train_test_split(df['text'],df['off_relaxed'], train_size=.5, random_state=RANDOM_SEED)

train_dataloader = DataLoader(IHHAAAAA(X_train,y_train), shuffle=True,num_workers=0,batch_size=8)

In [76]:
N_EPOCHS = 40
for epoch in range(N_EPOCHS):
    epoch_loss = 0
    epoch_step = 0
    for data in tqdm(train_dataloader,total=len(train_dataloader)):
        optimizer.zero_grad()

        X = data['X'].to(PYTORCH_DEVICE)
        Y = data['Y'].to(PYTORCH_DEVICE)

        output = network(X)

        loss = criterion(output, Y)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_step += 1

    print("IHHHAAAAA", epoch_loss/epoch_step)
torch.save(network, './model.pth')

  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.6707527434648483


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.6586625494476585


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.6549455878491667


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.6477501127947789


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.6364632752293782


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.6239005742103083


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.6066707564580761


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.5834839604347176


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.5567844490545711


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.5318834718698492


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.5112000456198188


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.4926063466580445


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.47843664674739805


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.4608855121004015


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.44810500148626214


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.43219859783367626


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.4181363316509781


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.4014037476077411


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.390055499292812


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.3700368863407311


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.3577518195434168


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.3404956482527815


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.3214940371585697


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.3036319336597501


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.2863488377396574


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.269954370478681


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.24527493208800308


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.2339197991636561


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.21292009023505967


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.2038415647274428


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.18397422955474327


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.17185120960979008


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.1603069066253406


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.14403883186210284


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.13626302748147673


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.12476295713134297


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.1151362183248186


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.10783047870448703


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.10062872180709873


  0%|          | 0/1747 [00:00<?, ?it/s]

IHHHAAAAA 0.09638621709117604


In [78]:
test_dataloader = DataLoader(IHHAAAAA(X_test,y_test), shuffle=True,num_workers=0,batch_size=8)

In [79]:
actual_outputs = []
target_outputs = []

network.eval()

with torch.no_grad():
  for data in tqdm(test_dataloader,total=len(test_dataloader)):
      X = data['X'].to(PYTORCH_DEVICE)
      output = network(X)

      processed_targets = data["Y"].tolist()
      processed_outputs = (
          torch.sigmoid(output).cpu().detach().numpy().tolist()
      )

      actual_outputs.extend(processed_outputs)
      target_outputs.extend(processed_targets)

  0%|          | 0/1747 [00:00<?, ?it/s]

In [81]:
FIXED_THRESHOLD = 0.9
fixed_results = np.array(actual_outputs) > FIXED_THRESHOLD
fixed_targets = np.array(target_outputs) > FIXED_THRESHOLD

In [82]:
from sklearn.metrics import (
    f1_score, fbeta_score, accuracy_score, recall_score, precision_score)

fixed_weighted_f1 = f1_score(fixed_targets, fixed_results, average='weighted')
fixed_macro_f1 = f1_score(fixed_targets, fixed_results, average='macro')
fixed_weighted_f2 = fbeta_score(fixed_targets, fixed_results, beta=2, average='weighted')
fixed_macro_f2 = fbeta_score(fixed_targets, fixed_results, beta=2, average='macro')
fixed_accuracy = accuracy_score(fixed_targets, fixed_results)
fixed_recall = recall_score(fixed_targets, fixed_results, average='weighted')
fixed_precision = precision_score(fixed_targets, fixed_results, average='weighted')

print("Model Metrics:")
print(f"Weighted F1 = {fixed_weighted_f1:.6f}")
print(f"Macro F1 = {fixed_macro_f1:.6f}")
print(f"Weighted F2 Score = {fbeta_score(fixed_targets, fixed_results, beta=2, average='weighted'):.6f}")
print(f"Macro F2 Score = {fbeta_score(fixed_targets, fixed_results, beta=2, average='macro'):.6f}")
print(f"Accuracy = {fixed_accuracy:.6f}")
print(f"Recall = {fixed_recall:.6f}")
print(f"Precision = {fixed_precision:.6f}")

Model Metrics:
Weighted F1 = 0.707940
Macro F1 = 0.675379
Weighted F2 Score = 0.716772
Macro F2 Score = 0.666786
Accuracy = 0.730824
Recall = 0.730824
Precision = 0.737769
