# AVISO!

Recomendamos muito consultar a versão no github, onde os arquivos estão 
organizados corretamente. Além disso, alguns componentes de código encontram-se
em arquivos `.py`.

Link: <https://github.com/DanielHLelis/Neural-Nets-TP2>

## Authors

- D. H. Lelis - 12543822
- Samuel Veronez - 12542626
- Fernando Campos - 12542352


# Exploring the Datasets

In [1]:
import os

import polars as pl
import plotly.express as px

from IPython.display import display

## Compress original data

In [2]:
ROOT_PATH = '../'

try:
    _olid_train = pl.read_csv(os.path.join(ROOT_PATH, "data/olid-br/train.csv"))
    _olid_train.write_parquet(os.path.join(ROOT_PATH, "data/olid-br/train.parquet.zstd"), compression="zstd", compression_level=9)

    _olid_test = pl.read_csv(os.path.join(ROOT_PATH, "data/olid-br/test.csv"))
    _olid_test.write_parquet(os.path.join(ROOT_PATH, "data/olid-br/test.parquet.zstd"), compression="zstd", compression_level=9)

    _told = pl.read_csv(os.path.join(ROOT_PATH, "data/told-br/told-br.csv"))
    _told.write_parquet(os.path.join(ROOT_PATH, "data/told-br/told-br.parquet.zstd"), compression="zstd", compression_level=9)
except:
    pass

## OLID-BR

### OLID-BR: Data Loading

In [3]:
olidbr_train = pl.read_parquet(os.path.join(ROOT_PATH, "data/olid-br/train.parquet.zstd"))
olidbr_test = pl.read_parquet(os.path.join(ROOT_PATH, "data/olid-br/test.parquet.zstd"))

In [4]:
# Join the train and test datasets
olidbr = pl.concat([olidbr_train, olidbr_test])

# Display 5 random samples
olidbr.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False


### OLID-BR: Data Distribution

In [5]:
print("OLID-BR Train")
print(olidbr_train['is_offensive'].value_counts())

print("OLID-BR Test")
print(olidbr_test['is_offensive'].value_counts())

print("OLID-BR Full")
print(olidbr['is_offensive'].value_counts())


OLID-BR Train
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ NOT          ┆ 762   │
│ OFF          ┆ 4452  │
└──────────────┴───────┘
OLID-BR Test
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ NOT          ┆ 254   │
│ OFF          ┆ 1484  │
└──────────────┴───────┘
OLID-BR Full
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ NOT          ┆ 1016  │
│ OFF          ┆ 5936  │
└──────────────┴───────┘


In [6]:
px.histogram(
    olidbr, 
    x='is_offensive', 
    color='is_offensive', 
    title='OLID-BR Offensive Distribution', 
    template='plotly_dark',
).show()

### Reclassify Offensiveness

In [7]:
# Recompiling the dataset with the sum of all offensive categories
olidbr_recompiled = olidbr.with_columns([
    pl.sum_horizontal(
        pl.col("profanity_obscene"), 
        pl.col("health"), 
        pl.col("ideology"), 
        pl.col("insult"), 
        pl.col("lgbtqphobia"), 
        pl.col("other_lifestyle"), 
        pl.col("physical_aspects"), 
        pl.col("racism"), 
        pl.col("religious_intolerance"), 
        pl.col("sexism"), 
        pl.col("xenophobia"),
    ).alias("offensive"),
    pl.sum_horizontal(
        pl.col("health"), 
        pl.col("ideology"), 
        pl.col("insult"), 
        pl.col("lgbtqphobia"), 
        pl.col("other_lifestyle"), 
        pl.col("physical_aspects"), 
        pl.col("racism"), 
        pl.col("religious_intolerance"), 
        pl.col("sexism"), 
        pl.col("xenophobia"),
    ).alias("offensive_allow_profane")
])

olidbr_recompiled = olidbr_recompiled.with_columns(
    [
        pl.when(pl.col("offensive") > 0).then(True).otherwise(False).alias("offensive_discrete"),
        pl.when(pl.col("offensive_allow_profane") > 0).then(True).otherwise(False).alias("offensive_allow_profane_discrete"),
    ]
)

In [8]:
olidbr_recompiled.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia,offensive,offensive_allow_profane,offensive_discrete,offensive_allow_profane_discrete
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,u32,u32,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False,0,0,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True


In [9]:
print("OLID-BR Recompiled")
print(olidbr_recompiled['offensive_discrete'].value_counts())

print("OLID-BR Recompiled (Allow Profane)")
print(olidbr_recompiled['offensive_allow_profane_discrete'].value_counts())

OLID-BR Recompiled
shape: (2, 2)
┌────────────────────┬───────┐
│ offensive_discrete ┆ count │
│ ---                ┆ ---   │
│ bool               ┆ u32   │
╞════════════════════╪═══════╡
│ false              ┆ 1027  │
│ true               ┆ 5925  │
└────────────────────┴───────┘
OLID-BR Recompiled (Allow Profane)
shape: (2, 2)
┌─────────────────────────────────┬───────┐
│ offensive_allow_profane_discre… ┆ count │
│ ---                             ┆ ---   │
│ bool                            ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ true                            ┆ 5680  │
│ false                           ┆ 1272  │
└─────────────────────────────────┴───────┘


#### An overview of the balance of the dataset

As we can see, the dataset is very unbalanced towards offensive comments. 
Because of such, we might merge it with other datasets for a better balance.

## ToLD-Br

### ToLD-Br: Data Loading

FYI, the data is classified by offense type, with the values from 0 to 3,
indicating the number of times the comment was classified as offensive.


In [10]:
toldbr = pl.read_parquet(os.path.join(ROOT_PATH, "data/told-br/told-br.parquet.zstd"))

In [11]:
toldbr.sample(5, seed=42)

text,homophobia,obscene,insult,racism,misogyny,xenophobia
str,f64,f64,f64,f64,f64,f64
"""8 demais porra""",0.0,2.0,0.0,0.0,0.0,0.0
"""rt @user @user felizmente os s…",0.0,0.0,0.0,0.0,0.0,0.0
"""caralho tandara #voleinaglobo …",0.0,1.0,0.0,0.0,0.0,0.0
"""vamo c calma mo nosso filho v…",0.0,0.0,0.0,0.0,0.0,0.0
"""que puta sentimento lixo. quer…",0.0,0.0,0.0,0.0,0.0,0.0


### ToLD-Br: Data Distribution

In [12]:
px.histogram(
    toldbr,
    x=['homophobia', 'obscene', 'insult', 'racism', 'misogyny', 'xenophobia'],
    title='ToLD-Br Category Classification Distribution', 
    template='plotly_dark',
).show()

In [13]:
toldbr_off = toldbr.with_columns([
    pl.sum_horizontal(
        pl.col('homophobia'), 
        pl.col('obscene'), 
        pl.col('insult'), 
        pl.col('racism'), 
        pl.col('misogyny'), 
        pl.col('xenophobia'),
    ).alias('offensive'),
])

In [14]:
toldbr_off.sample(5, seed=42)

text,homophobia,obscene,insult,racism,misogyny,xenophobia,offensive
str,f64,f64,f64,f64,f64,f64,f64
"""8 demais porra""",0.0,2.0,0.0,0.0,0.0,0.0,2.0
"""rt @user @user felizmente os s…",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""caralho tandara #voleinaglobo …",0.0,1.0,0.0,0.0,0.0,0.0,1.0
"""vamo c calma mo nosso filho v…",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""que puta sentimento lixo. quer…",0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
px.histogram(
    toldbr_off,
    x='offensive',
    color='offensive',
    title='ToLD-Br Offensive Distribution', 
    template='plotly_dark',
).show()

In [16]:
toldbr_class_threshold = 1

toldbr_discrete = toldbr_off.with_columns([
    pl.when(pl.col('offensive') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_offensive_low'),
    pl.when(pl.col('homophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_homophobia_low'),
    pl.when(pl.col('obscene') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_obscene_low'),
    pl.when(pl.col('insult') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_insult_low'),
    pl.when(pl.col('racism') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_racism_low'),
    pl.when(pl.col('misogyny') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_misogyny_low'),
    pl.when(pl.col('xenophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_xenophobia_low'),
])

toldbr_class_threshold = 2
toldbr_discrete = toldbr_discrete.with_columns([
    pl.when(pl.col('offensive') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_offensive_high'),
    pl.when(pl.col('homophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_homophobia_high'),
    pl.when(pl.col('obscene') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_obscene_high'),
    pl.when(pl.col('insult') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_insult_high'),
    pl.when(pl.col('racism') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_racism_high'),
    pl.when(pl.col('misogyny') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_misogyny_high'),
    pl.when(pl.col('xenophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_xenophobia_high'),
]).select([
    'text', 'is_offensive_low', 
    'is_homophobia_low', 'is_obscene_low', 'is_insult_low', 
    'is_racism_low', 'is_misogyny_low', 'is_xenophobia_low',
    'is_offensive_high', 'is_homophobia_high', 'is_obscene_high', 
    'is_insult_high', 'is_racism_high', 'is_misogyny_high', 'is_xenophobia_high',
])


In [17]:
toldbr_discrete.sample(5, seed=42)

text,is_offensive_low,is_homophobia_low,is_obscene_low,is_insult_low,is_racism_low,is_misogyny_low,is_xenophobia_low,is_offensive_high,is_homophobia_high,is_obscene_high,is_insult_high,is_racism_high,is_misogyny_high,is_xenophobia_high
str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""8 demais porra""",True,False,True,False,False,False,False,True,False,True,False,False,False,False
"""rt @user @user felizmente os s…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""caralho tandara #voleinaglobo …",True,False,True,False,False,False,False,False,False,False,False,False,False,False
"""vamo c calma mo nosso filho v…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""que puta sentimento lixo. quer…",False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
print("ToLD-BR (At least 1 flag)")
print(toldbr_discrete['is_offensive_low'].value_counts())

print("ToLD-BR (At least 2 flags)")
print(toldbr_discrete['is_offensive_high'].value_counts())

ToLD-BR (At least 1 flag)
shape: (2, 2)
┌──────────────────┬───────┐
│ is_offensive_low ┆ count │
│ ---              ┆ ---   │
│ bool             ┆ u32   │
╞══════════════════╪═══════╡
│ false            ┆ 11745 │
│ true             ┆ 9255  │
└──────────────────┴───────┘
ToLD-BR (At least 2 flags)
shape: (2, 2)
┌───────────────────┬───────┐
│ is_offensive_high ┆ count │
│ ---               ┆ ---   │
│ bool              ┆ u32   │
╞═══════════════════╪═══════╡
│ true              ┆ 4816  │
│ false             ┆ 16184 │
└───────────────────┴───────┘


In [19]:
px.histogram(
    toldbr_discrete,
    x='is_offensive_low',
    color='is_offensive_low',
    title='ToLD-Br Offensive Distribution (Discrete Low)', 
    template='plotly_dark',
).show()

px.histogram(
    toldbr_discrete,
    x='is_offensive_high',
    color='is_offensive_high',
    title='ToLD-Br Offensive Distribution (Discrete High)', 
    template='plotly_dark',
).show()

#### An overview of the balance of the dataset

This dataset is way more balanced, when compared to OLID-BR. But, there is a
slight issue with the criteria, it seems. There aren't filtering for profanity
and it seems like one of the labelers considered profanit as obscene, while
the others, not.

When going for a stricter filter, we might use the "Low" variant, while for a
more relaxed one, we might use the "High" variant.

## Simplifying the datasets

In [20]:
olidbr_recompiled.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia,offensive,offensive_allow_profane,offensive_discrete,offensive_allow_profane_discrete
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,u32,u32,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False,0,0,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True


In [21]:
toldbr_discrete.sample(5, seed=42)

text,is_offensive_low,is_homophobia_low,is_obscene_low,is_insult_low,is_racism_low,is_misogyny_low,is_xenophobia_low,is_offensive_high,is_homophobia_high,is_obscene_high,is_insult_high,is_racism_high,is_misogyny_high,is_xenophobia_high
str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""8 demais porra""",True,False,True,False,False,False,False,True,False,True,False,False,False,False
"""rt @user @user felizmente os s…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""caralho tandara #voleinaglobo …",True,False,True,False,False,False,False,False,False,False,False,False,False,False
"""vamo c calma mo nosso filho v…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""que puta sentimento lixo. quer…",False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [22]:
toldbr_final = toldbr_discrete.select([
    pl.lit('ToLD-Br').alias('dataset'),
    pl.col('text').hash().alias('id').cast(pl.String),
    pl.col('text').str.replace_all('@user', 'USER'), # Replace @user with USER, to match OLID-BR
    pl.col('is_offensive_low').alias('off_strict'),
    pl.col('is_offensive_high').alias('off_relaxed'),
])
toldbr_final.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""ToLD-Br""","""12559603335008373494""","""8 demais porra""",True,True
"""ToLD-Br""","""1455693542618679752""","""rt USER USER felizmente os sol…",False,False
"""ToLD-Br""","""17500462964768468320""","""caralho tandara #voleinaglobo …",True,False
"""ToLD-Br""","""2818557597701017591""","""vamo c calma mo nosso filho v…",False,False
"""ToLD-Br""","""14538197625843834331""","""que puta sentimento lixo. quer…",False,False


In [23]:
olidbr_final = olidbr_recompiled.select([
    pl.lit('OLID-Br').alias('dataset'),
    pl.col('id').cast(pl.String),
    'text',
    pl.col('offensive_discrete').alias('off_strict'),
    pl.col('offensive_allow_profane_discrete').alias('off_relaxed'),
])
olidbr_final.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""OLID-Br""","""92882c210108475ba3af23a2a10a4e…","""Muita merda""",True,True
"""OLID-Br""","""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…",True,True
"""OLID-Br""","""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…",True,True
"""OLID-Br""","""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…",False,False
"""OLID-Br""","""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…",True,True


In [24]:
final_dataset = pl.concat([toldbr_final, olidbr_final])
final_dataset.sample(5, seed=534)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""ToLD-Br""","""9322321234464027338""","""pérola tá terrível , ela se ac…",True,False
"""ToLD-Br""","""17106631069393288087""","""USER parabéns pela coragem pq …",True,True
"""ToLD-Br""","""12766216725703474573""","""rt USER porra eu chorei demais…",True,False
"""OLID-Br""","""88aacb3db46a4ff8a70b563cc9e50f…","""USER Evoluiu...antes comia tra…",True,True
"""OLID-Br""","""878d1939c62e4658a6697d20535dcb…","""O presidente é quem manda no p…",True,True


In [None]:
# Write the dataset
os.makedirs(os.path.join(ROOT_PATH, "data/joint"), exist_ok=True)
final_dataset.write_parquet(os.path.join(ROOT_PATH, "data/joint/data.parquet.zstd"), compression="zstd", compression_level=9)

# Dataset Preprocessing

In [38]:
import os
import sys
import re

import polars as pl
from tqdm.notebook import tqdm
from IPython.display import clear_output

import nltk
import spacy
import unidecode

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

## Load Dataset

In [3]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'data.parquet.zstd'))

## Setup Toolkits

In [7]:
nltk.download('punkt')
spacy.cli.download('pt_core_news_sm')
clear_output()

In [30]:
nlp = spacy.load('pt_core_news_sm')

## Normalize

In [39]:
cleanup_re = re.compile(r'[\W\s]')
remove_double_spaces_re = re.compile(r'\s+')

base_clean = []
base_clean_lower = []
tokenized = []
lemmatized = []
no_accents = []
lemma_no_accents = []
no_stop_words = []
lemma_no_stop_words = []
no_stop_words_no_accents = []
lemma_no_stop_words_no_accents = []


# TODO: generalize each pre-processing approach into a separate function
# in a separate file.
for row in tqdm(df.iter_rows(named=True), total=len(df)):
    text: str = row['text']
    # Remove bad characters
    text = cleanup_re.sub(' ', text)
    text = remove_double_spaces_re.sub(' ', text)
    text = text.strip()
    base_clean.append(text)

    # Lowercase
    text = text.lower()
    base_clean_lower.append(text)

    # Tokenize
    # TODO: go deeper into tokenization
    tokens = [token for token in nlp(text)]
    tokenized.append([token.text for token in tokens])

    # Lemmatized
    lemmatized.append([token.lemma_ for token in tokens])

    # No accents
    no_accents.append([unidecode.unidecode(token.text) for token in tokens])
    lemma_no_accents.append([unidecode.unidecode(token.lemma_) for token in tokens])

    # No stop words
    no_stop_words.append([token.text for token in tokens if not token.is_stop])
    lemma_no_stop_words.append([token.lemma_ for token in tokens if not token.is_stop])

    # No stop words, no accents
    no_stop_words_no_accents.append([unidecode.unidecode(token.text) for token in tokens if not token.is_stop])
    lemma_no_stop_words_no_accents.append([unidecode.unidecode(token.lemma_) for token in tokens if not token.is_stop])



df_ext = df.with_columns([
    pl.Series('base_clean', base_clean),
    pl.Series('base_clean_lower', base_clean_lower),
    pl.Series('tokenized', tokenized),
    pl.Series('lemmatized', lemmatized),
    pl.Series('no_accents', no_accents),
    pl.Series('lemma_no_accents', lemma_no_accents),
    pl.Series('no_stop_words', no_stop_words),
    pl.Series('lemma_no_stop_words', lemma_no_stop_words),
    pl.Series('no_stop_words_no_accents', no_stop_words_no_accents),
    pl.Series('lemma_no_stop_words_no_accents', lemma_no_stop_words_no_accents)
])

  0%|          | 0/27952 [00:00<?, ?it/s]

In [41]:
df_ext.write_parquet(os.path.join(ROOT_PATH, "data/joint/pre_processed_data.parquet.zstd"), compression="zstd", compression_level=9)

In [1]:
import os
import sys
import json
import random
from datetime import datetime

import numpy as np
import polars as pl

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader, Dataset

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:

# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.training import train_epochs, model_metrics
from toxicity.bow.training import trainer, validate
from toxicity.bow.model import BoWModel, BoWDataset

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 1.8e-05
POS_WEIGHT = 1.663


OCC_TRESHOLD = 10
OTHER_TOKEN = '[OTHER]'
MAX_LEN = 256

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
def reseed(seed: int = RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reseed()

## Data Loading

In [6]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd'))
df = df.with_columns(
    df['off_relaxed'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
    df['off_strict'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
)
df.sample(5, seed=RANDOM_SEED)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""17643984771725418028""","""caralho q vergonha kkkkk""",[1],[0],"""caralho q vergonha kkkkk""","""caralho q vergonha kkkkk""","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]"
"""ToLD-Br""","""3886050625220892585""","""foda-se, vou encher o cu de po…",[1],[0],"""foda se vou encher o cu de por…","""foda se vou encher o cu de por…","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]"
"""ToLD-Br""","""14936095030342170465""","""USER USER USER Vc só pensa no …",[1],[1],"""USER USER USER Vc só pensa no …","""user user user vc só pensa no …","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]"
"""ToLD-Br""","""18279259074216789411""","""família""",[0],[0],"""família""","""família""","[""família""]","[""família""]","[""familia""]","[""familia""]","[""família""]","[""família""]","[""familia""]","[""familia""]"
"""OLID-Br""","""7f36b160e8624968a32e82b1c6750f…","""RT USER: vey a juliette veio c…",[0],[0],"""RT USER vey a juliette veio co…","""rt user vey a juliette veio co…","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]"


## Setup Vocabulary for BoW

In [7]:
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
MODEL_PATH = os.path.join(ROOT_PATH, 'models', 'bow', TIMESTAMP)
os.makedirs(MODEL_PATH, exist_ok=True)

# Count frequency of tokens
occurences = {}
for tokens in df['lemma_no_stop_words_no_accents']:
    for t in tokens:
        if t not in occurences:
            occurences[t] = 1
        else:
            occurences[t] += 1
            
# Sort by frequency
occurences = dict(sorted(occurences.items(), key=lambda item: item[1], reverse=True))

# Build vocabulary
vocabulary = []
for w, counter in occurences.items():
    if counter > OCC_TRESHOLD:
        vocabulary.append(w)

# Sort vocabulary and add a token for other words
vocabulary = sorted(vocabulary)
vocabulary.append(OTHER_TOKEN)

# Vocabulary lookup table
vocab_index = {}
for i, token in enumerate(vocabulary):
    vocab_index[token] = i

# Write vocabulary to file
with open(os.path.join(MODEL_PATH, 'vocab.txt'), 'w') as f:
    f.write('\n'.join(vocabulary))
with open(os.path.join(MODEL_PATH, 'vocab_index.json'), 'w') as f:
    json.dump(vocab_index, f)
with open(os.path.join(MODEL_PATH, 'vocab.json'), 'w') as f:
    json.dump(vocabulary, f)

print(f'Vocabulary size: {len(vocabulary)}')

Vocabulary size: 2832


## Init Model

### Loss and Optimizer

Using a Binary Cross Entropy loss as it shows good results for binary classification tasks. We are also applying differente weights to the positive and negative classes to account for the class imbalance.

Adam optimizer is also used as it is a good general optimizer for training neural networks.

In [8]:
model = BoWModel(len(vocabulary))
model.to(PYTORCH_DEVICE)

loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE))
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

## Data Split

In [9]:
train_df, test_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)

train_loader = DataLoader(
    BoWDataset(train_df, 'lemma_no_stop_words_no_accents', 'off_relaxed', vocab_index=vocab_index), 
    shuffle=True, num_workers=0, batch_size=TRAIN_BATCH_SIZE,
)
test_loader = DataLoader(
    BoWDataset(test_df, 'lemma_no_stop_words_no_accents', 'off_relaxed', vocab_index=vocab_index), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)

## Training

In [10]:
def validate_result():
    # Validate the results
    raw_results, raw_targets = validate(model, test_loader, PYTORCH_DEVICE)
    raw_results = np.array(raw_results)
    raw_targets = np.array(raw_targets)

    # Apply a fixed threshold to the results
    FIXED_THRESHOLD = 0.5
    fixed_results = raw_results > FIXED_THRESHOLD
    fixed_targets = raw_targets > FIXED_THRESHOLD

    # Compute metrics
    print(f'Weighted F2: {model_metrics(fixed_targets, fixed_results)["weighted_f2"]:.6f}')

### Training Loop

In [11]:
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', 'bow', TIMESTAMP)

def epoch_callback(epoch, avg_loss):
    print('Validation Results:')
    validate_result()

train_epochs(
    trainer, EPOCHS, model, train_loader, loss_fn, optimizer, PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH, epoch_callback=epoch_callback)

Running training epoch 1/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.731525
Finished training epoch 1/8; Average Loss: 0.8197
Running training epoch 2/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.763015
Finished training epoch 2/8; Average Loss: 0.6500
Running training epoch 3/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.775978
Finished training epoch 3/8; Average Loss: 0.5659
Running training epoch 4/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.770893
Finished training epoch 4/8; Average Loss: 0.5175
Running training epoch 5/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.777609
Finished training epoch 5/8; Average Loss: 0.4775
Running training epoch 6/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.766436
Finished training epoch 6/8; Average Loss: 0.4402
Running training epoch 7/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.771409
Finished training epoch 7/8; Average Loss: 0.4017
Running training epoch 8/8


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.766841
Finished training epoch 8/8; Average Loss: 0.3628


### Save model

In [12]:
torch.save(model, f'{MODEL_PATH}/model.pth')

In [1]:
import os
import sys
import json
import random
from datetime import datetime

import numpy as np
import polars as pl

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader, Dataset

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:
# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.training import train_epochs, model_metrics
from toxicity.embeddings.training import trainer, validate
from toxicity.embeddings.model import EmbeddingModel, EmbeddingDataset

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 6
LEARNING_RATE = 1e-05
POS_WEIGHT = 1.663


EMBEDDING_FILE = os.path.join(ROOT_PATH, 'cbow_s100.txt')
EMBEDDING_NAME = 'cbow_s100'
MAX_LEN = 128

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:

def reseed(seed: int = RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reseed()

## Data Loading

In [6]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd'))
df = df.with_columns(
    df['off_relaxed'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
    df['off_strict'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
)
df.sample(5, seed=RANDOM_SEED)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""17643984771725418028""","""caralho q vergonha kkkkk""",[1],[0],"""caralho q vergonha kkkkk""","""caralho q vergonha kkkkk""","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]"
"""ToLD-Br""","""3886050625220892585""","""foda-se, vou encher o cu de po…",[1],[0],"""foda se vou encher o cu de por…","""foda se vou encher o cu de por…","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]"
"""ToLD-Br""","""14936095030342170465""","""USER USER USER Vc só pensa no …",[1],[1],"""USER USER USER Vc só pensa no …","""user user user vc só pensa no …","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]"
"""ToLD-Br""","""18279259074216789411""","""família""",[0],[0],"""família""","""família""","[""família""]","[""família""]","[""familia""]","[""familia""]","[""família""]","[""família""]","[""familia""]","[""familia""]"
"""OLID-Br""","""7f36b160e8624968a32e82b1c6750f…","""RT USER: vey a juliette veio c…",[0],[0],"""RT USER vey a juliette veio co…","""rt user vey a juliette veio co…","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]"


### Load Embeddings

In [7]:
EMBEDDING_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}')
os.makedirs(EMBEDDING_PATH, exist_ok=True)

emb_dim = None
token_count = None
embeddings = {}

if not os.path.exists(f'{EMBEDDING_PATH}/embeddings.parquet.zstd'):
    with open(EMBEDDING_FILE, 'r') as f:
        fl = f.readline()
        token_count, emb_dim = map(int, fl.split(' '))

        while line := f.readline():
            emb = line.split(' ')

            token = emb[0]
            values = [float(v) for v in emb[1:]]

            if emb_dim is None:
                emb_dim = len(values)
            elif emb_dim != len(values):
                raise ValueError('Inconsistent embedding length')

            embeddings[token] = values
    
    print(f'Embedding Length: {emb_dim}')
    print(f'Embedding Vocab Size: {len(embeddings)}; Expected: {token_count}')
    embedding_df = pl.DataFrame({
        'token': list(embeddings.keys()),
        'embedding': list(embeddings.values())
    })
    embedding_df.write_parquet(f'{EMBEDDING_PATH}/embeddings.parquet.zstd', compression="zstd", compression_level=9)
else:
    embedding_df = pl.read_parquet(f'{EMBEDDING_PATH}/embeddings.parquet.zstd')
    # TODO: optimize this below
    embeddings = {row['token']: row['embedding'] for row in embedding_df.to_dicts()}
    emb_dim = len(embeddings[next(iter(embeddings))])
    token_count = len(embeddings)
    print(f'Embedding Length: {emb_dim}')
    print(f'Embedding Vocab Size: {len(embeddings)}; Expected: {token_count}')


Embedding Length: 100
Embedding Vocab Size: 929606; Expected: 929606


## Init Model

### Loss and Optimizer

Using a Binary Cross Entropy loss as it shows good results for binary classification tasks. We are also applying differente weights to the positive and negative classes to account for the class imbalance.

Adam optimizer is also used as it is a good general optimizer for training neural networks.

In [8]:
model = EmbeddingModel(emb_dim, MAX_LEN)
model.to(PYTORCH_DEVICE)

loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE))
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

## Data Split

In [9]:
train_df, test_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)

train_loader = DataLoader(
    EmbeddingDataset(train_df, 'lemma_no_stop_words', 'off_relaxed', embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN), 
    shuffle=True, num_workers=0, batch_size=TRAIN_BATCH_SIZE,
)
test_loader = DataLoader(
    EmbeddingDataset(test_df, 'lemma_no_stop_words', 'off_relaxed', embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)

## Training

In [10]:
def validate_result():
    # Validate the results
    raw_results, raw_targets = validate(model, test_loader, PYTORCH_DEVICE)
    raw_results = np.array(raw_results)
    raw_targets = np.array(raw_targets)

    # Apply a fixed threshold to the results
    FIXED_THRESHOLD = 0.5
    fixed_results = raw_results > FIXED_THRESHOLD
    fixed_targets = raw_targets > FIXED_THRESHOLD

    # Compute metrics
    print(f'Weighted F2: {model_metrics(fixed_targets, fixed_results)["weighted_f2"]:.6f}')

### Training Loop

In [11]:
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
MODEL_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}', TIMESTAMP)
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', f'embeddings-{EMBEDDING_NAME}', TIMESTAMP)
os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

def epoch_callback(epoch, avg_loss):
    print('Validation Results:')
    validate_result()

train_epochs(
    trainer, EPOCHS, model, train_loader, loss_fn, optimizer, PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH, epoch_callback=epoch_callback)

Running training epoch 1/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.590787
Finished training epoch 1/10; Average Loss: 0.8467
Running training epoch 2/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.674804
Finished training epoch 2/10; Average Loss: 0.7655
Running training epoch 3/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.691178
Finished training epoch 3/10; Average Loss: 0.7180
Running training epoch 4/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.675966
Finished training epoch 4/10; Average Loss: 0.6919
Running training epoch 5/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.689189
Finished training epoch 5/10; Average Loss: 0.6701
Running training epoch 6/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.692351
Finished training epoch 6/10; Average Loss: 0.6483
Running training epoch 7/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.679430
Finished training epoch 7/10; Average Loss: 0.6264
Running training epoch 8/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.688136
Finished training epoch 8/10; Average Loss: 0.6017
Running training epoch 9/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.684593
Finished training epoch 9/10; Average Loss: 0.5753
Running training epoch 10/10


  0%|          | 0/1398 [00:00<?, ?it/s]

Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F2: 0.689354
Finished training epoch 10/10; Average Loss: 0.5476


### Save model

In [12]:
torch.save(model, f'{MODEL_PATH}/model.pth')

# Neural Solution - Transformers: BERT

In [1]:
import os
import sys
import random
from datetime import datetime

import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:
# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.transformers.bertimbau_base import bert_tokenizer, BertDatasetBF16, BertModuleBF16
from toxicity.transformers.training import trainer, validate
from toxicity.training import train_epochs, model_metrics

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 11
LEARNING_RATE = 2e-05
POS_WEIGHT = 1.663

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Data Loading

In [6]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd'))
df = df.with_columns(
    df['off_relaxed'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
    df['off_strict'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
)
df.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""5508727285226739644""","""medo de ir pra um rolê de novo…",[0],[0],"""medo de ir pra um rolê de novo…","""medo de ir pra um rolê de novo…","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]"
"""ToLD-Br""","""16827841903506270139""","""https://t.co/2bs6oD330q Ele a…",[0],[0],"""https t co 2bs6oD330q Ele até …","""https t co 2bs6od330q ele até …","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]"
"""ToLD-Br""","""7641628880024884135""","""rt USER bruno fernandes assina…",[0],[0],"""rt USER bruno fernandes assina…","""rt user bruno fernandes assina…","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]"
"""ToLD-Br""","""16866242508514532033""","""tinha que ter jogado esse bran…",[1],[1],"""tinha que ter jogado esse bran…","""tinha que ter jogado esse bran…","[""tinha"", ""que"", … ""trem""]","[""ter"", ""que"", … ""tr""]","[""tinha"", ""que"", … ""trem""]","[""ter"", ""que"", … ""tr""]","[""tinha"", ""jogado"", … ""trem""]","[""ter"", ""jogar"", … ""tr""]","[""tinha"", ""jogado"", … ""trem""]","[""ter"", ""jogar"", … ""tr""]"
"""ToLD-Br""","""3068271252403811869""","""eu sou a pessoa certa no bairr…",[0],[0],"""eu sou a pessoa certa no bairr…","""eu sou a pessoa certa no bairr…","[""eu"", ""sou"", … ""errado""]","[""eu"", ""ser"", … ""errar""]","[""eu"", ""sou"", … ""errado""]","[""eu"", ""ser"", … ""errar""]","[""pessoa"", ""certa"", … ""errado""]","[""pessoa"", ""certo"", … ""errar""]","[""pessoa"", ""certa"", … ""errado""]","[""pessoa"", ""certo"", … ""errar""]"


In [7]:
train_df, test_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)
display(train_df.head(5))
display(test_df.head(5))

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""OLID-Br""","""3d85473d1c4b4f86a78159f23d7746…","""USER merda, ridículo essa impo…",[1],[1],"""USER merda ridículo essa impos…","""user merda ridículo essa impos…","[""user"", ""merda"", … ""crianças""]","[""user"", ""merda"", … ""criança""]","[""user"", ""merda"", … ""criancas""]","[""user"", ""merda"", … ""crianca""]","[""user"", ""merda"", … ""crianças""]","[""user"", ""merda"", … ""criança""]","[""user"", ""merda"", … ""criancas""]","[""user"", ""merda"", … ""crianca""]"
"""OLID-Br""","""b344c5518f0d44688ed45cc4a3183e…","""USER espero que eles sejam mor…",[1],[1],"""USER espero que eles sejam mor…","""user espero que eles sejam mor…","[""user"", ""espero"", … ""novamente""]","[""user"", ""esperar"", … ""novamente""]","[""user"", ""espero"", … ""novamente""]","[""user"", ""esperar"", … ""novamente""]","[""user"", ""espero"", … ""novamente""]","[""user"", ""esperar"", … ""novamente""]","[""user"", ""espero"", … ""novamente""]","[""user"", ""esperar"", … ""novamente""]"
"""ToLD-Br""","""4335543317461660187""","""eu tenho essas paran贸ias de ac…",[0],[0],"""eu tenho essas paran贸ias de ac…","""eu tenho essas paran贸ias de ac…","[""eu"", ""tenho"", … ""vivo""]","[""eu"", ""ter"", … ""vivo""]","[""eu"", ""tenho"", … ""vivo""]","[""eu"", ""ter"", … ""vivo""]","[""paran贸ias"", ""achar"", … ""vivo""]","[""paran贸ia"", ""achar"", … ""vivo""]","[""paranMao ias"", ""achar"", … ""vivo""]","[""paranMao ia"", ""achar"", … ""vivo""]"
"""OLID-Br""","""7ada9be164434f0e925f50616b637c…","""USER USER é USER USER""",[0],[0],"""USER USER é USER USER""","""user user é user user""","[""user"", ""user"", … ""user""]","[""user"", ""user"", … ""user""]","[""user"", ""user"", … ""user""]","[""user"", ""user"", … ""user""]","[""user"", ""user"", … ""user""]","[""user"", ""user"", … ""user""]","[""user"", ""user"", … ""user""]","[""user"", ""user"", … ""user""]"
"""ToLD-Br""","""16784738693255454158""","""meu pai me deu esse perfume eu…",[1],[0],"""meu pai me deu esse perfume eu…","""meu pai me deu esse perfume eu…","[""meu"", ""pai"", … ""aasacy52xy""]","[""meu"", ""pai"", … ""aasacy52xyr""]","[""meu"", ""pai"", … ""aasacy52xy""]","[""meu"", ""pai"", … ""aasacy52xyr""]","[""pai"", ""deu"", … ""aasacy52xy""]","[""pai"", ""dar"", … ""aasacy52xyr""]","[""pai"", ""deu"", … ""aasacy52xy""]","[""pai"", ""dar"", … ""aasacy52xyr""]"


dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""10657414299548058873""","""rt USER mano tá tudo me irrita…",[1],[0],"""rt USER mano tá tudo me irrita…","""rt user mano tá tudo me irrita…","[""rt"", ""user"", … ""pqp""]","[""rt"", ""user"", … ""pqp""]","[""rt"", ""user"", … ""pqp""]","[""rt"", ""user"", … ""pqp""]","[""rt"", ""user"", … ""pqp""]","[""rt"", ""user"", … ""pqp""]","[""rt"", ""user"", … ""pqp""]","[""rt"", ""user"", … ""pqp""]"
"""ToLD-Br""","""11088205621966361413""","""USER horrível!""",[1],[0],"""USER horrível""","""user horrível""","[""user"", ""horrível""]","[""user"", ""horrível""]","[""user"", ""horrivel""]","[""user"", ""horrivel""]","[""user"", ""horrível""]","[""user"", ""horrível""]","[""user"", ""horrivel""]","[""user"", ""horrivel""]"
"""ToLD-Br""","""11546370057009176494""","""gnt como pode falar q esse hom…",[0],[0],"""gnt como pode falar q esse hom…","""gnt como pode falar q esse hom…","[""gnt"", ""como"", … ""jesus""]","[""gnt"", ""como"", … ""jesus""]","[""gnt"", ""como"", … ""jesus""]","[""gnt"", ""como"", … ""jesus""]","[""gnt"", ""falar"", … ""jesus""]","[""gnt"", ""falar"", … ""jesus""]","[""gnt"", ""falar"", … ""jesus""]","[""gnt"", ""falar"", … ""jesus""]"
"""ToLD-Br""","""9450469262872738701""","""Que foda o USER PUTA QUE PARIU…",[0],[0],"""Que foda o USER PUTA QUE PARIU…","""que foda o user puta que pariu…","[""que"", ""foda"", … ""3tv8fum5v8""]","[""que"", ""fodar"", … ""3tv8fum5v8""]","[""que"", ""foda"", … ""3tv8fum5v8""]","[""que"", ""fodar"", … ""3tv8fum5v8""]","[""foda"", ""user"", … ""3tv8fum5v8""]","[""fodar"", ""user"", … ""3tv8fum5v8""]","[""foda"", ""user"", … ""3tv8fum5v8""]","[""fodar"", ""user"", … ""3tv8fum5v8""]"
"""ToLD-Br""","""16835911729407698751""","""sapatão é foda, não pode beber…",[1],[1],"""sapatão é foda não pode beber …","""sapatão é foda não pode beber …","[""sapatão"", ""é"", … ""hein""]","[""sapatão"", ""ser"", … ""hein""]","[""sapatao"", ""e"", … ""hein""]","[""sapatao"", ""ser"", … ""hein""]","[""sapatão"", ""foda"", … ""hein""]","[""sapatão"", ""foda"", … ""hein""]","[""sapatao"", ""foda"", … ""hein""]","[""sapatao"", ""foda"", … ""hein""]"


## Setup Model

In [8]:
tokenizer = bert_tokenizer()

model = BertModuleBF16(feature_count=1)
model.to(PYTORCH_DEVICE)

train_loader = DataLoader(BertDatasetBF16(data_frame=train_df, tokenizer=tokenizer, max_len=MAX_LEN, target_col='off_relaxed'), shuffle=True,
                          num_workers=0, batch_size=TRAIN_BATCH_SIZE)
test_loader = DataLoader(BertDatasetBF16(data_frame=test_df, tokenizer=tokenizer, max_len=MAX_LEN, target_col='off_relaxed'), shuffle=True,
                         num_workers=0, batch_size=TEST_BATCH_SIZE)

### Loss and Optimizer

Using a Binary Cross Entropy loss as it shows good results for binary classification tasks. We are also applying differente weights to the positive and negative classes to account for the class imbalance.

Adam optimizer is also used as it is a good general optimizer for training neural networks, with good known results for BERT models.

In [9]:
loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE))
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

### Training

### Result Validation

In [10]:
def validate_result():
    # Validate the results
    raw_results, raw_targets = validate(model, test_loader, PYTORCH_DEVICE)
    raw_results = np.array(raw_results)
    raw_targets = np.array(raw_targets)

    # Apply a fixed threshold to the results
    FIXED_THRESHOLD = 0.75
    fixed_results = raw_results > FIXED_THRESHOLD
    fixed_targets = raw_targets > FIXED_THRESHOLD

    # Compute metrics
    model_metrics(fixed_targets, fixed_results, print_metrics=True)

### Train Model

In [11]:
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', 'bertimbau-bf16', TIMESTAMP)

def epoch_callback(epoch, avg_loss):
    print(f'Epoch: {epoch}, Avg Loss: {avg_loss}')
    print('Validation Results:')
    validate_result()

train_epochs(
    trainer, EPOCHS, model, train_loader, loss_function, optimizer, 
    PYTORCH_DEVICE, checkpoint_path=CHECKPOINT_PATH, 
    epoch_callback=epoch_callback)

Running training epoch 1/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 0, Avg Loss: 0.6648846845940629
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.7808855119484294
Macro F1: 0.7597306124453612
Weighted F2: 0.7834963503327017
Macro F2: 0.7526229414239014
Accuracy: 0.7871579323913432
Recall: 0.7871579323913432
Precision: 0.7850872356435311
Finished training epoch 1/11; Average Loss: 0.6649
Running training epoch 2/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 1, Avg Loss: 0.5711151254023605
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.7937684863091143
Macro F1: 0.7751216966773498
Weighted F2: 0.7955876195593297
Macro F2: 0.7695488841877091
Accuracy: 0.7978894652119478
Recall: 0.7978894652119478
Precision: 0.7954273214436385
Finished training epoch 2/11; Average Loss: 0.5711
Running training epoch 3/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 2, Avg Loss: 0.5388885629023605
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.8035494299634817
Macro F1: 0.7877716987792134
Weighted F2: 0.8043243726796876
Macro F2: 0.785334601558002
Accuracy: 0.8050438204256841
Recall: 0.8050438204256841
Precision: 0.8030916291757776
Finished training epoch 3/11; Average Loss: 0.5389
Running training epoch 4/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 3, Avg Loss: 0.5223338027539342
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.7978701809688339
Macro F1: 0.7793924325274655
Weighted F2: 0.7997239709077235
Macro F2: 0.7733896876054954
Accuracy: 0.8021820783401896
Recall: 0.8021820783401896
Precision: 0.8000743302556642
Finished training epoch 4/11; Average Loss: 0.5223
Running training epoch 5/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 4, Avg Loss: 0.5112863521548641
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.7989394982844101
Macro F1: 0.7808048168507239
Weighted F2: 0.8006718151208367
Macro F2: 0.7751777658658172
Accuracy: 0.8028975138615633
Recall: 0.8028975138615633
Precision: 0.8006606224818901
Finished training epoch 5/11; Average Loss: 0.5113
Running training epoch 6/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 5, Avg Loss: 0.5026893888590844
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.8058394660843908
Macro F1: 0.7892801007577063
Weighted F2: 0.807064553631426
Macro F2: 0.7851525219307643
Accuracy: 0.8084421391522089
Recall: 0.8084421391522089
Precision: 0.8061542468322681
Finished training epoch 6/11; Average Loss: 0.5027
Running training epoch 7/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 6, Avg Loss: 0.49746918030221743
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.8079362159574037
Macro F1: 0.7914923914520624
Weighted F2: 0.8091720209086368
Macro F2: 0.7872174363893965
Accuracy: 0.8105884457163298
Recall: 0.8105884457163298
Precision: 0.8083704010976803
Finished training epoch 7/11; Average Loss: 0.4975
Running training epoch 8/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 7, Avg Loss: 0.49093152047567956
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.8068744909234049
Macro F1: 0.790360981046454
Weighted F2: 0.8081100351117133
Macro F2: 0.7861415810455751
Accuracy: 0.8095152924342693
Recall: 0.8095152924342693
Precision: 0.8072614561502442
Finished training epoch 8/11; Average Loss: 0.4909
Running training epoch 9/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 8, Avg Loss: 0.4872376274141631
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.8057437224863073
Macro F1: 0.7888129701686672
Weighted F2: 0.807133478403719
Macro F2: 0.7840544528536182
Accuracy: 0.8087998569128957
Recall: 0.8087998569128957
Precision: 0.8065641612004606
Finished training epoch 9/11; Average Loss: 0.4872
Running training epoch 10/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 9, Avg Loss: 0.48629634131348354
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.8025186216461931
Macro F1: 0.7849253365667284
Weighted F2: 0.804110351845196
Macro F2: 0.7795831030722657
Accuracy: 0.8061169737077446
Recall: 0.8061169737077446
Precision: 0.803912273820282
Finished training epoch 10/11; Average Loss: 0.4863
Running training epoch 11/11


  0%|          | 0/1398 [00:00<?, ?it/s]

Epoch: 10, Avg Loss: 0.4798100243651645
Validation Results:


  0%|          | 0/350 [00:00<?, ?it/s]

Weighted F1: 0.8064705934966486
Macro F1: 0.78960319291827
Weighted F2: 0.8078529937995174
Macro F2: 0.7848304737345566
Accuracy: 0.8095152924342693
Recall: 0.8095152924342693
Precision: 0.8073069045478859
Finished training epoch 11/11; Average Loss: 0.4798


### Save the model and tokenizer

In [12]:
target_dir = os.path.join(ROOT_PATH, f'models/trained-bertimbau-bf16/{TIMESTAMP}')

os.makedirs(target_dir, exist_ok=True)

tokenizer.save_vocabulary(target_dir)
torch.save(model, f'{target_dir}/model.pth')

In [1]:
import re
import os
import sys
import json
import random

from IPython.display import clear_output

import numpy as np
import polars as pl

import spacy
import unidecode

import torch
from torch import cuda

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:

# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.bow.model import BoWModel, encode_tokens, OTHER_TOKEN

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
def reseed(seed: int = RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reseed()

## Load Model & Vocabulary for BoW

In [6]:
TIMESTAMP = "20240616235959"
MODEL_PATH = os.path.join(ROOT_PATH, 'models', 'bow', TIMESTAMP)

with open(os.path.join(MODEL_PATH, 'vocab_index.json'), 'r') as f:
    vocab_index = json.load(f)


model = torch.load(os.path.join(MODEL_PATH, 'model.pth'), map_location=PYTORCH_DEVICE)

### Setup Normalizer & Tokenizer

In [7]:
spacy.cli.download('pt_core_news_sm')
clear_output()
nlp = spacy.load('pt_core_news_sm')

In [8]:

def tokenize_text(text: str):
    cleanup_re = re.compile(r'[\W\s]')
    remove_double_spaces_re = re.compile(r'\s+')
    # Remove bad characters
    text = cleanup_re.sub(' ', text)
    text = remove_double_spaces_re.sub(' ', text)
    text = text.strip()

    # Lowercase
    text = text.lower()

    # Tokenize
    tokens = [token for token in nlp(text)]

    # No stop words, no accents, lemma
    return [unidecode.unidecode(token.lemma_) for token in tokens if not token.is_stop]

## Run Model

In [9]:
FIXED_THRESHOLD = 0.75

def run_text(text: str) -> bool:
    tokens = tokenize_text(text)
    encoded = encode_tokens(tokens, vocab_index)

    model.eval()
    outs = model(torch.tensor(encoded, dtype=torch.float32).unsqueeze(0).to(PYTORCH_DEVICE))

    result = torch.sigmoid(outs).to(torch.float32).cpu().detach().numpy().flatten()

    return result[0] > FIXED_THRESHOLD

In [10]:
run_text('salve galera, tudo bem?')

False

In [11]:
run_text('vai se fuder')

True

In [12]:
run_text('vocês são bem bobo')

False

In [1]:
import re
import os
import sys
import json
import random

from IPython.display import clear_output

import polars as pl
import numpy as np

import spacy

import torch
from torch import cuda

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:

# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.embeddings.model import EmbeddingModel, EmbeddingDataset, encode_tokens 

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777



print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
def reseed(seed: int = RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reseed()

## Load Model & Vocabulary for BoW

In [6]:
TIMESTAMP = "20240617021617"
EMBEDDING_NAME = 'cbow_s100'
MODEL_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}', TIMESTAMP)
MAX_LEN = 128

model = torch.load(os.path.join(MODEL_PATH, 'model.pth'), map_location=PYTORCH_DEVICE)

In [7]:

EMBEDDING_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}')

embedding_df = pl.read_parquet(f'{EMBEDDING_PATH}/embeddings.parquet.zstd')
embeddings = {row['token']: row['embedding'] for row in embedding_df.to_dicts()}
emb_dim = len(embeddings[next(iter(embeddings))])
token_count = len(embeddings)
print(f'Embedding Length: {emb_dim}')
print(f'Embedding Vocab Size: {len(embeddings)}; Expected: {token_count}')


Embedding Length: 100
Embedding Vocab Size: 929606; Expected: 929606


### Setup Normalizer & Tokenizer

In [8]:
spacy.cli.download('pt_core_news_sm')
clear_output()
nlp = spacy.load('pt_core_news_sm')

In [9]:

def tokenize_text(text: str):
    cleanup_re = re.compile(r'[\W\s]')
    remove_double_spaces_re = re.compile(r'\s+')
    # Remove bad characters
    text = cleanup_re.sub(' ', text)
    text = remove_double_spaces_re.sub(' ', text)
    text = text.strip()

    # Lowercase
    text = text.lower()

    # Tokenize
    tokens = [token for token in nlp(text)]

    # No stop words
    return [token.lemma_ for token in tokens if not token.is_stop]

## Run Model

In [18]:
FIXED_THRESHOLD = 0.5

def run_text(text: str) -> bool:
    tokens = tokenize_text(text)
    encoded = encode_tokens(tokens, embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN)

    model.eval()
    outs = model(torch.tensor(encoded, dtype=torch.float32).unsqueeze(0).to(PYTORCH_DEVICE))

    result = torch.sigmoid(outs).to(torch.float32).cpu().detach().numpy().flatten()

    return result[0] > FIXED_THRESHOLD

In [19]:
run_text('salve galera, tudo bem?')

False

In [27]:
run_text('vocês são uns lixos')

False

In [23]:
run_text('vocês são bem bobo')

True

In [1]:
import os
import sys
import random

import numpy as np


import torch
from torch import cuda

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:

# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.transformers.bertimbau_base import bert_tokenizer, BertModuleBF16

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777



print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
def reseed(seed: int = RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reseed()

## Load Model

In [6]:
TIMESTAMP = "20240617022148"
MAX_LEN = 256
MODEL_PATH = os.path.join(ROOT_PATH, f'models/trained-bertimbau-bf16/{TIMESTAMP}/model.pth')

model = torch.load(MODEL_PATH, map_location=PYTORCH_DEVICE)

## Run Model

In [7]:
FIXED_THRESHOLD = 0.75

tokenizer = bert_tokenizer()
def run_text(text: str) -> bool:
    tokens = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding="max_length",
        return_token_type_ids=False,
        truncation=True,
    )

    model.eval()
    outs = model(
        torch.tensor(tokens['input_ids'], dtype=torch.int64).unsqueeze(0).to(PYTORCH_DEVICE),
        torch.tensor(tokens['attention_mask'], dtype=torch.int64).unsqueeze(0).to(PYTORCH_DEVICE),
    )

    result = torch.sigmoid(outs).to(torch.float32).cpu().detach().numpy().flatten()

    return result[0] > FIXED_THRESHOLD

In [8]:
run_text('salve galera, tudo bem?')

False

In [9]:
run_text('vocês são uns lixos')

True

In [10]:
run_text('vocês são bem bobo')

True