# AVISO!

Recomendamos muito consultar a versão no github, onde os arquivos estão 
organizados corretamente. Além disso, alguns componentes de código encontram-se
em arquivos `.py`.

Link: <https://github.com/DanielHLelis/Neural-Nets-TP2>

## Authors

- D. H. Lelis - 12543822
- Samuel Veronez - 12542626
- Fernando Campos - 12542352

## Mudanças da Re-entrega

- Split entre teste e validação para o early stopping
- POS_WEIGHT baseado apenas nos dados de teste

# Exploring the Datasets

In [1]:
import os

import polars as pl
import plotly.express as px

from IPython.display import display

## Compress original data

In [2]:
ROOT_PATH = '../'

try:
    _olid_train = pl.read_csv(os.path.join(ROOT_PATH, "data/olid-br/train.csv"))
    _olid_train.write_parquet(os.path.join(ROOT_PATH, "data/olid-br/train.parquet.zstd"), compression="zstd", compression_level=9)

    _olid_test = pl.read_csv(os.path.join(ROOT_PATH, "data/olid-br/test.csv"))
    _olid_test.write_parquet(os.path.join(ROOT_PATH, "data/olid-br/test.parquet.zstd"), compression="zstd", compression_level=9)

    _told = pl.read_csv(os.path.join(ROOT_PATH, "data/told-br/told-br.csv"))
    _told.write_parquet(os.path.join(ROOT_PATH, "data/told-br/told-br.parquet.zstd"), compression="zstd", compression_level=9)
except:
    pass

## OLID-BR

### OLID-BR: Data Loading

In [3]:
olidbr_train = pl.read_parquet(os.path.join(ROOT_PATH, "data/olid-br/train.parquet.zstd"))
olidbr_test = pl.read_parquet(os.path.join(ROOT_PATH, "data/olid-br/test.parquet.zstd"))

In [4]:
# Join the train and test datasets
olidbr = pl.concat([olidbr_train, olidbr_test])

# Display 5 random samples
olidbr.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False


### OLID-BR: Data Distribution

In [5]:
print("OLID-BR Train")
print(olidbr_train['is_offensive'].value_counts())

print("OLID-BR Test")
print(olidbr_test['is_offensive'].value_counts())

print("OLID-BR Full")
print(olidbr['is_offensive'].value_counts())


OLID-BR Train
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ OFF          ┆ 4452  │
│ NOT          ┆ 762   │
└──────────────┴───────┘
OLID-BR Test
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ OFF          ┆ 1484  │
│ NOT          ┆ 254   │
└──────────────┴───────┘
OLID-BR Full
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ NOT          ┆ 1016  │
│ OFF          ┆ 5936  │
└──────────────┴───────┘


In [6]:
px.histogram(
    olidbr, 
    x='is_offensive', 
    color='is_offensive', 
    title='OLID-BR Offensive Distribution', 
    template='plotly_dark',
).show()

### Reclassify Offensiveness

In [7]:
# Recompiling the dataset with the sum of all offensive categories
olidbr_recompiled = olidbr.with_columns([
    pl.sum_horizontal(
        pl.col("profanity_obscene"), 
        pl.col("health"), 
        pl.col("ideology"), 
        pl.col("insult"), 
        pl.col("lgbtqphobia"), 
        pl.col("other_lifestyle"), 
        pl.col("physical_aspects"), 
        pl.col("racism"), 
        pl.col("religious_intolerance"), 
        pl.col("sexism"), 
        pl.col("xenophobia"),
    ).alias("offensive"),
    pl.sum_horizontal(
        pl.col("health"), 
        pl.col("ideology"), 
        pl.col("insult"), 
        pl.col("lgbtqphobia"), 
        pl.col("other_lifestyle"), 
        pl.col("physical_aspects"), 
        pl.col("racism"), 
        pl.col("religious_intolerance"), 
        pl.col("sexism"), 
        pl.col("xenophobia"),
    ).alias("offensive_allow_profane")
])

olidbr_recompiled = olidbr_recompiled.with_columns(
    [
        pl.when(pl.col("offensive") > 0).then(True).otherwise(False).alias("offensive_discrete"),
        pl.when(pl.col("offensive_allow_profane") > 0).then(True).otherwise(False).alias("offensive_allow_profane_discrete"),
    ]
)

In [8]:
olidbr_recompiled.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia,offensive,offensive_allow_profane,offensive_discrete,offensive_allow_profane_discrete
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,u32,u32,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False,0,0,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True


In [9]:
print("OLID-BR Recompiled")
print(olidbr_recompiled['offensive_discrete'].value_counts())

print("OLID-BR Recompiled (Allow Profane)")
print(olidbr_recompiled['offensive_allow_profane_discrete'].value_counts())

OLID-BR Recompiled
shape: (2, 2)
┌────────────────────┬───────┐
│ offensive_discrete ┆ count │
│ ---                ┆ ---   │
│ bool               ┆ u32   │
╞════════════════════╪═══════╡
│ false              ┆ 1027  │
│ true               ┆ 5925  │
└────────────────────┴───────┘
OLID-BR Recompiled (Allow Profane)
shape: (2, 2)
┌─────────────────────────────────┬───────┐
│ offensive_allow_profane_discre… ┆ count │
│ ---                             ┆ ---   │
│ bool                            ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ true                            ┆ 5680  │
│ false                           ┆ 1272  │
└─────────────────────────────────┴───────┘


#### An overview of the balance of the dataset

As we can see, the dataset is very unbalanced towards offensive comments. 
Because of such, we might merge it with other datasets for a better balance.

## ToLD-Br

### ToLD-Br: Data Loading

FYI, the data is classified by offense type, with the values from 0 to 3,
indicating the number of times the comment was classified as offensive.


In [10]:
toldbr = pl.read_parquet(os.path.join(ROOT_PATH, "data/told-br/told-br.parquet.zstd"))

In [11]:
toldbr.sample(5, seed=42)

text,homophobia,obscene,insult,racism,misogyny,xenophobia
str,f64,f64,f64,f64,f64,f64
"""8 demais porra""",0.0,2.0,0.0,0.0,0.0,0.0
"""rt @user @user felizmente os s…",0.0,0.0,0.0,0.0,0.0,0.0
"""caralho tandara #voleinaglobo …",0.0,1.0,0.0,0.0,0.0,0.0
"""vamo c calma mo nosso filho v…",0.0,0.0,0.0,0.0,0.0,0.0
"""que puta sentimento lixo. quer…",0.0,0.0,0.0,0.0,0.0,0.0


### ToLD-Br: Data Distribution

In [12]:
px.histogram(
    toldbr,
    x=['homophobia', 'obscene', 'insult', 'racism', 'misogyny', 'xenophobia'],
    title='ToLD-Br Category Classification Distribution', 
    template='plotly_dark',
).show()

In [13]:
toldbr_off = toldbr.with_columns([
    pl.sum_horizontal(
        pl.col('homophobia'), 
        pl.col('obscene'), 
        pl.col('insult'), 
        pl.col('racism'), 
        pl.col('misogyny'), 
        pl.col('xenophobia'),
    ).alias('offensive'),
])

In [14]:
toldbr_off.sample(5, seed=42)

text,homophobia,obscene,insult,racism,misogyny,xenophobia,offensive
str,f64,f64,f64,f64,f64,f64,f64
"""8 demais porra""",0.0,2.0,0.0,0.0,0.0,0.0,2.0
"""rt @user @user felizmente os s…",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""caralho tandara #voleinaglobo …",0.0,1.0,0.0,0.0,0.0,0.0,1.0
"""vamo c calma mo nosso filho v…",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""que puta sentimento lixo. quer…",0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
px.histogram(
    toldbr_off,
    x='offensive',
    color='offensive',
    title='ToLD-Br Offensive Distribution', 
    template='plotly_dark',
).show()

In [16]:
toldbr_class_threshold = 1

toldbr_discrete = toldbr_off.with_columns([
    pl.when(pl.col('offensive') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_offensive_low'),
    pl.when(pl.col('homophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_homophobia_low'),
    pl.when(pl.col('obscene') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_obscene_low'),
    pl.when(pl.col('insult') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_insult_low'),
    pl.when(pl.col('racism') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_racism_low'),
    pl.when(pl.col('misogyny') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_misogyny_low'),
    pl.when(pl.col('xenophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_xenophobia_low'),
])

toldbr_class_threshold = 2
toldbr_discrete = toldbr_discrete.with_columns([
    pl.when(pl.col('offensive') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_offensive_high'),
    pl.when(pl.col('homophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_homophobia_high'),
    pl.when(pl.col('obscene') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_obscene_high'),
    pl.when(pl.col('insult') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_insult_high'),
    pl.when(pl.col('racism') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_racism_high'),
    pl.when(pl.col('misogyny') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_misogyny_high'),
    pl.when(pl.col('xenophobia') >= toldbr_class_threshold).
        then(True).
        otherwise(False).
        alias('is_xenophobia_high'),
]).select([
    'text', 'is_offensive_low', 
    'is_homophobia_low', 'is_obscene_low', 'is_insult_low', 
    'is_racism_low', 'is_misogyny_low', 'is_xenophobia_low',
    'is_offensive_high', 'is_homophobia_high', 'is_obscene_high', 
    'is_insult_high', 'is_racism_high', 'is_misogyny_high', 'is_xenophobia_high',
])


In [17]:
toldbr_discrete.sample(5, seed=42)

text,is_offensive_low,is_homophobia_low,is_obscene_low,is_insult_low,is_racism_low,is_misogyny_low,is_xenophobia_low,is_offensive_high,is_homophobia_high,is_obscene_high,is_insult_high,is_racism_high,is_misogyny_high,is_xenophobia_high
str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""8 demais porra""",True,False,True,False,False,False,False,True,False,True,False,False,False,False
"""rt @user @user felizmente os s…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""caralho tandara #voleinaglobo …",True,False,True,False,False,False,False,False,False,False,False,False,False,False
"""vamo c calma mo nosso filho v…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""que puta sentimento lixo. quer…",False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
print("ToLD-BR (At least 1 flag)")
print(toldbr_discrete['is_offensive_low'].value_counts())

print("ToLD-BR (At least 2 flags)")
print(toldbr_discrete['is_offensive_high'].value_counts())

ToLD-BR (At least 1 flag)
shape: (2, 2)
┌──────────────────┬───────┐
│ is_offensive_low ┆ count │
│ ---              ┆ ---   │
│ bool             ┆ u32   │
╞══════════════════╪═══════╡
│ true             ┆ 9255  │
│ false            ┆ 11745 │
└──────────────────┴───────┘
ToLD-BR (At least 2 flags)
shape: (2, 2)
┌───────────────────┬───────┐
│ is_offensive_high ┆ count │
│ ---               ┆ ---   │
│ bool              ┆ u32   │
╞═══════════════════╪═══════╡
│ true              ┆ 4816  │
│ false             ┆ 16184 │
└───────────────────┴───────┘


In [19]:
px.histogram(
    toldbr_discrete,
    x='is_offensive_low',
    color='is_offensive_low',
    title='ToLD-Br Offensive Distribution (Discrete Low)', 
    template='plotly_dark',
).show()

px.histogram(
    toldbr_discrete,
    x='is_offensive_high',
    color='is_offensive_high',
    title='ToLD-Br Offensive Distribution (Discrete High)', 
    template='plotly_dark',
).show()

#### An overview of the balance of the dataset

This dataset is way more balanced, when compared to OLID-BR. But, there is a
slight issue with the criteria, it seems. There aren't filtering for profanity
and it seems like one of the labelers considered profanit as obscene, while
the others, not.

When going for a stricter filter, we might use the "Low" variant, while for a
more relaxed one, we might use the "High" variant.

## Simplifying the datasets

In [20]:
olidbr_recompiled.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia,offensive,offensive_allow_profane,offensive_discrete,offensive_allow_profane_discrete
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,u32,u32,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False,0,0,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True


In [21]:
toldbr_discrete.sample(5, seed=42)

text,is_offensive_low,is_homophobia_low,is_obscene_low,is_insult_low,is_racism_low,is_misogyny_low,is_xenophobia_low,is_offensive_high,is_homophobia_high,is_obscene_high,is_insult_high,is_racism_high,is_misogyny_high,is_xenophobia_high
str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""8 demais porra""",True,False,True,False,False,False,False,True,False,True,False,False,False,False
"""rt @user @user felizmente os s…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""caralho tandara #voleinaglobo …",True,False,True,False,False,False,False,False,False,False,False,False,False,False
"""vamo c calma mo nosso filho v…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""que puta sentimento lixo. quer…",False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [22]:
toldbr_final = toldbr_discrete.select([
    pl.lit('ToLD-Br').alias('dataset'),
    pl.col('text').hash().alias('id').cast(pl.String),
    pl.col('text').str.replace_all('@user', 'USER'), # Replace @user with USER, to match OLID-BR
    pl.col('is_offensive_low').alias('off_strict'),
    pl.col('is_offensive_high').alias('off_relaxed'),
])
toldbr_final.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""ToLD-Br""","""12559603335008373494""","""8 demais porra""",True,True
"""ToLD-Br""","""1455693542618679752""","""rt USER USER felizmente os sol…",False,False
"""ToLD-Br""","""17500462964768468320""","""caralho tandara #voleinaglobo …",True,False
"""ToLD-Br""","""2818557597701017591""","""vamo c calma mo nosso filho v…",False,False
"""ToLD-Br""","""14538197625843834331""","""que puta sentimento lixo. quer…",False,False


In [23]:
olidbr_final = olidbr_recompiled.select([
    pl.lit('OLID-Br').alias('dataset'),
    pl.col('id').cast(pl.String),
    'text',
    pl.col('offensive_discrete').alias('off_strict'),
    pl.col('offensive_allow_profane_discrete').alias('off_relaxed'),
])
olidbr_final.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""OLID-Br""","""92882c210108475ba3af23a2a10a4e…","""Muita merda""",True,True
"""OLID-Br""","""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…",True,True
"""OLID-Br""","""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…",True,True
"""OLID-Br""","""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…",False,False
"""OLID-Br""","""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…",True,True


In [24]:
final_dataset = pl.concat([toldbr_final, olidbr_final])
final_dataset.sample(5, seed=534)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""ToLD-Br""","""9322321234464027338""","""pérola tá terrível , ela se ac…",True,False
"""ToLD-Br""","""17106631069393288087""","""USER parabéns pela coragem pq …",True,True
"""ToLD-Br""","""12766216725703474573""","""rt USER porra eu chorei demais…",True,False
"""OLID-Br""","""88aacb3db46a4ff8a70b563cc9e50f…","""USER Evoluiu...antes comia tra…",True,True
"""OLID-Br""","""878d1939c62e4658a6697d20535dcb…","""O presidente é quem manda no p…",True,True


In [25]:
# Write the dataset
os.makedirs(os.path.join(ROOT_PATH, "data/joint"), exist_ok=True)
final_dataset.write_parquet(os.path.join(ROOT_PATH, "data/joint/data.parquet.zstd"), compression="zstd", compression_level=9)

In [26]:
px.histogram(
    final_dataset,
    x='off_strict',
    color='off_strict',
    title='Final Offensive Distribution (Strict)', 
    template='plotly_dark',
).show()

In [27]:
px.histogram(
    final_dataset,
    x='off_relaxed',
    color='off_relaxed',
    title='Final Offensive Distribution (Relaxed)', 
    template='plotly_dark',
).show()

# Dataset Preprocessing

In [38]:
import os
import sys
import re

import polars as pl
from tqdm.notebook import tqdm
from IPython.display import clear_output

import nltk
import spacy
import unidecode

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

## Load Dataset

In [3]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'data.parquet.zstd'))

## Setup Toolkits

In [7]:
nltk.download('punkt')
spacy.cli.download('pt_core_news_sm')
clear_output()

In [30]:
nlp = spacy.load('pt_core_news_sm')

## Normalize

In [39]:
cleanup_re = re.compile(r'[\W\s]')
remove_double_spaces_re = re.compile(r'\s+')

base_clean = []
base_clean_lower = []
tokenized = []
lemmatized = []
no_accents = []
lemma_no_accents = []
no_stop_words = []
lemma_no_stop_words = []
no_stop_words_no_accents = []
lemma_no_stop_words_no_accents = []


# TODO: generalize each pre-processing approach into a separate function
# in a separate file.
for row in tqdm(df.iter_rows(named=True), total=len(df)):
    text: str = row['text']
    # Remove bad characters
    text = cleanup_re.sub(' ', text)
    text = remove_double_spaces_re.sub(' ', text)
    text = text.strip()
    base_clean.append(text)

    # Lowercase
    text = text.lower()
    base_clean_lower.append(text)

    # Tokenize
    # TODO: go deeper into tokenization
    tokens = [token for token in nlp(text)]
    tokenized.append([token.text for token in tokens])

    # Lemmatized
    lemmatized.append([token.lemma_ for token in tokens])

    # No accents
    no_accents.append([unidecode.unidecode(token.text) for token in tokens])
    lemma_no_accents.append([unidecode.unidecode(token.lemma_) for token in tokens])

    # No stop words
    no_stop_words.append([token.text for token in tokens if not token.is_stop])
    lemma_no_stop_words.append([token.lemma_ for token in tokens if not token.is_stop])

    # No stop words, no accents
    no_stop_words_no_accents.append([unidecode.unidecode(token.text) for token in tokens if not token.is_stop])
    lemma_no_stop_words_no_accents.append([unidecode.unidecode(token.lemma_) for token in tokens if not token.is_stop])



df_ext = df.with_columns([
    pl.Series('base_clean', base_clean),
    pl.Series('base_clean_lower', base_clean_lower),
    pl.Series('tokenized', tokenized),
    pl.Series('lemmatized', lemmatized),
    pl.Series('no_accents', no_accents),
    pl.Series('lemma_no_accents', lemma_no_accents),
    pl.Series('no_stop_words', no_stop_words),
    pl.Series('lemma_no_stop_words', lemma_no_stop_words),
    pl.Series('no_stop_words_no_accents', no_stop_words_no_accents),
    pl.Series('lemma_no_stop_words_no_accents', lemma_no_stop_words_no_accents)
])

  0%|          | 0/27952 [00:00<?, ?it/s]

In [41]:
df_ext.write_parquet(os.path.join(ROOT_PATH, "data/joint/pre_processed_data.parquet.zstd"), compression="zstd", compression_level=9)

In [1]:
import os
import sys
import json
import random
from datetime import datetime

import numpy as np
import polars as pl
import plotly.express as px

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:

# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.training import train_epochs, model_metrics
from toxicity.bow.training import trainer, validate
from toxicity.bow.model import BoWModel, BoWDataset

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 1.8e-05
# Will be overriden bellow
POS_WEIGHT = 1.663


OCC_TRESHOLD = 10
OTHER_TOKEN = '[OTHER]'
MAX_LEN = 256

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
def reseed(seed: int = RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reseed()

## Data Loading

In [6]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd'))
df = df.with_columns(
    df['off_relaxed'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
    df['off_strict'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
)
df.sample(5, seed=RANDOM_SEED)

#TODO: shall I balance?

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""17643984771725418028""","""caralho q vergonha kkkkk""",[1],[0],"""caralho q vergonha kkkkk""","""caralho q vergonha kkkkk""","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]"
"""ToLD-Br""","""3886050625220892585""","""foda-se, vou encher o cu de po…",[1],[0],"""foda se vou encher o cu de por…","""foda se vou encher o cu de por…","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]"
"""ToLD-Br""","""14936095030342170465""","""USER USER USER Vc só pensa no …",[1],[1],"""USER USER USER Vc só pensa no …","""user user user vc só pensa no …","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]"
"""ToLD-Br""","""18279259074216789411""","""família""",[0],[0],"""família""","""família""","[""família""]","[""família""]","[""familia""]","[""familia""]","[""família""]","[""família""]","[""familia""]","[""familia""]"
"""OLID-Br""","""7f36b160e8624968a32e82b1c6750f…","""RT USER: vey a juliette veio c…",[0],[0],"""RT USER vey a juliette veio co…","""rt user vey a juliette veio co…","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]"


## Setup Vocabulary for BoW

In [7]:
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
MODEL_PATH = os.path.join(ROOT_PATH, 'models', 'bow', TIMESTAMP)
os.makedirs(MODEL_PATH, exist_ok=True)

# Count frequency of tokens
occurences = {}
for tokens in df['lemma_no_stop_words_no_accents']:
    for t in tokens:
        if t not in occurences:
            occurences[t] = 1
        else:
            occurences[t] += 1
            
# Sort by frequency
occurences = dict(sorted(occurences.items(), key=lambda item: item[1], reverse=True))

# Build vocabulary
vocabulary = []
for w, counter in occurences.items():
    if counter > OCC_TRESHOLD:
        vocabulary.append(w)

# Sort vocabulary and add a token for other words
vocabulary = sorted(vocabulary)
vocabulary.append(OTHER_TOKEN)

# Vocabulary lookup table
vocab_index = {}
for i, token in enumerate(vocabulary):
    vocab_index[token] = i

# Write vocabulary to file
with open(os.path.join(MODEL_PATH, 'vocab.txt'), 'w') as f:
    f.write('\n'.join(vocabulary))
with open(os.path.join(MODEL_PATH, 'vocab_index.json'), 'w') as f:
    json.dump(vocab_index, f)
with open(os.path.join(MODEL_PATH, 'vocab.json'), 'w') as f:
    json.dump(vocabulary, f)

print(f'Vocabulary size: {len(vocabulary)}')

Vocabulary size: 2832


## Init Model

### Loss and Optimizer

Using a Binary Cross Entropy loss as it shows good results for binary classification tasks. We are also applying differente weights to the positive and negative classes to account for the class imbalance.

Adam optimizer is also used as it is a good general optimizer for training neural networks.

In [8]:
model = BoWModel(len(vocabulary))
model.to(PYTORCH_DEVICE)

loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE))
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

## Data Split

In [9]:
train_df, other_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)
validate_df, test_df = train_test_split(other_df, train_size=0.5, random_state=RANDOM_SEED)


train_loader = DataLoader(
    BoWDataset(train_df, 'lemma_no_stop_words_no_accents', 'off_relaxed', vocab_index=vocab_index), 
    shuffle=True, num_workers=0, batch_size=TRAIN_BATCH_SIZE,
)
test_loader = DataLoader(
    BoWDataset(test_df, 'lemma_no_stop_words_no_accents', 'off_relaxed', vocab_index=vocab_index), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)
validate_loader = DataLoader(
    BoWDataset(validate_df, 'lemma_no_stop_words_no_accents', 'off_relaxed', vocab_index=vocab_index), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)

train_distribution = train_df['off_relaxed'].to_pandas().value_counts()
neg_count, pos_count = train_distribution.iloc[0], train_distribution.iloc[1]
print(f'Training distribution: {neg_count} negative, {pos_count} positive')
POS_WEIGHT = neg_count / pos_count
print(f'Positive weight: {POS_WEIGHT}')

Training distribution: 13932 negative, 8429 positive
Positive weight: 1.6528651085538024


## Training

In [10]:
def validate_result(loader: DataLoader, model: nn.Module):
    # Validate the results
    raw_results, raw_targets = validate(model, loader, PYTORCH_DEVICE)
    raw_results = np.array(raw_results)
    raw_targets = np.array(raw_targets)

    # Apply a fixed threshold to the results
    FIXED_THRESHOLD = 0.5
    fixed_results = raw_results > FIXED_THRESHOLD
    fixed_targets = raw_targets > FIXED_THRESHOLD

    # Compute metrics
    metrics = model_metrics(fixed_targets, fixed_results)
    return metrics

### Training Loop

In [11]:
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', 'bow', TIMESTAMP)
MODEL_PATH = os.path.join(ROOT_PATH, 'models', 'bow', TIMESTAMP)
BEST_MODEL_PATH = os.path.join(MODEL_PATH, 'best_model.pt')
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

loss_history = []
metric_history = []
test_metric_history = []

target_metric = ('Weighted F2', 'weighted_f2')
best_metric = float('-inf')
best_epoch = 0

# Save the best model; Override checkpoints; Track metrics
def epoch_callback(epoch, avg_loss):
    global loss_history, metric_history, test_metric_history, target_metric, best_metric, best_epoch
    
    metrics = validate_result(validate_loader, model)
    loss_history.append(avg_loss)
    metric_history.append(metrics)
    test_metrics = validate_result(test_loader, model)
    test_metric_history.append(test_metrics)

    print(f'Epoch {epoch+1}: Loss: {avg_loss:.4f}')
    print(f'Validation {target_metric[0]}: {metrics[target_metric[1]]:.4f}')
    print(f'Test {target_metric[0]}: {test_metrics[target_metric[1]]:.4f}')
    
    if metrics[target_metric[1]] > best_metric:
        print(f'New best model found!')
        best_metric = metrics[target_metric[1]]
        best_epoch = epoch
        torch.save(model, BEST_MODEL_PATH)
    
    

train_epochs(
    trainer, EPOCHS, model, train_loader, loss_fn, optimizer, PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH, epoch_callback=epoch_callback)

Running training epoch 1/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 1: Loss: 0.8197
Validation Weighted F2: 0.7333
Test Weighted F2: 0.7297
New best model found!
Finished training epoch 1/20; Average Loss: 0.8197
Running training epoch 2/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 2: Loss: 0.6496
Validation Weighted F2: 0.7730
Test Weighted F2: 0.7634
New best model found!
Finished training epoch 2/20; Average Loss: 0.6496
Running training epoch 3/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 3: Loss: 0.5667
Validation Weighted F2: 0.7765
Test Weighted F2: 0.7712
New best model found!
Finished training epoch 3/20; Average Loss: 0.5667
Running training epoch 4/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 4: Loss: 0.5176
Validation Weighted F2: 0.7791
Test Weighted F2: 0.7746
New best model found!
Finished training epoch 4/20; Average Loss: 0.5176
Running training epoch 5/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 5: Loss: 0.4778
Validation Weighted F2: 0.7801
Test Weighted F2: 0.7753
New best model found!
Finished training epoch 5/20; Average Loss: 0.4778
Running training epoch 6/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 6: Loss: 0.4396
Validation Weighted F2: 0.7750
Test Weighted F2: 0.7625
Finished training epoch 6/20; Average Loss: 0.4396
Running training epoch 7/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 7: Loss: 0.4021
Validation Weighted F2: 0.7711
Test Weighted F2: 0.7596
Finished training epoch 7/20; Average Loss: 0.4021
Running training epoch 8/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 8: Loss: 0.3628
Validation Weighted F2: 0.7671
Test Weighted F2: 0.7578
Finished training epoch 8/20; Average Loss: 0.3628
Running training epoch 9/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 9: Loss: 0.3244
Validation Weighted F2: 0.7687
Test Weighted F2: 0.7572
Finished training epoch 9/20; Average Loss: 0.3244
Running training epoch 10/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 10: Loss: 0.2880
Validation Weighted F2: 0.7628
Test Weighted F2: 0.7510
Finished training epoch 10/20; Average Loss: 0.2880
Running training epoch 11/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 11: Loss: 0.2534
Validation Weighted F2: 0.7659
Test Weighted F2: 0.7524
Finished training epoch 11/20; Average Loss: 0.2534
Running training epoch 12/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 12: Loss: 0.2213
Validation Weighted F2: 0.7649
Test Weighted F2: 0.7568
Finished training epoch 12/20; Average Loss: 0.2213
Running training epoch 13/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 13: Loss: 0.1943
Validation Weighted F2: 0.7555
Test Weighted F2: 0.7449
Finished training epoch 13/20; Average Loss: 0.1943
Running training epoch 14/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 14: Loss: 0.1709
Validation Weighted F2: 0.7630
Test Weighted F2: 0.7528
Finished training epoch 14/20; Average Loss: 0.1709
Running training epoch 15/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 15: Loss: 0.1530
Validation Weighted F2: 0.7551
Test Weighted F2: 0.7498
Finished training epoch 15/20; Average Loss: 0.1530
Running training epoch 16/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 16: Loss: 0.1376
Validation Weighted F2: 0.7509
Test Weighted F2: 0.7425
Finished training epoch 16/20; Average Loss: 0.1376
Running training epoch 17/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 17: Loss: 0.1239
Validation Weighted F2: 0.7536
Test Weighted F2: 0.7483
Finished training epoch 17/20; Average Loss: 0.1239
Running training epoch 18/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 18: Loss: 0.1129
Validation Weighted F2: 0.7474
Test Weighted F2: 0.7445
Finished training epoch 18/20; Average Loss: 0.1129
Running training epoch 19/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 19: Loss: 0.1050
Validation Weighted F2: 0.7510
Test Weighted F2: 0.7444
Finished training epoch 19/20; Average Loss: 0.1050
Running training epoch 20/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 20: Loss: 0.1003
Validation Weighted F2: 0.7528
Test Weighted F2: 0.7477
Finished training epoch 20/20; Average Loss: 0.1003


In [14]:
# Build result metrics data frame per epoch
result_df = pl.DataFrame({
    'epoch': range(1, len(loss_history) + 1),
    'loss': loss_history,
}).with_columns(pl.from_dicts(metric_history))

test_result_df = pl.DataFrame({
    'epoch': range(1, len(loss_history) + 1),
}).with_columns(pl.from_dicts(test_metric_history))

result_df.head()

epoch,loss,weighted_f1,macro_f1,weighted_f2,macro_f2,accuracy,recall,precision
i64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.819739,0.734673,0.719554,0.73333,0.721758,0.732737,0.732737,0.738097
2,0.649648,0.772676,0.757188,0.772958,0.75654,0.773166,0.773166,0.772284
3,0.566674,0.776574,0.762052,0.77646,0.76232,0.776386,0.776386,0.776779
4,0.517649,0.778839,0.763837,0.779076,0.763264,0.779249,0.779249,0.778503
5,0.477843,0.779802,0.764755,0.7801,0.764027,0.780322,0.780322,0.779402


In [13]:
# Plot Loss and Target Metric per Epoch, highlighting the peak
fig_a = px.line(result_df, x='epoch', y='loss', title='Loss per Epoch', template='plotly_dark')
fig_a.add_scatter(x=[best_epoch+1], y=[loss_history[best_epoch]], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_a.update_layout()
fig_a.show()

fig_b = px.line(result_df, x='epoch', y='weighted_f2', title='Validation Weighted F2 per Epoch', template='plotly_dark')
fig_b.add_scatter(x=[best_epoch+1], y=[metric_history[best_epoch]['weighted_f2']], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_b.update_layout()
fig_b.show()

fig_c = px.line(test_result_df, x='epoch', y='weighted_f2', title='Test Weighted F2 per Epoch', template='plotly_dark')
fig_c.add_scatter(x=[best_epoch+1], y=[test_metric_history[best_epoch]['weighted_f2']], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_c.update_layout()
fig_c.show()

In [1]:
import os
import sys
import random
from datetime import datetime

import numpy as np
import polars as pl
import plotly.express as px

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:
# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.training import train_epochs, model_metrics
from toxicity.embeddings.training import trainer, validate
from toxicity.embeddings.model import EmbeddingModel, EmbeddingDataset

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 1.8e-05
POS_WEIGHT = 1.663


EMBEDDING_FILE = os.path.join(ROOT_PATH, 'cbow_s100.txt')
EMBEDDING_NAME = 'cbow_s100'
MAX_LEN = 128

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:

def reseed(seed: int = RANDOM_SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reseed()

## Data Loading

In [6]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd'))
df = df.with_columns(
    df['off_relaxed'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
    df['off_strict'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
)
df.sample(5, seed=RANDOM_SEED)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""17643984771725418028""","""caralho q vergonha kkkkk""",[1],[0],"""caralho q vergonha kkkkk""","""caralho q vergonha kkkkk""","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]"
"""ToLD-Br""","""3886050625220892585""","""foda-se, vou encher o cu de po…",[1],[0],"""foda se vou encher o cu de por…","""foda se vou encher o cu de por…","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]"
"""ToLD-Br""","""14936095030342170465""","""USER USER USER Vc só pensa no …",[1],[1],"""USER USER USER Vc só pensa no …","""user user user vc só pensa no …","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]"
"""ToLD-Br""","""18279259074216789411""","""família""",[0],[0],"""família""","""família""","[""família""]","[""família""]","[""familia""]","[""familia""]","[""família""]","[""família""]","[""familia""]","[""familia""]"
"""OLID-Br""","""7f36b160e8624968a32e82b1c6750f…","""RT USER: vey a juliette veio c…",[0],[0],"""RT USER vey a juliette veio co…","""rt user vey a juliette veio co…","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]"


### Load Embeddings

In [7]:
EMBEDDING_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}')
os.makedirs(EMBEDDING_PATH, exist_ok=True)

emb_dim = None
token_count = None
embeddings = {}

if not os.path.exists(f'{EMBEDDING_PATH}/embeddings.parquet.zstd'):
    with open(EMBEDDING_FILE, 'r') as f:
        fl = f.readline()
        token_count, emb_dim = map(int, fl.split(' '))

        while line := f.readline():
            emb = line.split(' ')

            token = emb[0]
            values = [float(v) for v in emb[1:]]

            if emb_dim is None:
                emb_dim = len(values)
            elif emb_dim != len(values):
                raise ValueError('Inconsistent embedding length')

            embeddings[token] = values
    
    print(f'Embedding Length: {emb_dim}')
    print(f'Embedding Vocab Size: {len(embeddings)}; Expected: {token_count}')
    embedding_df = pl.DataFrame({
        'token': list(embeddings.keys()),
        'embedding': list(embeddings.values())
    })
    embedding_df.write_parquet(f'{EMBEDDING_PATH}/embeddings.parquet.zstd', compression="zstd", compression_level=9)
else:
    embedding_df = pl.read_parquet(f'{EMBEDDING_PATH}/embeddings.parquet.zstd')
    token_list = embedding_df['token'].to_list()
    embedding_list = embedding_df['embedding'].to_list()
    embeddings = dict(zip(token_list, embedding_list))
    emb_dim = len(embeddings[next(iter(embeddings))])
    token_count = len(embeddings)
    print(f'Embedding Length: {emb_dim}')
    print(f'Embedding Vocab Size: {len(embeddings)}; Expected: {token_count}')


Embedding Length: 100
Embedding Vocab Size: 929606; Expected: 929606


## Init Model

### Loss and Optimizer

Using a Binary Cross Entropy loss as it shows good results for binary classification tasks. We are also applying differente weights to the positive and negative classes to account for the class imbalance.

Adam optimizer is also used as it is a good general optimizer for training neural networks.

In [8]:
model = EmbeddingModel(emb_dim, MAX_LEN)
model.to(PYTORCH_DEVICE)

loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE))
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

## Data Split

In [9]:
train_df, other_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)
validate_df, test_df = train_test_split(other_df, train_size=0.5, random_state=RANDOM_SEED)

train_loader = DataLoader(
    EmbeddingDataset(train_df, 'lemma_no_stop_words', 'off_relaxed', embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN), 
    shuffle=True, num_workers=0, batch_size=TRAIN_BATCH_SIZE,
)
test_loader = DataLoader(
    EmbeddingDataset(test_df, 'lemma_no_stop_words', 'off_relaxed', embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)
validate_loader = DataLoader(
    EmbeddingDataset(validate_df, 'lemma_no_stop_words', 'off_relaxed', embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)

train_distribution = train_df['off_relaxed'].to_pandas().value_counts()
neg_count, pos_count = train_distribution.iloc[0], train_distribution.iloc[1]
print(f'Training distribution: {neg_count} negative, {pos_count} positive')
POS_WEIGHT = neg_count / pos_count
print(f'Positive weight: {POS_WEIGHT}')

Training distribution: 13932 negative, 8429 positive
Positive weight: 1.6528651085538024


## Training

In [10]:
def validate_result(loader: DataLoader, model: nn.Module):
    # Validate the results
    raw_results, raw_targets = validate(model, loader, PYTORCH_DEVICE)
    raw_results = np.array(raw_results)
    raw_targets = np.array(raw_targets)

    # Apply a fixed threshold to the results
    FIXED_THRESHOLD = 0.5
    fixed_results = raw_results > FIXED_THRESHOLD
    fixed_targets = raw_targets > FIXED_THRESHOLD

    # Compute metrics
    metrics = model_metrics(fixed_targets, fixed_results)
    return metrics

### Training Loop

In [11]:
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
MODEL_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}', TIMESTAMP)
BEST_MODEL_PATH = os.path.join(MODEL_PATH, 'best_model.pt')
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', f'embeddings-{EMBEDDING_NAME}', TIMESTAMP)
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

loss_history = []
metric_history = []
test_metric_history = []
target_metric = ('Weighted F2', 'weighted_f2')
best_metric = float('-inf')
best_epoch = 0

# Save the best model; Override checkpoints; Track metrics
def epoch_callback(epoch, avg_loss):
    global loss_history, metric_history, test_metric_history, target_metric, best_metric, best_epoch
    
    metrics = validate_result(validate_loader, model)
    loss_history.append(avg_loss)
    metric_history.append(metrics)
    test_metrics = validate_result(test_loader, model)
    test_metric_history.append(test_metrics)

    print(f'Epoch {epoch+1}: Loss: {avg_loss:.4f}')
    print(f'Validation {target_metric[0]}: {metrics[target_metric[1]]:.4f}')
    print(f'Test {target_metric[0]}: {test_metrics[target_metric[1]]:.4f}')
    
    if metrics[target_metric[1]] > best_metric:
        print(f'New best model found!')
        best_metric = metrics[target_metric[1]]
        best_epoch = epoch
        torch.save(model, BEST_MODEL_PATH)



train_epochs(
    trainer, EPOCHS, model, train_loader, loss_fn, optimizer, PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH, epoch_callback=epoch_callback)

Running training epoch 1/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 1: Loss: 0.8217
Validation Weighted F2: 0.6255
Test Weighted F2: 0.6190
New best model found!
Finished training epoch 1/20; Average Loss: 0.8217
Running training epoch 2/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 2: Loss: 0.7304
Validation Weighted F2: 0.6832
Test Weighted F2: 0.6753
New best model found!
Finished training epoch 2/20; Average Loss: 0.7304
Running training epoch 3/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 3: Loss: 0.6899
Validation Weighted F2: 0.6846
Test Weighted F2: 0.6699
New best model found!
Finished training epoch 3/20; Average Loss: 0.6899
Running training epoch 4/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 4: Loss: 0.6541
Validation Weighted F2: 0.6995
Test Weighted F2: 0.6863
New best model found!
Finished training epoch 4/20; Average Loss: 0.6541
Running training epoch 5/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 5: Loss: 0.6136
Validation Weighted F2: 0.6933
Test Weighted F2: 0.6792
Finished training epoch 5/20; Average Loss: 0.6136
Running training epoch 6/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 6: Loss: 0.5669
Validation Weighted F2: 0.6923
Test Weighted F2: 0.6831
Finished training epoch 6/20; Average Loss: 0.5669
Running training epoch 7/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 7: Loss: 0.5122
Validation Weighted F2: 0.6846
Test Weighted F2: 0.6877
Finished training epoch 7/20; Average Loss: 0.5122
Running training epoch 8/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 8: Loss: 0.4535
Validation Weighted F2: 0.6952
Test Weighted F2: 0.7009
Finished training epoch 8/20; Average Loss: 0.4535
Running training epoch 9/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 9: Loss: 0.3976
Validation Weighted F2: 0.6911
Test Weighted F2: 0.6890
Finished training epoch 9/20; Average Loss: 0.3976
Running training epoch 10/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 10: Loss: 0.3458
Validation Weighted F2: 0.6864
Test Weighted F2: 0.6921
Finished training epoch 10/20; Average Loss: 0.3458
Running training epoch 11/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 11: Loss: 0.2979
Validation Weighted F2: 0.6878
Test Weighted F2: 0.7034
Finished training epoch 11/20; Average Loss: 0.2979
Running training epoch 12/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 12: Loss: 0.2617
Validation Weighted F2: 0.6893
Test Weighted F2: 0.6935
Finished training epoch 12/20; Average Loss: 0.2617
Running training epoch 13/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 13: Loss: 0.2258
Validation Weighted F2: 0.6814
Test Weighted F2: 0.6874
Finished training epoch 13/20; Average Loss: 0.2258
Running training epoch 14/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 14: Loss: 0.1985
Validation Weighted F2: 0.6799
Test Weighted F2: 0.6782
Finished training epoch 14/20; Average Loss: 0.1985
Running training epoch 15/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 15: Loss: 0.1768
Validation Weighted F2: 0.6777
Test Weighted F2: 0.6731
Finished training epoch 15/20; Average Loss: 0.1768
Running training epoch 16/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 16: Loss: 0.1624
Validation Weighted F2: 0.6715
Test Weighted F2: 0.6725
Finished training epoch 16/20; Average Loss: 0.1624
Running training epoch 17/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 17: Loss: 0.1426
Validation Weighted F2: 0.6741
Test Weighted F2: 0.6737
Finished training epoch 17/20; Average Loss: 0.1426
Running training epoch 18/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 18: Loss: 0.1319
Validation Weighted F2: 0.6742
Test Weighted F2: 0.6636
Finished training epoch 18/20; Average Loss: 0.1319
Running training epoch 19/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 19: Loss: 0.1180
Validation Weighted F2: 0.6815
Test Weighted F2: 0.6797
Finished training epoch 19/20; Average Loss: 0.1180
Running training epoch 20/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 20: Loss: 0.1131
Validation Weighted F2: 0.6793
Test Weighted F2: 0.6800
Finished training epoch 20/20; Average Loss: 0.1131


In [12]:
# Build result metrics data frame per epoch
result_df = pl.DataFrame({
    'epoch': range(1, len(loss_history) + 1),
    'loss': loss_history,
}).with_columns(pl.from_dicts(metric_history))

test_result_df = pl.DataFrame({
    'epoch': range(1, len(loss_history) + 1),
}).with_columns(pl.from_dicts(test_metric_history))

result_df.head()


epoch,loss,weighted_f1,macro_f1,weighted_f2,macro_f2,accuracy,recall,precision
i64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.821739,0.633112,0.626195,0.625527,0.635294,0.628265,0.628265,0.676534
2,0.730445,0.686137,0.670944,0.68317,0.674477,0.68229,0.68229,0.695327
3,0.689854,0.687161,0.67122,0.684619,0.674221,0.683721,0.683721,0.694494
4,0.65406,0.69952,0.679777,0.699486,0.679818,0.699463,0.699463,0.699578
5,0.61356,0.694634,0.676868,0.693325,0.678434,0.692665,0.692665,0.697648


In [13]:
# Plot Loss and Target Metric per Epoch, highlighting the peak
fig_a = px.line(result_df, x='epoch', y='loss', title='Loss per Epoch', template='plotly_dark')
fig_a.add_scatter(x=[best_epoch+1], y=[loss_history[best_epoch]], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_a.update_layout()
fig_a.show()

fig_b = px.line(result_df, x='epoch', y='weighted_f2', title='Validation Weighted F2 per Epoch', template='plotly_dark')
fig_b.add_scatter(x=[best_epoch+1], y=[metric_history[best_epoch]['weighted_f2']], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_b.update_layout()
fig_b.show()

fig_c = px.line(test_result_df, x='epoch', y='weighted_f2', title='Test Weighted F2 per Epoch', template='plotly_dark')
fig_c.add_scatter(x=[best_epoch+1], y=[test_metric_history[best_epoch]['weighted_f2']], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_c.update_layout()
fig_c.show()

# Neural Solution - Transformers: BERT

In [1]:
import os
import sys
import random
from datetime import datetime

import numpy as np
import polars as pl
import plotly.express as px
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:
# Register the parent directory of the current script as a package root,
# so that we can import modules from the parent directory
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.transformers.bertimbau_base import bert_tokenizer, BertDatasetBF16, BertModuleBF16
from toxicity.transformers.training import trainer, validate
from toxicity.training import train_epochs, model_metrics

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 3e-05
POS_WEIGHT = 1.663

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Data Loading

In [6]:
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd'))
df = df.with_columns(
    df['off_relaxed'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
    df['off_strict'].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1)),
)
df.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""5508727285226739644""","""medo de ir pra um rolê de novo…",[0],[0],"""medo de ir pra um rolê de novo…","""medo de ir pra um rolê de novo…","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""de"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]","[""medo"", ""pra"", … ""kkkkkkk""]"
"""ToLD-Br""","""16827841903506270139""","""https://t.co/2bs6oD330q Ele a…",[0],[0],"""https t co 2bs6oD330q Ele até …","""https t co 2bs6od330q ele até …","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]","[""https"", ""t"", … ""gd2j98vrkg""]"
"""ToLD-Br""","""7641628880024884135""","""rt USER bruno fernandes assina…",[0],[0],"""rt USER bruno fernandes assina…","""rt user bruno fernandes assina…","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]","[""rt"", ""user"", … ""user""]"
"""ToLD-Br""","""16866242508514532033""","""tinha que ter jogado esse bran…",[1],[1],"""tinha que ter jogado esse bran…","""tinha que ter jogado esse bran…","[""tinha"", ""que"", … ""trem""]","[""ter"", ""que"", … ""tr""]","[""tinha"", ""que"", … ""trem""]","[""ter"", ""que"", … ""tr""]","[""tinha"", ""jogado"", … ""trem""]","[""ter"", ""jogar"", … ""tr""]","[""tinha"", ""jogado"", … ""trem""]","[""ter"", ""jogar"", … ""tr""]"
"""ToLD-Br""","""3068271252403811869""","""eu sou a pessoa certa no bairr…",[0],[0],"""eu sou a pessoa certa no bairr…","""eu sou a pessoa certa no bairr…","[""eu"", ""sou"", … ""errado""]","[""eu"", ""ser"", … ""errar""]","[""eu"", ""sou"", … ""errado""]","[""eu"", ""ser"", … ""errar""]","[""pessoa"", ""certa"", … ""errado""]","[""pessoa"", ""certo"", … ""errar""]","[""pessoa"", ""certa"", … ""errado""]","[""pessoa"", ""certo"", … ""errar""]"


## Setup Model

In [7]:
tokenizer = bert_tokenizer()

model = BertModuleBF16(feature_count=1)
model.to(PYTORCH_DEVICE)

BertModuleBF16(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Data Split

In [8]:

train_df, other_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)
validate_df, test_df = train_test_split(other_df, train_size=0.5, random_state=RANDOM_SEED)

train_loader = DataLoader(
    BertDatasetBF16(data_frame=train_df, tokenizer=tokenizer, max_len=MAX_LEN, target_col='off_relaxed'), 
    shuffle=True, num_workers=0, batch_size=TRAIN_BATCH_SIZE,
)
test_loader = DataLoader(
    BertDatasetBF16(data_frame=test_df, tokenizer=tokenizer, max_len=MAX_LEN, target_col='off_relaxed'), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)
validate_loader = DataLoader(
    BertDatasetBF16(data_frame=validate_df, tokenizer=tokenizer, max_len=MAX_LEN, target_col='off_relaxed'), 
    shuffle=False, num_workers=0, batch_size=TEST_BATCH_SIZE,
)

train_distribution = train_df['off_relaxed'].to_pandas().value_counts()
neg_count, pos_count = train_distribution.iloc[0], train_distribution.iloc[1]
print(f'Training distribution: {neg_count} negative, {pos_count} positive')
POS_WEIGHT = neg_count / pos_count
print(f'Positive weight: {POS_WEIGHT}')

Training distribution: 13932 negative, 8429 positive
Positive weight: 1.6528651085538024


### Loss and Optimizer

Using a Binary Cross Entropy loss as it shows good results for binary classification tasks. We are also applying differente weights to the positive and negative classes to account for the class imbalance.

Adam optimizer is also used as it is a good general optimizer for training neural networks, with good known results for BERT models.

In [9]:
loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE))
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

### Training

### Result Validation

In [10]:
def validate_result(loader: DataLoader, model: nn.Module):
    # Validate the results
    raw_results, raw_targets = validate(model, loader, PYTORCH_DEVICE)
    raw_results = np.array(raw_results)
    raw_targets = np.array(raw_targets)

    # Apply a fixed threshold to the results
    FIXED_THRESHOLD = 0.5
    fixed_results = raw_results > FIXED_THRESHOLD
    fixed_targets = raw_targets > FIXED_THRESHOLD

    # Compute metrics
    metrics = model_metrics(fixed_targets, fixed_results)
    return metrics

### Train Model

In [11]:
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', 'bertimbau-bf16', TIMESTAMP)
MODEL_PATH = os.path.join(ROOT_PATH, 'models', 'bertimbau-bf16', TIMESTAMP)
BEST_MODEL_PATH = os.path.join(MODEL_PATH, 'best_model.pt')
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

loss_history = []
metric_history = []
test_metric_history = []
target_metric = ('Weighted F2', 'weighted_f2')
best_metric = float('-inf')
best_epoch = 0

# Save the best model; Override checkpoints; Track metrics
def epoch_callback(epoch, avg_loss):
    global loss_history, metric_history, test_metric_history, target_metric, best_metric, best_epoch
    
    metrics = validate_result(validate_loader, model)
    loss_history.append(avg_loss)
    metric_history.append(metrics)
    test_metrics = validate_result(test_loader, model)
    test_metric_history.append(test_metrics)

    print(f'Epoch {epoch+1}: Loss: {avg_loss:.4f}')
    print(f'Validation {target_metric[0]}: {metrics[target_metric[1]]:.4f}')
    print(f'Test {target_metric[0]}: {test_metrics[target_metric[1]]:.4f}')
    
    if metrics[target_metric[1]] > best_metric:
        print(f'New best model found!')
        best_metric = metrics[target_metric[1]]
        best_epoch = epoch
        torch.save(model, BEST_MODEL_PATH)



train_epochs(
    trainer, EPOCHS, model, train_loader, loss_fn, optimizer, PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH, epoch_callback=epoch_callback)

Running training epoch 1/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 1: Loss: 0.6361
Validation Weighted F2: 0.7883
Test Weighted F2: 0.7842
New best model found!
Finished training epoch 1/20; Average Loss: 0.6361
Running training epoch 2/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 2: Loss: 0.5438
Validation Weighted F2: 0.8025
Test Weighted F2: 0.7976
New best model found!
Finished training epoch 2/20; Average Loss: 0.5438
Running training epoch 3/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 3: Loss: 0.4995
Validation Weighted F2: 0.7892
Test Weighted F2: 0.7855
Finished training epoch 3/20; Average Loss: 0.4995
Running training epoch 4/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 4: Loss: 0.4715
Validation Weighted F2: 0.8031
Test Weighted F2: 0.7961
New best model found!
Finished training epoch 4/20; Average Loss: 0.4715
Running training epoch 5/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 5: Loss: 0.4431
Validation Weighted F2: 0.8082
Test Weighted F2: 0.8016
New best model found!
Finished training epoch 5/20; Average Loss: 0.4431
Running training epoch 6/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 6: Loss: 0.4195
Validation Weighted F2: 0.8010
Test Weighted F2: 0.7961
Finished training epoch 6/20; Average Loss: 0.4195
Running training epoch 7/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 7: Loss: 0.3994
Validation Weighted F2: 0.7844
Test Weighted F2: 0.7837
Finished training epoch 7/20; Average Loss: 0.3994
Running training epoch 8/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 8: Loss: 0.3765
Validation Weighted F2: 0.7941
Test Weighted F2: 0.7935
Finished training epoch 8/20; Average Loss: 0.3765
Running training epoch 9/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 9: Loss: 0.3608
Validation Weighted F2: 0.7982
Test Weighted F2: 0.7915
Finished training epoch 9/20; Average Loss: 0.3608
Running training epoch 10/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 10: Loss: 0.3436
Validation Weighted F2: 0.7997
Test Weighted F2: 0.7955
Finished training epoch 10/20; Average Loss: 0.3436
Running training epoch 11/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 11: Loss: 0.3334
Validation Weighted F2: 0.7943
Test Weighted F2: 0.7980
Finished training epoch 11/20; Average Loss: 0.3334
Running training epoch 12/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 12: Loss: 0.3170
Validation Weighted F2: 0.7860
Test Weighted F2: 0.7857
Finished training epoch 12/20; Average Loss: 0.3170
Running training epoch 13/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 13: Loss: 0.3075
Validation Weighted F2: 0.7830
Test Weighted F2: 0.7830
Finished training epoch 13/20; Average Loss: 0.3075
Running training epoch 14/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 14: Loss: 0.2995
Validation Weighted F2: 0.7986
Test Weighted F2: 0.7984
Finished training epoch 14/20; Average Loss: 0.2995
Running training epoch 15/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 15: Loss: 0.2885
Validation Weighted F2: 0.7922
Test Weighted F2: 0.7927
Finished training epoch 15/20; Average Loss: 0.2885
Running training epoch 16/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 16: Loss: 0.2818
Validation Weighted F2: 0.7957
Test Weighted F2: 0.7941
Finished training epoch 16/20; Average Loss: 0.2818
Running training epoch 17/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 17: Loss: 0.2704
Validation Weighted F2: 0.7861
Test Weighted F2: 0.7876
Finished training epoch 17/20; Average Loss: 0.2704
Running training epoch 18/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 18: Loss: 0.2652
Validation Weighted F2: 0.7882
Test Weighted F2: 0.7891
Finished training epoch 18/20; Average Loss: 0.2652
Running training epoch 19/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 19: Loss: 0.2608
Validation Weighted F2: 0.7825
Test Weighted F2: 0.7883
Finished training epoch 19/20; Average Loss: 0.2608
Running training epoch 20/20


  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

Epoch 20: Loss: 0.2557
Validation Weighted F2: 0.7839
Test Weighted F2: 0.7847
Finished training epoch 20/20; Average Loss: 0.2557


In [12]:
# Build result metrics data frame per epoch
result_df = pl.DataFrame({
    'epoch': range(1, len(loss_history) + 1),
    'loss': loss_history,
}).with_columns(pl.from_dicts(metric_history))

test_result_df = pl.DataFrame({
    'epoch': range(1, len(loss_history) + 1),
}).with_columns(pl.from_dicts(test_metric_history))

result_df.head()


epoch,loss,weighted_f1,macro_f1,weighted_f2,macro_f2,accuracy,recall,precision
i64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.636126,0.791011,0.781505,0.788261,0.787686,0.788193,0.788193,0.802364
2,0.543765,0.804206,0.794056,0.802465,0.798546,0.802147,0.802147,0.810366
3,0.499501,0.792718,0.784253,0.789249,0.791817,0.789624,0.789624,0.808785
4,0.471502,0.805076,0.795277,0.80312,0.800267,0.802862,0.802862,0.812368
5,0.443094,0.809315,0.798525,0.808227,0.801579,0.807871,0.807871,0.81257


In [13]:
# Plot Loss and Target Metric per Epoch, highlighting the peak
fig_a = px.line(result_df, x='epoch', y='loss', title='Loss per Epoch', template='plotly_dark')
fig_a.add_scatter(x=[best_epoch+1], y=[loss_history[best_epoch]], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_a.update_layout()
fig_a.show()

fig_b = px.line(result_df, x='epoch', y='weighted_f2', title='Validation Weighted F2 per Epoch', template='plotly_dark')
fig_b.add_scatter(x=[best_epoch+1], y=[metric_history[best_epoch]['weighted_f2']], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_b.update_layout()
fig_b.show()

fig_c = px.line(test_result_df, x='epoch', y='weighted_f2', title='Test Weighted F2 per Epoch', template='plotly_dark')
fig_c.add_scatter(x=[best_epoch+1], y=[test_metric_history[best_epoch]['weighted_f2']], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_c.update_layout()
fig_c.show()