# Exploring the Datasets

In [1]:
import os

import polars as pl
import plotly.express as px

from IPython.display import display

## Compress original data

In [2]:
ROOT_PATH = '../'

# Raw CSV exports (relative to ROOT_PATH). Each one gets a zstd-compressed Parquet
# copy written next to it, with the `.csv` suffix swapped for `.parquet.zstd`.
_RAW_CSV_FILES = (
    "data/olid-br/train.csv",
    "data/olid-br/test.csv",
    "data/told-br/told-br.csv",
)

for _csv_rel_path in _RAW_CSV_FILES:
    _csv_path = os.path.join(ROOT_PATH, _csv_rel_path)
    _parquet_path = os.path.splitext(_csv_path)[0] + ".parquet.zstd"

    # The conversion is best-effort: a missing CSV is skipped so the notebook can
    # still run from Parquet files that already exist, but the skip is reported
    # instead of being silently swallowed.
    if not os.path.isfile(_csv_path):
        print(f"Skipping compression, CSV not found: {_csv_path}")
        continue

    # Each file is handled independently so one failure does not prevent the
    # remaining files from being converted. Only I/O and Polars errors are caught;
    # anything else (e.g. KeyboardInterrupt) propagates.
    try:
        pl.read_csv(_csv_path).write_parquet(
            _parquet_path, compression="zstd", compression_level=9
        )
    except (OSError, pl.exceptions.PolarsError) as _err:
        print(f"Could not compress {_csv_path}: {_err!r}")

## OLID-BR

### OLID-BR: Data Loading

In [3]:
# Read both OLID-BR splits from their zstd-compressed Parquet files under ROOT_PATH.
olidbr_train, olidbr_test = (
    pl.read_parquet(os.path.join(ROOT_PATH, split_file))
    for split_file in (
        "data/olid-br/train.parquet.zstd",
        "data/olid-br/test.parquet.zstd",
    )
)

In [4]:
# Join the train and test datasets into a single frame (rows of train first, then test)
olidbr = pl.concat([olidbr_train, olidbr_test])

# Display 5 random samples; the fixed seed keeps the preview reproducible across runs
olidbr.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False


### OLID-BR: Data Distribution

In [5]:
# Print the `is_offensive` label counts for each split and for the combined frame.
for _split_label, _split_frame in (
    ("OLID-BR Train", olidbr_train),
    ("OLID-BR Test", olidbr_test),
    ("OLID-BR Full", olidbr),
):
    print(_split_label)
    print(_split_frame['is_offensive'].value_counts())


OLID-BR Train
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ NOT          ┆ 762   │
│ OFF          ┆ 4452  │
└──────────────┴───────┘
OLID-BR Test
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ NOT          ┆ 254   │
│ OFF          ┆ 1484  │
└──────────────┴───────┘
OLID-BR Full
shape: (2, 2)
┌──────────────┬───────┐
│ is_offensive ┆ count │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ NOT          ┆ 1016  │
│ OFF          ┆ 5936  │
└──────────────┴───────┘


In [6]:
# Bar chart of the `is_offensive` label counts over the combined OLID-BR frame,
# one colour per label value.
_olidbr_offensive_fig = px.histogram(
    olidbr,
    x='is_offensive',
    color='is_offensive',
    title='OLID-BR Offensive Distribution',
    template='plotly_dark',
)
_olidbr_offensive_fig.show()

### Reclassify Offensiveness

In [7]:
# Recompiling the dataset with the sum of all offensive categories.
#
# The category list is declared once so the two sums below cannot drift apart:
# `offensive` counts every category flag, while `offensive_allow_profane` counts
# every flag except `profanity_obscene`.
_OLIDBR_NON_PROFANE_LABELS = [
    "health",
    "ideology",
    "insult",
    "lgbtqphobia",
    "other_lifestyle",
    "physical_aspects",
    "racism",
    "religious_intolerance",
    "sexism",
    "xenophobia",
]
_OLIDBR_ALL_LABELS = ["profanity_obscene", *_OLIDBR_NON_PROFANE_LABELS]

olidbr_recompiled = olidbr.with_columns([
    pl.sum_horizontal(
        *[pl.col(label) for label in _OLIDBR_ALL_LABELS]
    ).alias("offensive"),
    pl.sum_horizontal(
        *[pl.col(label) for label in _OLIDBR_NON_PROFANE_LABELS]
    ).alias("offensive_allow_profane"),
])

# Second step on purpose: expressions inside one `with_columns` call cannot see
# columns created in that same call, so the boolean flags are derived afterwards.
# A row is flagged as soon as at least one of the summed categories is set.
olidbr_recompiled = olidbr_recompiled.with_columns(
    [
        (pl.col("offensive") > 0).alias("offensive_discrete"),
        (pl.col("offensive_allow_profane") > 0).alias("offensive_allow_profane_discrete"),
    ]
)

In [8]:
# Seeded 5-row preview to eyeball the derived `offensive*` columns
olidbr_recompiled.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia,offensive,offensive_allow_profane,offensive_discrete,offensive_allow_profane_discrete
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,u32,u32,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False,0,0,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True


In [9]:
# Print the label counts of both recompiled boolean flags.
for _heading, _flag_column in (
    ("OLID-BR Recompiled", 'offensive_discrete'),
    ("OLID-BR Recompiled (Allow Profane)", 'offensive_allow_profane_discrete'),
):
    print(_heading)
    print(olidbr_recompiled[_flag_column].value_counts())

OLID-BR Recompiled
shape: (2, 2)
┌────────────────────┬───────┐
│ offensive_discrete ┆ count │
│ ---                ┆ ---   │
│ bool               ┆ u32   │
╞════════════════════╪═══════╡
│ false              ┆ 1027  │
│ true               ┆ 5925  │
└────────────────────┴───────┘
OLID-BR Recompiled (Allow Profane)
shape: (2, 2)
┌─────────────────────────────────┬───────┐
│ offensive_allow_profane_discre… ┆ count │
│ ---                             ┆ ---   │
│ bool                            ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ true                            ┆ 5680  │
│ false                           ┆ 1272  │
└─────────────────────────────────┴───────┘


#### An overview of the balance of the dataset

As we can see, the dataset is heavily imbalanced towards offensive comments.
Because of this, we might merge it with other datasets to achieve a better balance.

## ToLD-Br

### ToLD-Br: Data Loading

Note: each comment has one column per offense type, with values from 0 to 3
indicating how many times the comment was classified as offensive in that category.


In [10]:
# Load the ToLD-Br dataset from its zstd-compressed Parquet file under ROOT_PATH
toldbr = pl.read_parquet(os.path.join(ROOT_PATH, "data/told-br/told-br.parquet.zstd"))

In [11]:
# Seeded 5-row preview of the raw ToLD-Br frame
toldbr.sample(5, seed=42)

text,homophobia,obscene,insult,racism,misogyny,xenophobia
str,f64,f64,f64,f64,f64,f64
"""8 demais porra""",0.0,2.0,0.0,0.0,0.0,0.0
"""rt @user @user felizmente os s…",0.0,0.0,0.0,0.0,0.0,0.0
"""caralho tandara #voleinaglobo …",0.0,1.0,0.0,0.0,0.0,0.0
"""vamo c calma mo nosso filho v…",0.0,0.0,0.0,0.0,0.0,0.0
"""que puta sentimento lixo. quer…",0.0,0.0,0.0,0.0,0.0,0.0


### ToLD-Br: Data Distribution

In [12]:
# Histogram of the per-category scores, with all six category columns passed
# to plotly in a single call.
_toldbr_category_columns = [
    'homophobia', 'obscene', 'insult', 'racism', 'misogyny', 'xenophobia',
]
_toldbr_category_fig = px.histogram(
    toldbr,
    x=_toldbr_category_columns,
    title='ToLD-Br Category Classification Distribution',
    template='plotly_dark',
)
_toldbr_category_fig.show()

In [13]:
# Add an `offensive` column holding the row-wise sum of the six category scores.
_toldbr_score_exprs = [
    pl.col(category)
    for category in (
        'homophobia', 'obscene', 'insult', 'racism', 'misogyny', 'xenophobia',
    )
]
toldbr_off = toldbr.with_columns([
    pl.sum_horizontal(*_toldbr_score_exprs).alias('offensive'),
])

In [14]:
# Seeded 5-row preview showing the new summed `offensive` column
toldbr_off.sample(5, seed=42)

text,homophobia,obscene,insult,racism,misogyny,xenophobia,offensive
str,f64,f64,f64,f64,f64,f64,f64
"""8 demais porra""",0.0,2.0,0.0,0.0,0.0,0.0,2.0
"""rt @user @user felizmente os s…",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""caralho tandara #voleinaglobo …",0.0,1.0,0.0,0.0,0.0,0.0,1.0
"""vamo c calma mo nosso filho v…",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""que puta sentimento lixo. quer…",0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Distribution of the summed `offensive` score, coloured by score value.
_toldbr_offensive_fig = px.histogram(
    toldbr_off,
    x='offensive',
    color='offensive',
    title='ToLD-Br Offensive Distribution',
    template='plotly_dark',
)
_toldbr_offensive_fig.show()

In [16]:
# Score thresholds used to binarise the ToLD-Br columns, keyed by the suffix of the
# generated column names: a "low" flag is set when score >= 1, a "high" flag when
# score >= 2. Insertion order determines the output order (low block, then high).
TOLDBR_CLASS_THRESHOLDS = {'low': 1, 'high': 2}

# Score columns to binarise, in output order (the summed `offensive` score first).
TOLDBR_SCORE_COLUMNS = [
    'offensive', 'homophobia', 'obscene', 'insult', 'racism', 'misogyny', 'xenophobia',
]


def _toldbr_threshold_flags(threshold, suffix):
    """Build one boolean `is_<column>_<suffix>` expression per ToLD-Br score column.

    Args:
        threshold: Minimum score (inclusive) for a flag to be True.
        suffix: Suffix appended to each generated column name (e.g. 'low').

    Returns:
        A list of Polars expressions, one per entry of TOLDBR_SCORE_COLUMNS,
        in the same order.
    """
    return [
        # `fill_null(False)` makes a missing score count as "not flagged" rather
        # than propagating a null into the boolean column.
        (pl.col(column) >= threshold).fill_null(False).alias(f'is_{column}_{suffix}')
        for column in TOLDBR_SCORE_COLUMNS
    ]


# Keep only the text plus the generated flags; the raw score columns are dropped.
toldbr_discrete = toldbr_off.select(
    'text',
    *[
        flag
        for suffix, threshold in TOLDBR_CLASS_THRESHOLDS.items()
        for flag in _toldbr_threshold_flags(threshold, suffix)
    ],
)


In [17]:
# Seeded 5-row preview of the boolean low/high flags
toldbr_discrete.sample(5, seed=42)

text,is_offensive_low,is_homophobia_low,is_obscene_low,is_insult_low,is_racism_low,is_misogyny_low,is_xenophobia_low,is_offensive_high,is_homophobia_high,is_obscene_high,is_insult_high,is_racism_high,is_misogyny_high,is_xenophobia_high
str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""8 demais porra""",True,False,True,False,False,False,False,True,False,True,False,False,False,False
"""rt @user @user felizmente os s…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""caralho tandara #voleinaglobo …",True,False,True,False,False,False,False,False,False,False,False,False,False,False
"""vamo c calma mo nosso filho v…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""que puta sentimento lixo. quer…",False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
# Print the label counts of the overall offensive flag at both thresholds.
for _heading, _flag_column in (
    ("ToLD-BR (At least 1 flag)", 'is_offensive_low'),
    ("ToLD-BR (At least 2 flags)", 'is_offensive_high'),
):
    print(_heading)
    print(toldbr_discrete[_flag_column].value_counts())

ToLD-BR (At least 1 flag)
shape: (2, 2)
┌──────────────────┬───────┐
│ is_offensive_low ┆ count │
│ ---              ┆ ---   │
│ bool             ┆ u32   │
╞══════════════════╪═══════╡
│ false            ┆ 11745 │
│ true             ┆ 9255  │
└──────────────────┴───────┘
ToLD-BR (At least 2 flags)
shape: (2, 2)
┌───────────────────┬───────┐
│ is_offensive_high ┆ count │
│ ---               ┆ ---   │
│ bool              ┆ u32   │
╞═══════════════════╪═══════╡
│ true              ┆ 4816  │
│ false             ┆ 16184 │
└───────────────────┴───────┘


In [19]:
# One histogram per threshold variant of the overall offensive flag,
# shown in the same order as before (low first, then high).
for _flag_column, _chart_title in (
    ('is_offensive_low', 'ToLD-Br Offensive Distribution (Discrete Low)'),
    ('is_offensive_high', 'ToLD-Br Offensive Distribution (Discrete High)'),
):
    px.histogram(
        toldbr_discrete,
        x=_flag_column,
        color=_flag_column,
        title=_chart_title,
        template='plotly_dark',
    ).show()

#### An overview of the balance of the dataset

This dataset is far more balanced than OLID-BR. However, there seems to be a
slight issue with the labeling criteria: there is no separate filtering for
profanity, and it seems that one of the labelers considered profanity to be
obscene, while the others did not.

When going for a stricter filter, we might use the "Low" variant, while for a
more relaxed one, we might use the "High" variant.

## Simplifying the datasets

In [20]:
# Reference preview of the OLID-BR frame before it is projected onto the shared schema
olidbr_recompiled.sample(5, seed=42)

id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia,offensive,offensive_allow_profane,offensive_discrete,offensive_allow_profane_discrete
str,str,str,str,str,str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,u32,u32,bool,bool
"""92882c210108475ba3af23a2a10a4e…","""Muita merda""","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…","""OFF""","""TIN""","""GRP""","""[102, 103, 104, 105, 106, 107,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…","""OFF""","""TIN""","""IND""","""[0, 1, 2, 3, 4, 5, 6, 7, 8, 32…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True
"""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…","""NOT""","""UNT""",,"""[]""",False,False,False,False,False,False,False,False,False,False,False,0,0,False,False
"""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…","""OFF""","""UNT""",,"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,…",False,False,True,False,False,False,False,False,False,False,False,1,1,True,True


In [21]:
# Reference preview of the ToLD-Br frame before it is projected onto the shared schema
toldbr_discrete.sample(5, seed=42)

text,is_offensive_low,is_homophobia_low,is_obscene_low,is_insult_low,is_racism_low,is_misogyny_low,is_xenophobia_low,is_offensive_high,is_homophobia_high,is_obscene_high,is_insult_high,is_racism_high,is_misogyny_high,is_xenophobia_high
str,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool
"""8 demais porra""",True,False,True,False,False,False,False,True,False,True,False,False,False,False
"""rt @user @user felizmente os s…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""caralho tandara #voleinaglobo …",True,False,True,False,False,False,False,False,False,False,False,False,False,False
"""vamo c calma mo nosso filho v…",False,False,False,False,False,False,False,False,False,False,False,False,False,False
"""que puta sentimento lixo. quer…",False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [22]:
# Project ToLD-Br onto the shared schema: dataset, id, text, off_strict, off_relaxed.
_toldbr_text = pl.col('text')

toldbr_final = toldbr_discrete.select(
    pl.lit('ToLD-Br').alias('dataset'),
    # The id is the hash of the original text (taken before the @user replacement
    # below), stored as a string.
    # NOTE(review): identical texts get identical ids, and the Polars docs describe
    # `Expr.hash` as not stable across Polars versions - confirm that these ids do
    # not need to be unique or reproducible.
    _toldbr_text.hash().cast(pl.String).alias('id'),
    # Replace @user with USER, to match OLID-BR
    _toldbr_text.str.replace_all('@user', 'USER'),
    pl.col('is_offensive_low').alias('off_strict'),
    pl.col('is_offensive_high').alias('off_relaxed'),
)
toldbr_final.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""ToLD-Br""","""12559603335008373494""","""8 demais porra""",True,True
"""ToLD-Br""","""1455693542618679752""","""rt USER USER felizmente os sol…",False,False
"""ToLD-Br""","""17500462964768468320""","""caralho tandara #voleinaglobo …",True,False
"""ToLD-Br""","""2818557597701017591""","""vamo c calma mo nosso filho v…",False,False
"""ToLD-Br""","""14538197625843834331""","""que puta sentimento lixo. quer…",False,False


In [23]:
# Project OLID-BR onto the shared schema: dataset, id, text, off_strict, off_relaxed.
# The flag that counts every category becomes `off_strict`; the flag that ignores
# `profanity_obscene` becomes `off_relaxed`.
olidbr_final = olidbr_recompiled.select(
    pl.lit('OLID-Br').alias('dataset'),
    pl.col('id').cast(pl.String),
    pl.col('text'),
    pl.col('offensive_discrete').alias('off_strict'),
    pl.col('offensive_allow_profane_discrete').alias('off_relaxed'),
)
olidbr_final.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""OLID-Br""","""92882c210108475ba3af23a2a10a4e…","""Muita merda""",True,True
"""OLID-Br""","""81ddfdb7d97442f29e7808531e6aa2…","""USER de idade não pode respond…",True,True
"""OLID-Br""","""6c07f22150fd41f4a54efbfaf30bd2…","""RIDICULA, USER USER, PREPOTENT…",True,True
"""OLID-Br""","""a9edcee25e5a409a8ef66c603d89ed…","""USER mesmo, porque até os anos…",False,False
"""OLID-Br""","""29891496348a4638841fc59db34821…","""lixo total, uma propaganda des…",True,True


In [24]:
# Stack the two simplified frames (ToLD-Br rows first, then OLID-BR rows)
final_dataset = pl.concat([toldbr_final, olidbr_final])
# Seeded 5-row preview of the joint dataset
final_dataset.sample(5, seed=534)

dataset,id,text,off_strict,off_relaxed
str,str,str,bool,bool
"""ToLD-Br""","""9322321234464027338""","""pérola tá terrível , ela se ac…",True,False
"""ToLD-Br""","""17106631069393288087""","""USER parabéns pela coragem pq …",True,True
"""ToLD-Br""","""12766216725703474573""","""rt USER porra eu chorei demais…",True,False
"""OLID-Br""","""88aacb3db46a4ff8a70b563cc9e50f…","""USER Evoluiu...antes comia tra…",True,True
"""OLID-Br""","""878d1939c62e4658a6697d20535dcb…","""O presidente é quem manda no p…",True,True


In [None]:
# Persist the joint dataset as a zstd-compressed Parquet file, creating the
# output directory first if it does not exist yet.
_joint_dir = os.path.join(ROOT_PATH, "data/joint")
_joint_file = os.path.join(ROOT_PATH, "data/joint/data.parquet.zstd")

os.makedirs(_joint_dir, exist_ok=True)
final_dataset.write_parquet(_joint_file, compression="zstd", compression_level=9)