# Hugging Face Transformers
Bastián González-Bustamante \
Universidad Diego Portales \
Noviembre 2024

In [24]:
## Dependencies
import pandas as pd
from datetime import datetime
from transformers import pipeline
from tqdm import tqdm
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
## Text-classification pipeline for the bert-spanish-toxicity checkpoint
## (trained on protest event data; reported F1-Score: 0.849)
toxic_classifier_1 = pipeline(
    task="text-classification",
    model="bgonzalezbustamante/bert-spanish-toxicity",
)

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/730k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [4]:
## Text-classification pipeline for the ft-xlm-roberta-toxicity checkpoint
## (trained on protest event data; reported F1-Score: 0.789)
toxic_classifier_2 = pipeline(
    task="text-classification",
    model="bgonzalezbustamante/ft-xlm-roberta-toxicity",
)

config.json:   0%|          | 0.00/915 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [5]:
## Text-classification pipeline for the ft-roberta-toxicity checkpoint
## (trained on protest event data; reported F1-Score: 0.767, not recommended)
toxic_classifier_3 = pipeline(
    task="text-classification",
    model="bgonzalezbustamante/ft-roberta-toxicity",
)

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

In [8]:
## Inspect the first loaded pipeline: the bare expression displays the object's repr
toxic_classifier_1

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x1e87b45b050>

In [11]:
## Read Data
protests_csv = "hf://datasets/bgonzalezbustamante/toxicity-protests-ES/goldstd_protests.csv"
convention_csv = "hf://datasets/bgonzalezbustamante/toxicity-Constitution-ES/goldstd_Convention.csv"

## Protest gold standard: already known to the models, so results on it are overfitted
training_data = pd.read_csv(protests_csv)
## Convention gold standard: used below as the test set
test_data = pd.read_csv(convention_csv)

## Preview the first rows of the protest data
training_data.head()

Unnamed: 0,id_obs,coder_1,coder_2,consensus,sec_create_1,sec_create_2,sec_review_1,sec_review_2,possibly_sensitive,lang,...,THREAT,date,tox_60,tox_70,tox_80,tox_90,insult_60,insult_70,insult_80,insult_90
0,101238,0,0,1.0,46,28,17,8,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
1,119343,0,0,1.0,8,6,0,2,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
2,122343,0,0,1.0,8,6,1,0,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
3,131878,0,0,1.0,4,52,0,1,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
4,132171,0,0,1.0,6,15,0,1,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0


In [12]:
## Preview the first rows of the test set
test_data.head()

Unnamed: 0,id_obs,coder_1,coder_2,consensus,sec_create_1,sec_create_2,sec_review_1,sec_review_2,created_at,text,...,THREAT_EXPERIMENTAL,date,tox_60,tox_70,tox_80,tox_90,insult_60,insult_70,insult_80,insult_90
0,4221,0,0,1,9,5,5,0,2021-09-01T20:35:50.000Z,@guillelarrain @bcentralchile @Jaime_Bassa @fe...,...,0.005544,2021-09-01,0,0,0,0,0,0,0,0
1,6373,0,0,1,11,5,4,0,2022-06-01T13:54:11.000Z,@AlGrano921 @rmorenoconcha @AgriculturaFM @gon...,...,0.005622,2022-06-01,0,0,0,0,0,0,0,0
2,13303,0,0,1,8,4,5,0,2021-01-07T01:35:33.000Z,@cherrerafe @bdelamaza Después Trump bloqueará...,...,0.005987,2021-01-07,0,0,0,0,0,0,0,0
3,18873,0,0,1,7,3,3,0,2021-07-04T15:58:37.000Z,@berfontaine @melnicksergio Q terrible.,...,0.005575,2021-07-04,0,0,0,0,0,0,0,0
4,21074,0,0,1,11,3,4,0,2022-03-03T12:39:46.000Z,@BRebolledoD17 Más gasto de dinero inutilmente,...,0.005461,2022-03-03,0,0,0,0,0,0,0,0


In [33]:
## List to store classification (one pipeline result per text, in input order)
out_toxicity_1 = []

## Time
start_time = datetime.now()

## Iterate and classify
## tqdm wraps the iterable, so the bar advances and closes on its own
for text in tqdm(test_data["text"], total=len(test_data), desc="Processing"):
    ## truncation=True cuts inputs that exceed the model's maximum sequence length
    ## instead of failing on them; short texts are unaffected
    result = toxic_classifier_1(text, truncation=True)
    out_toxicity_1.append(result)

## Time
end_time = datetime.now()
print("Time taken to classify:", end_time - start_time)

## Merge
## Each pipeline call returns a list of {"label", "score"} dicts; flatten them
## into a single table
flat_list_1 = [item for sublist in out_toxicity_1 for item in sublist]
merged_tox_1 = pd.DataFrame(flat_list_1)

## The metrics below compare predictions and gold labels by position, so there
## must be exactly one prediction per row of the test set
assert len(merged_tox_1) == len(test_data), "expected one prediction per test row"

Processing: 100%|██████████| 1000/1000 [00:20<00:00, 47.81it/s]

Time taken to classify: 0:00:20.916105





In [34]:
## Preview the first rows of the classifier output (label and score columns)
merged_tox_1.head()

Unnamed: 0,label,score
0,NONTOXIC,0.868035
1,NONTOXIC,0.900176
2,NONTOXIC,0.667441
3,NONTOXIC,0.571407
4,NONTOXIC,0.870018


In [36]:
## Mapping labels
## The 0/1 identity entries make this cell idempotent: re-running it after the
## column is already numeric keeps the values instead of turning them into NaN
label_to_int = {'NONTOXIC': 0, 'TOXIC': 1, 0: 0, 1: 1}
mapped_labels = merged_tox_1['label'].map(label_to_int)

## Ensure no NaN remains; fail before overwriting so the original labels survive
unmapped_labels = merged_tox_1.loc[mapped_labels.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unmapped values in the 'label' column: " + repr(list(unmapped_labels))
    )

merged_tox_1['label'] = mapped_labels

In [42]:
## Performance metrics
## Gold labels (first coder) against the classifier's mapped 0/1 predictions
y_true = test_data["coder_1"]
y_pred = merged_tox_1["label"]

accuracy = accuracy_score(y_true, y_pred)
## Precision, recall and F1 share the same binary averaging
precision, recall, f1 = [
    metric(y_true, y_pred, average="binary")
    for metric in (precision_score, recall_score, f1_score)
]
print("Accuracy:", accuracy, "Precision:", precision, "Recall", recall, "F1-Score", f1)

Accuracy: 0.77 Precision: 0.8792792792792793 Recall 0.7496159754224271 F1-Score 0.8092868988391376


## Ejercicio práctico
1. Aplicar los otros dos clasificadores al test set y comparar performance.
2. Evaluar el nivel de overfitting al aplicar los clasificadores en la data de protestas que ya es conocida para los modelos.
3. ¿Es igual el F1 obtenido al clasificar la data de protesta al reportado en el entrenamiento de los modelos? Argumente.
4. ¿Qué otro average se podría utilizar para precision, recall y F1?