In [26]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [27]:
model_name = "piubamas/beto-contextualized-hate-speech"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
df = pd.read_csv('../data/raw/elecciones_argentina.csv')

print(len(df))

55374


In [31]:
filtered_df = df.dropna(subset=['text'])

print(len(filtered_df))

52476


In [32]:
id2label = [model.config.id2label[k] for k in range(len(model.config.id2label))]

def predict(*args):
    try:
        encoding = tokenizer.encode_plus(*args)

        inputs = {
            k: torch.LongTensor(encoding[k]).reshape(1, -1) for k in {"input_ids", "attention_mask", "token_type_ids"}
        }

        output = model.forward(
            **inputs
        )

        chars = list(zip(id2label, list(output.logits[0].detach().cpu().numpy() > 0)))

        return [char for char, pred in chars if pred]
    
    except Exception as e:
        # Handle the error appropriately
        print("An error occurred:", str(e))
        return np.nan

In [33]:
texts = filtered_df['text'].to_list()

outputs = []
for text in tqdm(texts):
    output = predict(text)
    outputs.append(output)

filtered_df['label'] = outputs

100%|██████████| 52476/52476 [1:06:43<00:00, 13.11it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['label'] = outputs


In [36]:
filtered_df.to_csv('../data/raw/datos_etiquetados.csv', index=False)