In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "piubamas/beto-contextualized-hate-speech"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
df = pd.read_csv("../data/raw/elecciones_argentina.csv")

print(len(df))

84622


In [4]:
filtered_df = df.dropna(subset=["text"])

print(len(filtered_df))

79887


In [5]:
id2label = [model.config.id2label[k] for k in range(len(model.config.id2label))]


def predict(*args):
    try:
        encoding = tokenizer.encode_plus(*args)

        inputs = {
            k: torch.LongTensor(encoding[k]).reshape(1, -1)
            for k in {"input_ids", "attention_mask", "token_type_ids"}
        }

        output = model.forward(**inputs)

        chars = list(zip(id2label, list(output.logits[0].detach().cpu().numpy() > 0)))

        return [char for char, pred in chars if pred]

    except Exception as e:
        # Handle the error appropriately
        print("An error occurred:", str(e))
        return np.nan

In [6]:
texts = filtered_df["text"].to_list()

outputs = []
for text in tqdm(texts):
    output = predict(text)
    outputs.append(output)

filtered_df["label"] = outputs

100%|██████████| 79887/79887 [1:22:22<00:00, 16.16it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["label"] = outputs


In [9]:
filtered_df.head(3)

Unnamed: 0,tweet_id,username,url,text,image,video,date,reply,retweet,like,...,is_rt,rt_by,raw,query_type,scraped_at,event,date_cleansed,date_local,dt_date,label
0,1708973726455181497,@soyingridbeck,https://x.com/soyingridbeck/status/17089737264...,Sólo @SergioMassa y @myriambregman abordaron a...,,,2023-10-02T22:34:22+00:00,0,3,17,...,False,,{'html_tweet': '𝗜𝗻𝗴𝗿𝗶𝗱 𝗕𝗲𝗰𝗸 @soyingridbeck·Oct...,from: @soyingridbeck,2023-10-10T17:43:36.448550+00:00,1er debate,2023-10-02 22:34:22+00:00,2023-10-02 19:34:22-03:00,2023-10-02,[]
1,1708809681924243938,@soyingridbeck,https://x.com/soyingridbeck/status/17088096819...,¿Irse a Marbella en un yate de lujo?,,,2023-10-02T11:42:30+00:00,1,0,0,...,False,,{'html_tweet': '𝗜𝗻𝗴𝗿𝗶𝗱 𝗕𝗲𝗰𝗸 @soyingridbeck·Oct...,from: @soyingridbeck,2023-10-10T17:43:36.612322+00:00,1er debate,2023-10-02 11:42:30+00:00,2023-10-02 08:42:30-03:00,2023-10-02,[]
2,1708810804101853498,@kgalperin,https://x.com/kgalperin/status/170881080410185...,"Me asombra esa interpretación, q expresa muy b...",,,2023-10-02T11:46:58+00:00,0,0,1,...,False,,{'html_tweet': 'Karina Galperin@kgalperin·Oct ...,to: @soyingridbeck,2023-10-10T18:22:43.429345+00:00,1er debate,2023-10-02 11:46:58+00:00,2023-10-02 08:46:58-03:00,2023-10-02,[]


In [8]:
filtered_df.to_csv("../data/processed/datos_etiquetados.csv", index=False)