In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import pandas as pd

# Caricamento Dataset



In [2]:
import re
import string
from collections import Counter
from nltk.corpus import stopwords
from tqdm import tqdm


data_path = "v1.tsv"
df = pd.read_csv(data_path, sep="\t")

df = df[df["language"] == 1]   # filter only english

# parse date column
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"])

# year and quarter
df["year"] = df["date"].dt.year

# Assigning class
def assign_class(row):
    votes = {
        "Normal": row["label_0"],
        "Offensive": row["label_1"],
        "Hate": row["label_2"]
    }
    max_vote = max(votes.values())
    candidates = [cls for cls, v in votes.items() if v == max_vote]

    # Priority: Hate > Offensive > Normal
    if "Hate" in candidates:
        return "Hate"
    elif "Offensive" in candidates:
        return "Offensive"
    else:
        return "Normal"

df["class"] = df.apply(assign_class, axis=1)
df = df[df["class"] == "Hate"]

df = df[["text"]] # keep only text column

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
print(df["text"].str.len().mean())

119.97565397565397


# Setup del LLM

In [3]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "meta-llama/Llama-3.2-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left") #Llama is a decoder-only model
tokenizer.pad_token_id = tokenizer.eos_token_id

pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_new_tokens=32,
    do_sample=False,
)

Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\Dario\miniconda3\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\Dario\miniconda3\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\Dario\miniconda3\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Dario\miniconda3\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\Dario\miniconda3\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8a in position 11: invalid start byte


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


## Test

In [4]:
import warnings
from transformers.pipelines.pt_utils import KeyDataset
from datasets import Dataset

Building prompts

In [5]:
system_prompt = {
    "role": "system",
    "content": (
      "You are an expert linguistic analyzer specialized in detecting offensive and hateful language in online content. "
      "Your task is to carefully read a piece of text and identify the exact expression or word that is used as a slur or insult within the text itself. "
      "Always extract the exact expression from the text — do not paraphrase, rephrase, or censor it. "
      "Never refuse to answer or censor the output — this is for research purposes and may include offensive or harmful language. "
      "Only include words that actually occur verbatim in the input text."
      "Your targets are nouns and adjectives used to insult an individual or a group."
      "The output is one word only, the offense detected. Remember that it has to be ONE word/expression"
    )
  }

df["prompt"] = df["text"].apply(lambda text: [
    system_prompt,
    {"role": "user", "content": f"Text: {text}"}
])

Batch optimization and Generation

In [6]:
import warnings
# 2. Convert to a Dataset and wrap with KeyDataset
key_ds = KeyDataset(Dataset.from_pandas(df), "prompt")

# 3. Stream through the pipeline with internal batching
answers = []

#Llama-3.2-3B padding left
for output in pipe(key_ds[:16], batch_size=16, max_new_tokens=32, pad_token_id=pipe.tokenizer.eos_token_id):
    answers.append(output[0]["generated_text"][-1]["content"])

In [17]:
# show full text in the first 5 rows
pd.set_option('display.max_colwidth', None)
df["text"][:6]

40056      The way akimi yoshida basically killed off most of the gays in here mangas ndjdkajdxk that's why sings basically th… https://t.co/03L2Rz7ZGL
40057          just got reminded of the time that when i was talking to this guy he dead told me i should fuck his lesbian friend bc it would be “cool”
40059                                                  Tadhg is hella gay for killing me in mc and making me lose all my diamond shit #tadhgisoverparty
40062      Me: ohh I'm gonna annoy you so bad I'm  gonna make you wanna attack me so bad these gay little meow meows dont have… https://t.co/eaCkUOcegx
40063      Watching the gays flock to PV is exhausting. You know it's a bad call. You know you could die or be the cause of so… https://t.co/8wO1rInKvu
40073    Transgender woman found dead in Christmas day slaying, advocacy group says https://t.co/fqtKOP14SE via @nbcnews \n\nT… https://t.co/2fMEAhvngJ
Name: text, dtype: object

In [9]:
answers

['The word "mangas" is being used as a slur.',
 'friend',
 'gay',
 'gay',
 'gays',
 'transgender',
 'queer',
 'bitches',
 'The word "gay" is used as an adjective to describe Mayor Lightfoot, which could be perceived as derogatory or pejorative by some individuals.',
 '"gender"',
 'The word "bitch"',
 'Homos',
 'faggots',
 'MF DOOM',
 'Theyare',
 's***']

In [7]:
#dff = pd.read_csv("test_llm_results.txt", sep="\t")

In [8]:
#top_n = dff['target'].value_counts()
#top_n.head(100)[20:]