In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import Dataset
from bitsandbytes.nn import Linear8bitLt
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# train_df = pd.read_csv('data/hate_train.csv')
# with open('data/hate_test_data.txt', 'r', encoding='utf-8') as file:
#     test_sentences = file.read().splitlines()
#
# test_df = pd.DataFrame({'sentence': test_sentences})
train_df = pd.read_csv('data/hate_train_cleaned.csv')
test_df = pd.read_csv('data/hate_test_cleaned.csv')

In [3]:
train_df.head()

Unnamed: 0,sentence,label
0,Dla mnie faworytem do tytułu będzie Cracovia. ...,0
1,Brawo ty Daria kibic ma być na dobre i złe,0
2,"Super, polski premier składa kwiaty na grobach...",0
3,Musi. Innej drogi nie mamy.,0
4,"Odrzut natychmiastowy, kwaśna mina, mam problem",0


In [4]:
test_df.head()

Unnamed: 0,sentence
0,"Spoko, jak im Duda z Morawieckim zamówią po pi..."
1,"Ale on tu nie miał szans jej zagrania, a ta 'p..."
2,"No czy Prezes nie miał racji, mówiąc,ze to są ..."
3,Przecież to nawet nie jest przewrotka 😂
4,Owszem podatki tak. Ale nie w takich okoliczno...


In [5]:
# train_df_cleaned = train_df.copy()
# test_df_cleaned = test_df.copy()

def remove_anonymized_account(text):
    if isinstance(text, str):
        return text.replace('@anonymized_account ', '')
    return text

In [6]:
train_df = train_df[~train_df['sentence'].str.startswith('RT ', na=False)]
print(f"Liczba próbek po usunięciu retweetów: {len(train_df)}")

Liczba próbek po usunięciu retweetów: 9387


In [7]:
# train_df_cleaned['sentence'] = train_df_cleaned['sentence'].apply(remove_anonymized_account)
# test_df_cleaned['sentence'] = test_df_cleaned['sentence'].apply(remove_anonymized_account)

In [8]:
# train_df_cleaned.to_csv('data/hate_train_cleaned.csv', index=False)
# test_df_cleaned.to_csv('data/hate_test_cleaned.csv', index=False)

In [9]:
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df['label'])

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_df)

Konfiguracja modelu i tokenizera

In [10]:
model_name = "szymonrucinski/Curie-7B-v1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 8448/8448 [00:00<00:00, 25305.78 examples/s]
Map:   0%|          | 0/939 [00:00<?, ? examples/s]
Map: 100%|██████████| 939/939 [00:00<00:00, 23031.61 examples/s]
Map: 100%|██████████| 939/939 [00:00<00:00, 23031.61 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 23480.40 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 23480.40 examples/s]


Konfiguracja LoRA dla efektywnego fine-tuningu

In [11]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config
)

model.config.pad_token_id = tokenizer.pad_token_id

model = get_peft_model(model, peft_config)

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at szymonrucinski/Curie-7B-v1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.69s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at szymonrucinski/Curie-7B-v1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
training_args = TrainingArguments(
    output_dir="./models/curie-hate-speech",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    fp16=True,
)

Definiowanie metryki do ewaluacji

In [15]:
def compute_metrics(eval_preds):
    import numpy as np
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

Inicjalizowanie Trainera i rozpoczęcie treningu

In [17]:
print("Rozpoczynanie treningu...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Rozpoczynanie treningu...


ValueError: Cannot handle batch sizes > 1 if no padding token is defined.

Zapisanie modelu i ewaluacja danych

In [None]:
model.save_pretrained("./models/curie-hate-speech-final")
tokenizer.save_pretrained("./models/curie-hate-speech-final")

In [None]:
print("Ewaluacja modelu...")
eval_results = trainer.evaluate()
print(f"Wyniki ewaluacji: {eval_results}")

In [None]:
raw_predictions = trainer.predict(test_dataset)
predictions = raw_predictions.predictions.argmax(axis=1)

test_df_with_predictions = test_df.copy()

output_path = 'pred.csv'
output_df = pd.DataFrame({'prediction': predictions})
output_df.to_csv(output_path, index=False, header=False)

print(output_df.head(5))

hate_speech_count = sum(predictions)
hate_speech_percentage = (hate_speech_count / len(predictions)) * 100
print(f"\nLiczba tekstów sklasyfikowanych jako mowa nienawiści: {hate_speech_count} ({hate_speech_percentage:.2f}%)")
