In [22]:
!pip install -q transformers datasets evaluate accelerate ipywidgets -U


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [23]:
import pandas as pd
import torch
import numpy as np
import evaluate
from datasets import load_dataset,DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import ClassLabel

# Set the device for training (will automatically use your GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [24]:
raw_dataset = load_dataset("csv", data_files="../dataset/dataset1.csv")
all_columns = list(raw_dataset['train'].features.keys())
columns_to_keep = ['text', 'label']
columns_to_remove = [col for col in all_columns if col not in columns_to_keep]

# Remove the unwanted columns from all splits in the DatasetDict
processed_dataset = raw_dataset.remove_columns(columns_to_remove)
print(processed_dataset['train'].features)
print(processed_dataset['train'][0])

{'text': Value('string'), 'label': Value('string')}
{'text': 'dalits are lowlives', 'label': 'hate'}


In [25]:
def map_labels_to_int(example):
    # Map 'hate' to 1 and 'no-hate' to 0
    example['label'] = 1 if example['label'] == 'hate' else 0
    return example

# Apply the mapping function to the entire dataset
processed_dataset = processed_dataset.map(map_labels_to_int)
class_label_feature = ClassLabel(num_classes=2, names=['no-hate', 'hate'])

processed_dataset = processed_dataset.cast_column("label", class_label_feature)


print("\nProcessed dataset features:")
print(processed_dataset['train'].features)
print(processed_dataset['train'][0])




Processed dataset features:
{'text': Value('string'), 'label': ClassLabel(names=['no-hate', 'hate'])}
{'text': 'dalits are lowlives', 'label': 1}


## Split the 'train' dataset into training (80%) and a test set (20%)


In [26]:
train_test_split = processed_dataset["train"].train_test_split(test_size=0.2, seed=42)
test_validation_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

# Create the final DatasetDict
split_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': test_validation_split['train'],
    'test': test_validation_split['test']
})

print("\nDataset splits:")
print(split_dataset)





Dataset splits:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 32915
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4114
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4115
    })
})


In [27]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Create a function to tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenization to all splits of the dataset
tokenized_datasets = split_dataset.map(tokenize_function, batched=True)

tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

print("\nTokenized dataset sample:")
print(tokenized_datasets["train"][0])



Map: 100%|██████████| 32915/32915 [00:06<00:00, 5150.29 examples/s]
Map: 100%|██████████| 4114/4114 [00:00<00:00, 5420.66 examples/s]
Map: 100%|██████████| 4115/4115 [00:00<00:00, 5194.83 examples/s]


Tokenized dataset sample:
{'labels': tensor(1), 'input_ids': tensor([  101,  1996, 21288,  2323,  2022,  7917,  1999,  2885,  1998,  2151,
        10644,  2923,  1010,  4314,  2406,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
  




In [28]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

# Define evaluation metrics
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-hate-speech-checker",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start training
print("\nStarting training...")
trainer.train()

# --- Model Evaluation ---
print("\nEvaluating the model on the test set...")
eval_results = trainer.evaluate(tokenized_datasets["test"])

print("\nEvaluation results:")
print(eval_results)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5002,0.526922,0.755469,0.794945,0.712199,0.899446
2,0.3677,0.48671,0.780992,0.80106,0.768319,0.836716
3,0.2917,0.706569,0.782207,0.796917,0.783422,0.810886



Evaluating the model on the test set...



Evaluation results:
{'eval_loss': 0.4675636887550354, 'eval_accuracy': 0.7934386391251519, 'eval_f1': 0.8179091688089117, 'eval_precision': 0.7934330839567747, 'eval_recall': 0.8439434129089302, 'eval_runtime': 83.2374, 'eval_samples_per_second': 49.437, 'eval_steps_per_second': 6.187, 'epoch': 3.0}


In [30]:
save_directory = model_checkpoint + "-hate-speech-checker/best-weight"
trainer.save_model(save_directory)

In [31]:
del trainer