In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.6 MB/s[0m eta [36m0:00:0

In [3]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [4]:
# Load the dataset
train = pd.read_csv("dataset.csv")
test = pd.read_csv("data4.csv")

In [5]:
# Preprocess text function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    clean_tokens = [token for token in tokens if token.isalnum()]
    return ' '.join(clean_tokens)

train['text'] = train['text'].apply(preprocess_text)
test['text'] = test['text'].apply(preprocess_text)

In [6]:
# Load AraBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")
model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabert", num_labels=2)

Downloading (…)okenizer_config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/717k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)



In [8]:
# Set max sequence length based on your dataset
MAX_LENGTH = 128

In [9]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze(),
                'labels': torch.tensor(label)}

In [10]:
# Split dataset into train and validation sets
train, val_data = train_test_split(train, test_size=0.2, random_state=42)

In [11]:
# Create datasets and dataloaders
train_dataset = HateSpeechDataset(train['text'].tolist(), train['label'].tolist(), tokenizer, MAX_LENGTH)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [12]:
val_dataset = HateSpeechDataset(val_data['text'].tolist(), val_data['label'].tolist(), tokenizer, MAX_LENGTH)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [13]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
loss_fn = nn.CrossEntropyLoss()

In [15]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()

        # Pass the input data through the BERT model
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

    # Evaluation on validation set after each epoch
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Pass the input data through the BERT model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            logits = outputs.logits
            val_loss += loss_fn(logits, labels).item()
            predictions = torch.argmax(logits, dim=1)
            total += labels.size(0)
            correct += (predictions == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {accuracy:.2f}%")

Epoch 1/5, Val Loss: 66.0646, Val Accuracy: 77.21%
Epoch 2/5, Val Loss: 58.5327, Val Accuracy: 81.10%
Epoch 3/5, Val Loss: 66.5789, Val Accuracy: 81.01%
Epoch 4/5, Val Loss: 69.9133, Val Accuracy: 81.28%
Epoch 5/5, Val Loss: 89.6008, Val Accuracy: 80.56%


In [16]:
# Test the model
test_dataset = HateSpeechDataset(test['text'].tolist(), test['label'].tolist(), tokenizer, MAX_LENGTH)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [17]:
model.eval()
test_predictions = []
true_labels = []

In [18]:
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Pass the input data through the BERT model
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

In [19]:
# Calculate metrics
precision = precision_score(true_labels, test_predictions)
recall = recall_score(true_labels, test_predictions)
accuracy = accuracy_score(true_labels, test_predictions)
conf_matrix = confusion_matrix(true_labels, test_predictions)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)

Precision: 0.3657142857142857
Recall: 0.5245901639344263
Accuracy: 0.8316733067729084
Confusion Matrix:
[[771 111]
 [ 58  64]]
