<a href="https://colab.research.google.com/github/DilkiIshara/Hate-Speech-Detection-System/blob/main/Hate%20Speech%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install scikit-learn
!pip install torch torchvision torchaudio




In [None]:
import csv
import re
import os
import time
from collections import defaultdict as ddict

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
# from transformers import (AdamW, XLMRobertaForSequenceClassification,
#                           XLMRobertaTokenizer, get_linear_schedule_with_warmup)
from transformers import XLMRobertaTokenizer, XLMRobertaModel, MarianMTModel, MarianTokenizer, AutoTokenizer,AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


# --- Dataset locations (all under the mounted Drive folder) ---
english_data_path = '/content/drive/MyDrive/HSDData/EnglishDataSet.csv'
sinhala_data_path = '/content/drive/MyDrive/HSDData/SinhalaDataSet.tsv'
translated_data_dir = '/content/drive/MyDrive/HSDData/Translated_Batches/'
translated_sinhala_file = '/content/drive/MyDrive/HSDData/Translated_Sinhala.csv'

# Per-batch translation CSVs are written into this directory; create it if absent.
os.makedirs(translated_data_dir, exist_ok=True)

# --- Seq2seq checkpoint used for translation (English to Sinhala) ---
translation_model_name = "thilina/mt5-sinhalese-english"
translation_tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)

# Translate function with retries and delay
def translate_to_sinhala(text, retries=2, success_delay=0, retry_delay=2):
    """Translate one text with the module-level translation tokenizer/model.

    The text is encoded with ``translation_tokenizer``, passed through
    ``translation_model.generate`` and decoded again. On any exception the
    call is retried; if every attempt fails the original ``text`` is returned
    so a single bad row never aborts a whole batch.

    Args:
        text: The string to translate. Non-string values (``None``, NaN read
            from a CSV, ...) and blank strings are returned unchanged without
            invoking the model.
        retries: Maximum number of attempts (default 2, as before).
        success_delay: Seconds to sleep after a successful translation.
            Defaults to 0: the model runs locally, so there is no remote rate
            limit to respect and a per-row sleep only slows the batch down.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The translated string, or ``text`` itself when it cannot be translated.
    """
    # Nothing to translate: avoid a tokenizer failure followed by pointless retries.
    if not isinstance(text, str) or not text.strip():
        return text

    for attempt in range(retries):
        try:
            inputs = translation_tokenizer.encode(text, return_tensors="pt")
            translated_tokens = translation_model.generate(inputs, max_length=1000)
            translated_text = translation_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for text: {text} - {e}")
            if attempt < retries - 1:
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print(f"Translation failed after {retries} attempts for text: {text}")
                return text  # Return original text if all retries fail
        else:
            if success_delay:
                time.sleep(success_delay)
            return translated_text

    # Only reached when retries <= 0: no attempt was made.
    return text

# Translate dataset in chunks
def translate_in_chunks(data, chunk_size=1000, output_dir=None, translate_fn=None):
    """Translate ``data['text']`` batch by batch, writing one CSV per batch.

    Each batch of ``chunk_size`` rows is written to
    ``translated_ML_batch_<n>.csv`` (columns ``translated_text`` and ``label``).
    Batches whose file already exists are skipped, so an interrupted run can
    simply be restarted. ``data`` itself is never modified.

    Args:
        data: DataFrame with at least the columns ``text`` and ``label``.
        chunk_size: Number of rows per batch file; must be positive.
        output_dir: Directory for the batch files. Defaults to the module-level
            ``translated_data_dir``.
        translate_fn: Callable applied to every text. Defaults to the
            module-level ``translate_to_sinhala``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if output_dir is None:
        output_dir = translated_data_dir
    if translate_fn is None:
        translate_fn = translate_to_sinhala

    for start in range(0, len(data), chunk_size):
        batch_number = start // chunk_size + 1
        chunk_file = os.path.join(output_dir, f'translated_ML_batch_{batch_number}.csv')

        if os.path.exists(chunk_file):
            print(f"Batch {batch_number} already translated, skipping.")
            continue

        print(f"Translating batch {batch_number}...")
        chunk = data.iloc[start:start + chunk_size]
        # Build a fresh frame instead of assigning a column onto a slice of
        # `data` (which triggers SettingWithCopyWarning).
        translated = pd.DataFrame({
            'translated_text': chunk['text'].apply(translate_fn),
            'label': chunk['label'],
        })

        # Write to a temporary name first and rename afterwards, so that an
        # interrupted write never leaves a truncated batch file that a resumed
        # run would treat as "already translated". The temporary name does not
        # start with 'translated_ML_batch_' so it is never picked up as a batch.
        partial_file = os.path.join(output_dir, f'.tmp_translated_ML_batch_{batch_number}.csv')
        translated.to_csv(partial_file, index=False)
        os.replace(partial_file, chunk_file)
        print(f"Batch {batch_number} saved to {chunk_file}")

# Check if the combined translated file exists
if not os.path.exists(translated_sinhala_file):
    # Load the English dataset
    english_data = pd.read_csv(english_data_path)

    # Translate in chunks. Batches whose CSV already exists are skipped inside
    # translate_in_chunks, so this only does work for batches that are missing
    # and makes the cell runnable from a clean Drive folder.
    translate_in_chunks(english_data, chunk_size=1000)

    # Collect batch files and order them by batch NUMBER. A plain sorted() is
    # lexicographic and would place batch_10 before batch_2.
    batch_name_pattern = re.compile(r'^translated_ML_batch_(\d+)\.csv$')
    numbered_batches = []
    for file_name in os.listdir(translated_data_dir):
        name_match = batch_name_pattern.match(file_name)
        if name_match:
            numbered_batches.append((int(name_match.group(1)), file_name))
    numbered_batches.sort()

    if not numbered_batches:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise FileNotFoundError(
            f"No translated batch files found in {translated_data_dir}"
        )

    # Combine translated batches
    translated_batches = [
        pd.read_csv(os.path.join(translated_data_dir, file_name))
        for _, file_name in numbered_batches
    ]
    english_data_translated = pd.concat(translated_batches, ignore_index=True)

    # Save combined translated file
    english_data_translated.to_csv(translated_sinhala_file, index=False)
    print(f"Combined translated dataset saved to {translated_sinhala_file}")
else:
    print(f"Combined translated dataset already exists at {translated_sinhala_file}")
    english_data_translated = pd.read_csv(translated_sinhala_file)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Combined translated dataset already exists at /content/drive/MyDrive/HSDData/Translated_Sinhala.csv


Load the Sinhala dataset and combine it with the translated English data

In [None]:
combined_data_file = '/content/drive/MyDrive/HSDData/Combined_Sinhala_Data.csv'
sinhala_data = pd.read_csv(sinhala_data_path, sep='\t')
# Convert Sinhala labels from NOT/OFF to numeric.
# Any label outside this mapping becomes NaN and is dropped (and reported) below.
sinhala_data['label'] = sinhala_data['label'].map({'NOT': 0, 'OFF': 1})

# Select only the necessary columns from SinhalaDataSet
sinhala_data = sinhala_data[['text', 'label']]

# Rename column in Translated_Sinhala so both frames share the same schema
english_data_translated = english_data_translated.rename(columns={"translated_text": "text"})

# Combine datasets
combined_data = pd.concat([sinhala_data, english_data_translated], ignore_index=True)

# Rows without text (e.g. an empty translation read back from CSV as NaN) or
# without a usable label would crash later steps: `text.lower()` fails on a
# float NaN and `astype(int)` fails on a NaN label. Drop them here, visibly.
rows_before_cleanup = len(combined_data)
combined_data = combined_data.dropna(subset=['text', 'label']).reset_index(drop=True)
rows_dropped = rows_before_cleanup - len(combined_data)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing text or unmapped label.")

# Save combined dataset
combined_data.to_csv(combined_data_file, index=False)
print(f"Combined dataset saved to {combined_data_file}")

Combined dataset saved to /content/drive/MyDrive/HSDData/Combined_Sinhala_Data.csv


# Preprocessing

In [None]:
import re
from nltk.corpus import stopwords
import nltk
# Location of the Sinhala stopword list on the mounted Drive.
sinhala_stopwords_path = '/content/drive/MyDrive/HSDData/Sinhala_Stop_Words.txt'

# Make sure NLTK's stopword corpus is available locally.
nltk.download('stopwords')

# Read the file in one go and keep its lines as a set for fast membership tests.
with open(sinhala_stopwords_path, 'r', encoding='utf-8') as stopword_file:
    stopword_lines = stopword_file.read().splitlines()
sinhala_stopwords = set(stopword_lines)
print(f"Loaded {len(sinhala_stopwords)} Sinhala stopwords from file.")

# Preprocess text based on language
def preprocess_text(text, stop_words=None):
    """Lower-case, strip punctuation and digits, and remove stopwords.

    Args:
        text: The raw text. Anything that is not a ``str`` (``None``, NaN from
            a CSV, ...) is treated as empty and yields ``''``.
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stopwords plus the module-level ``sinhala_stopwords``
            are used.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    import unicodedata  # local import: only this function needs it

    if not isinstance(text, str):
        return ''

    if stop_words is None:
        # stopwords.words('english') reloads the word list on every call, so
        # fetch it once and cache it on the function instead of once per token.
        english_stop_words = getattr(preprocess_text, '_english_stop_words', None)
        if english_stop_words is None:
            english_stop_words = frozenset(stopwords.words('english'))
            preprocess_text._english_stop_words = english_stop_words

        def is_stop_word(word):
            return word in english_stop_words or word in sinhala_stopwords
    else:
        def is_stop_word(word):
            return word in stop_words

    def drop_unless_part_of_word(match):
        # In Python's `re`, \w does NOT match combining marks. Sinhala vowel
        # signs and the virama are combining marks (Unicode category M*), and
        # conjuncts rely on the zero-width (non-)joiner. Deleting them corrupts
        # the words, so keep those and drop every other non-word character.
        char = match.group(0)
        if char in '\u200c\u200d' or unicodedata.category(char).startswith('M'):
            return char
        return ''

    text = text.lower()
    text = re.sub(r'[^\w\s]', drop_unless_part_of_word, text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = ' '.join(word for word in text.split() if not is_stop_word(word))
    return text

# Run every raw text through preprocess_text and store the result as a new column.
cleaned_text_series = combined_data['text'].map(preprocess_text)
combined_data['cleaned_text'] = cleaned_text_series

Loaded 166 Sinhala stopwords from file.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Features are the cleaned texts; targets are the labels cast to int.
X = combined_data['cleaned_text']
y = combined_data['label'].astype(int)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# XLM-R tokenizer shared by both splits.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Encode each split as padded, truncated PyTorch tensors.
X_train_tokenized, X_test_tokenized = (
    tokenizer(list(split_texts), padding=True, truncation=True, return_tensors="pt")
    for split_texts in (X_train, X_test)
)

# Pre-trained XLM-R encoder with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=2,
)

# Prepare DataLoader for training
class HateSpeechDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values are indexed per sample and cloned
    (tensors in this notebook); ``labels`` is an indexable sequence of class ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields of sample ``idx`` plus a ``labels`` tensor."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            # Detached copy, so the returned item does not alias the stored encodings.
            sample[field_name] = field_values[idx].clone().detach()
        # The label is returned as a long tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each tokenized split, together with its labels, in a HateSpeechDataset.
train_dataset, test_dataset = (
    HateSpeechDataset(split_encodings, list(split_labels))
    for split_encodings, split_labels in (
        (X_train_tokenized, y_train),
        (X_test_tokenized, y_test),
    )
)

# Batches of 8; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Setup

In [None]:


from torch.optim import AdamW
from transformers import get_scheduler
from torch.nn import CrossEntropyLoss
import torch


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# AdamW with a linear learning-rate schedule (no warm-up) spanning
# 3 epochs' worth of optimizer steps.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = 3 * len(train_loader)  # 3 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


def train_model():
    """Fine-tune the module-level ``model`` for 3 epochs over ``train_loader``.

    Every step moves the batch to ``device``, computes cross-entropy between
    the model's logits and the batch labels, back-propagates, and advances
    both ``optimizer`` and ``lr_scheduler``. After each epoch the model and
    tokenizer are saved to an epoch-numbered Drive directory.
    """
    criterion = CrossEntropyLoss()
    model.train()
    for epoch in range(3):  # Train for 3 epochs
        for raw_batch in train_loader:
            optimizer.zero_grad()
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**inputs).logits
            step_loss = criterion(logits, inputs['labels'])
            step_loss.backward()
            optimizer.step()
            lr_scheduler.step()
        # Checkpoint at the end of every epoch.
        model.save_pretrained(f"/content/drive/MyDrive/HateSpeech_XLMR_Model_Epoch_{epoch}")
        tokenizer.save_pretrained(f"/content/drive/MyDrive/HateSpeech_XLMR_Model_Epoch_{epoch}")

# Start fine-tuning. train_model() reads the module-level `model`, `train_loader`,
# `optimizer`, `lr_scheduler`, `device` and `tokenizer`, so the cells that define
# them must have been executed in this kernel first.
train_model()




NameError: name 'model' is not defined

In [None]:
# Evaluation
def evaluate_model():
    """Score the module-level ``model`` on ``test_loader`` and print the metrics.

    Runs in eval mode without gradient tracking, takes the arg-max over the
    logits as the predicted class, and prints a classification report and the
    accuracy. Nothing is returned.
    """
    predicted_labels, true_labels = [], []
    model.eval()
    with torch.no_grad():
        for raw_batch in test_loader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**inputs).logits
            predicted_labels += logits.argmax(dim=1).tolist()
            true_labels += inputs['labels'].tolist()
    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels))
    print("Accuracy Score:", accuracy_score(true_labels, predicted_labels))

# Run evaluation on the model fine-tuned by the train_model() call in the
# training cell above. Training is deliberately NOT repeated here: the linear
# scheduler was sized for exactly 3 epochs (num_training_steps), so a second
# train_model() call would run 3 more epochs at a learning rate of 0 and
# overwrite the per-epoch checkpoints without improving the model.
evaluate_model()

# Save final model
model.save_pretrained("/content/drive/MyDrive/HateSpeech_XLMR_Model")
tokenizer.save_pretrained("/content/drive/MyDrive/HateSpeech_XLMR_Model")