In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import matplotlib.pyplot as plt
import bangla_nlp
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the CSV of annotated comments into the notebook's working frame.
df = pd.read_csv('Bengali hate speech .csv')
# Show the first five rows as a quick check of what was loaded.
df.head(5)

In [None]:
# Count how many rows carry each value of the `hate` label.
print("Hate-Non hate distribution")
df.hate.value_counts()

In [None]:
# Keep only the rows whose `hate` label equals 1, then count them per
# `category` value.
df_filtered = df.loc[df['hate'].eq(1)]
print("Number of hate comments in every category")
df_filtered['category'].value_counts()

In [None]:

# Build `clean_text` from the raw `sentence` column by applying the
# bangla_nlp cleaners one after another, in this order: punctuation,
# emoji, URL/e-mail, digits.
# NOTE(review): punctuation is removed before the URL/e-mail pass. If
# clean_punctuations strips characters such as '.', '/' or '@', the later
# pass may no longer recognise links or addresses -- confirm the order.
cleaned = df['sentence']
for cleaner in (
    bangla_nlp.clean_punctuations,
    bangla_nlp.clean_emoji,
    bangla_nlp.clean_url_and_email,
    bangla_nlp.clean_digits,
):
    cleaned = cleaned.apply(cleaner)

# Only the fully cleaned text is stored on the frame; the intermediate
# results are never added as columns, so nothing has to be dropped.
df['clean_text'] = cleaned
df.head()


In [None]:
# Run bangla_nlp.word_tokenize_texts on every cleaned comment and keep the
# result in a new column (presumably a list of word tokens per row).
df['tokenized_words'] = df.clean_text.apply(bangla_nlp.word_tokenize_texts)
df.head()

In [None]:
# Pass each row's tokens through bangla_nlp.remove_stopwords_from_tokens and
# store the filtered result next to the unfiltered tokens.
df['no_stopword_tokenized_words'] = df.tokenized_words.apply(
    bangla_nlp.remove_stopwords_from_tokens
)
df.head()

In [None]:
# Share of each `hate` label value, drawn as a pie chart with one-decimal
# percentage annotations.
value_counts = df['hate'].value_counts()
pie_ax = value_counts.plot.pie(autopct='%.1f%%')
pie_ax.set_title("Data Distribution Pie Chart")
plt.show()

In [None]:
# Horizontal bar chart of the number of comments per `hate` label value.
sns.set(font_scale=1.4)
bar_ax = df['hate'].value_counts().plot.barh(figsize=(9, 3))
bar_ax.set_xlabel("Number of Comments", labelpad=12)
bar_ax.set_ylabel("Sentiment Class", labelpad=12)
# Tilt the class tick labels by 45 degrees.
plt.setp(bar_ax.get_yticklabels(), rotation=45)
# Trailing semicolon suppresses the Text repr in the cell output.
bar_ax.set_title("Data Distribution", y=1.02);

In [None]:
# Word count (whitespace-separated pieces) of every raw comment.
df['ReviewLength'] = df['sentence'].apply(lambda text: len(text.split()))

# Map each distinct length to the number of comments that have it.
frequency = df['ReviewLength'].value_counts().to_dict()

plt.bar(list(frequency.keys()), list(frequency.values()), color="b")
# The x-axis is limited to lengths 1..135.
plt.xlim(1, 135)
# The bar colour does not seem to take effect in this notebook, although it should.
plt.xlabel('Lenght of the Texts')
plt.ylabel('Frequency')
plt.title('Length-Frequency Distribution')
plt.show()

# Summary statistics of the per-comment word counts.
review_lengths = df['ReviewLength']
print(f"Maximum Length of a review: {max(review_lengths)}")
print(f"Minimum Length of a review: {min(review_lengths)}")
print(f"Average Length of a reviews: {round(np.mean(review_lengths),0)}")

In [None]:
# Tally the rows per `hate` label value and print the resulting table.
class_counts = df.hate.value_counts()
print(class_counts)

In [None]:
# Split dataset into training and test sets
# 20% of the rows are held out; the fixed seed makes the split repeatable.
# Stratifying on the label keeps the proportion of each `hate` value the same
# in both parts -- the notebook later compensates for class imbalance, so the
# held-out set should reflect the same class ratio as the training set.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['hate'],
)


In [None]:
# Tokenize the data
BERT_CHECKPOINT = 'sagorsarker/bangla-bert-base'
# Load the tokenizer once and reuse it; the original re-loaded it on every
# call to tokenize_texts.
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)


def tokenize_texts(texts, max_length, tokenizer=None):
    """Encode a list of strings into fixed-length BERT inputs.

    Args:
        texts: list of str to encode.
        max_length: every sequence is padded or truncated to this many tokens.
        tokenizer: tokenizer to use; defaults to the shared `bert_tokenizer`
            loaded from BERT_CHECKPOINT.

    Returns:
        The tokenizer's batch encoding with PyTorch tensors; `input_ids` and
        `attention_mask` are requested, token type ids are not.
    """
    if tokenizer is None:
        tokenizer = bert_tokenizer
    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )


def model_texts(frame):
    """Return the list of strings that is fed to the tokenizer for `frame`.

    Uses the `clean_string_words` column when it exists. No visible cell of
    this notebook creates that column, so on a fresh kernel it may be missing;
    in that case the strings are rebuilt by joining the stop-word-filtered
    tokens with single spaces.
    NOTE(review): the fallback assumes `no_stopword_tokenized_words` holds a
    list of str per row and that this is how `clean_string_words` was
    originally produced -- confirm against the cell that used to create it.
    """
    if 'clean_string_words' in frame.columns:
        return frame['clean_string_words'].tolist()
    return frame['no_stopword_tokenized_words'].apply(' '.join).tolist()


max_length = 128

train_tokenized_texts = tokenize_texts(model_texts(train_data), max_length)
# The held-out encodings must come from test_data. The original encoded
# train_data a second time, so the "test" inputs did not correspond to
# test_labels (and even had a different number of rows).
test_tokenized_texts = tokenize_texts(model_texts(test_data), max_length)


In [None]:
# Prepare train and test inputs and labels
# From each encoding only `input_ids` and `attention_mask` are kept. The
# `hate` labels become float32 tensors with an extra trailing axis, i.e.
# column vectors of shape (N, 1).
train_inputs = {
    key: train_tokenized_texts[key] for key in ('input_ids', 'attention_mask')
}
train_labels = torch.tensor(train_data['hate'].values, dtype=torch.float32)[:, None]

test_inputs = {
    key: test_tokenized_texts[key] for key in ('input_ids', 'attention_mask')
}
test_labels = torch.tensor(test_data['hate'].values, dtype=torch.float32)[:, None]

In [None]:
# Use the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Compute class weights
# train_labels is a column vector of shape (N, 1); scikit-learn expects a 1-D
# label array and emits a DataConversionWarning for column vectors, so the
# labels are flattened once and reused for both arguments.
label_array = train_labels.cpu().numpy().ravel()
class_weights = compute_class_weight(
    class_weight='balanced',
    # np.unique returns the distinct labels sorted ascending, so the weights
    # come back in ascending label order.
    classes=np.unique(label_array),
    y=label_array
)
# Keep the weights as a float tensor on the training device.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)


In [None]:
# Bangla Bert model
class CyberBullyingClassifier(nn.Module):
    """Pretrained Bangla BERT encoder with a single-unit sigmoid head.

    The encoder's pooled output goes through dropout (p=0.1) and a linear
    layer with one output; a sigmoid maps that value into (0, 1).
    """

    def __init__(self):
        """Load the pretrained encoder and create the classification head."""
        super().__init__()
        encoder = BertModel.from_pretrained('sagorsarker/bangla-bert-base')
        self.bert = encoder
        self.dropout = nn.Dropout(p=0.1)
        # The head's input width is read from the encoder's configuration.
        self.fc = nn.Linear(in_features=encoder.config.hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid of the head's output for each input sequence.

        Args:
            input_ids: forwarded to the encoder as `input_ids`.
            attention_mask: forwarded to the encoder as `attention_mask`.

        Returns:
            Tensor of sigmoid outputs, one value per row of the encoder's
            pooled output.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)
        return self.sigmoid(self.fc(features))

In [None]:
# Create model instance
model = CyberBullyingClassifier()
model.to(device)

In [None]:
# Defining optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 100

# The scheduler is stepped once per mini-batch, so its horizon must be the
# number of optimizer steps (batches per epoch * epochs), not the number of
# samples * epochs. Counting samples made the linear decay far slower than
# intended.
BATCH_SIZE = 32  # must equal the batch_size train_model runs with (its default is 32)
num_train_samples = len(train_inputs['input_ids'])
steps_per_epoch = (num_train_samples + BATCH_SIZE - 1) // BATCH_SIZE  # ceil division
total_steps = steps_per_epoch * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


class ClassWeightedBCELoss(nn.Module):
    """Binary cross-entropy on probabilities with one weight per class.

    `nn.BCELoss(weight=...)` applies its weight tensor positionally (it is
    broadcast against the batch); it does not pick a weight according to each
    sample's label. This loss looks up the weight of every sample from its
    target instead.

    Args:
        class_weights: 1-D tensor with two entries, the weight for label 0
            followed by the weight for label 1.
    """

    def __init__(self, class_weights):
        super().__init__()
        # A buffer follows the module across devices without being trained.
        self.register_buffer('class_weights', class_weights)

    def forward(self, probabilities, targets):
        """Return the mean class-weighted BCE.

        Args:
            probabilities: model outputs in [0, 1], same shape as `targets`.
            targets: float tensor of 0/1 labels.
        """
        # Index the per-class weights with the integer label of each sample.
        sample_weights = self.class_weights[targets.long()]
        return nn.functional.binary_cross_entropy(
            probabilities, targets, weight=sample_weights
        )


# Update the loss function to use class weights
criterion = ClassWeightedBCELoss(class_weights)


In [None]:
def train_model(model, train_inputs, train_labels, criterion, optimizer, scheduler, device,
                batch_size=32, epochs=epochs, patience=3, min_delta=1e-8):
    """Train `model` on mini-batches with early stopping on the training loss.

    Args:
        model: module called as `model(**batch)` with the keys of `train_inputs`.
        train_inputs: dict of tensors; must contain 'input_ids'. Every tensor
            is sliced along its first axis to form a batch.
        train_labels: tensor of targets, sliced with the same indices.
        criterion: callable `criterion(outputs, labels)` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device each batch is moved to.
        batch_size: number of samples per batch.
        epochs: maximum number of epochs. The default is bound to the
            notebook-level `epochs` value at definition time.
        patience: stop after this many consecutive epochs without improvement.
        min_delta: the epoch loss must drop below `best_loss - min_delta` to
            count as an improvement.

    Raises:
        ValueError: if `train_inputs['input_ids']` is empty.
    """
    num_samples = len(train_inputs['input_ids'])
    if num_samples == 0:
        raise ValueError("train_inputs['input_ids'] is empty; nothing to train on.")

    model.train()
    best_loss = float('inf')
    epochs_without_improvement = 0

    # NOTE(review): batches are taken in the same order every epoch (no
    # shuffling) -- confirm this is intended.
    for epoch in range(epochs):
        running_loss = 0.0
        samples_seen = 0
        for start in range(0, num_samples, batch_size):
            stop = start + batch_size
            inputs = {key: val[start:stop].to(device) for key, val in train_inputs.items()}
            labels = train_labels[start:stop].to(device)

            optimizer.zero_grad()

            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Weight the batch-mean loss by the real batch size: the last
            # batch can be shorter than `batch_size`, and multiplying by the
            # nominal size would over-count it in the epoch average.
            current_batch_size = labels.size(0)
            running_loss += loss.item() * current_batch_size
            samples_seen += current_batch_size

        epoch_loss = running_loss / samples_seen
        print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}')

        if epoch_loss < best_loss - min_delta:
            best_loss = epoch_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f'Stopping early at epoch {epoch+1} due to insignificant loss change.')
                break

# Training the model
train_model(model, train_inputs, train_labels, criterion, optimizer, scheduler, device)

In [None]:
# saving the model
# Raw string literal: the Windows path contains backslashes followed by
# letters (\O, \C, \P), which are invalid escape sequences in a normal string
# and trigger a warning that newer Python versions intend to turn into an
# error. The raw string yields exactly the same path.
# NOTE(review): this is an absolute, machine-specific location; consider a
# configurable output directory.
MODEL_PATH = r'G:\OneDrive - northsouth.edu\CODES\PROJECTS\PROJECT - Bullishield\Created Models/bangla_bert_class_imbalanced_wieghts.pth'
torch.save(model.state_dict(), MODEL_PATH)