In [None]:
# import os

# def install_packages():
#     packages = [
#         "nltk",
#         "scikit-learn",
#         "pyLDAvis",
#         "gensim",
#         "matplotlib",
#         "wordcloud",
#         "seaborn",
#         "pandas",
#         "numpy",
#     ]
#     for package in packages:
#         os.system(f"pip install {package}")

# install_packages()

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('punkt')
import re
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import train_test_split
import pyLDAvis
import numpy as np
import time
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch

In [None]:
# loading data

# Read the labelled dataset. If the read fails, report the problem and re-raise:
# every later cell needs `all_data`, so continuing would only surface as an
# unrelated NameError further down.
try:
    all_data = pd.read_csv('english_hate_speech.csv')
except Exception as e:
    print(f"Error loading data: {e}")
    raise

print("Shape of the data: ", all_data.shape) #inspecting the shape of the data

In [None]:
def tokenise_text(data):
    """
    Build a 'tokens' column of lowercase, whitespace-separated words.

    The frame is modified in place: 'oh_label' is cast to int, 'clean_text'
    is cast to str, and 'tokens' is added. The same frame is returned.
    If any step raises, the error is printed and None is returned instead.
    """
    def _lowercase_tokens(text):
        # split on whitespace first, then lowercase every piece
        return [piece.lower() for piece in text.split()]

    try:
        # the label column is read in as float, so cast it back to int
        data['oh_label'] = data['oh_label'].astype(int)
        data['clean_text'] = data['clean_text'].astype(str)

        data['tokens'] = data['clean_text'].apply(_lowercase_tokens)
        print("Tokenisation successful")
        return data
    except Exception as e:
        print(f"Tokenisation error: {e}")
        return None

# Add the 'tokens' column. tokenise_text returns None when it fails, in which
# case all_data is rebound to None and the preview below will raise.
all_data = tokenise_text(all_data)
all_data.head()

In [None]:

def lemmatize_text(data):
    """
    Add a 'lemmatized' column with the WordNet lemma of every token.

    Reads the lists in data['tokens'] and writes the result to
    data['lemmatized'] on the same frame. The frame is returned whether or
    not lemmatisation succeeded; on failure the error is printed and the
    new column is simply absent.
    """
    try:
        lemmatizer = WordNetLemmatizer()

        def _lemmatize_tokens(tokens):
            # lemmatize each token with the default part of speech
            return list(map(lemmatizer.lemmatize, tokens))

        data['lemmatized'] = data['tokens'].apply(_lemmatize_tokens)
        print("Lemmatisation successful")
    except Exception as e:
        print(f"An error occurred during lemmatisation: {e}")
    return data

# Add the 'lemmatized' column. On failure lemmatize_text prints the error and
# returns the frame without that column.
all_data=lemmatize_text(data=all_data)
all_data.head()

In [None]:
def remove_numbers(word_list):
    """
    Return the words that contain no digit characters, in their original order.
    """
    digit_free = []
    for word in word_list:
        # a single digit anywhere in the word is enough to drop it
        if re.search(r'\d', word) is None:
            digit_free.append(word)
    return digit_free

# Function to remove URLs from a list of words
def remove_urls(word_list):
    """
    Return the words that do not start with 'http', 'www' or 'https'.
    """
    url_prefixes = ('http', 'www', 'https')
    kept_words = []
    for word in word_list:
        if word.startswith(url_prefixes):
            continue
        kept_words.append(word)
    return kept_words

# Lemmatized tokens: drop number-bearing words first, then URL-like words
all_data['lemmatized_no_numbers'] = all_data['lemmatized'].apply(remove_numbers)
all_data['lemmatized_clean'] = all_data['lemmatized_no_numbers'].apply(remove_urls)

# Raw tokens: the same two filters chained, then re-joined into one
# space-separated string per row
all_data['tokenized_clean'] = all_data['tokens'].apply(remove_numbers).apply(remove_urls)
all_data['string_tokenized'] = all_data['tokenized_clean'].apply(' '.join)
all_data.head()

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 is passed so the
# split is the same on every run.
# NOTE(review): the split is not stratified on 'oh_label' — consider
# stratify=all_data['oh_label'] if the classes are imbalanced.
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

In [None]:
# Number of rows labelled 1 (boolean mask summed instead of filtering the frame)
count_label_1 = int((all_data['oh_label'] == 1).sum())

# Number of rows labelled 0
count_label_0 = int((all_data['oh_label'] == 0).sum())

print(f"Number of rows with oh_label 1: {count_label_1}")
print(f"Number of rows with oh_label 0: {count_label_0}")

In [None]:
# function to encode a list of text strings with a BERT tokenizer
def tokenize_texts(texts, max_length, tokenizer=None):
    """
    Encode `texts` into fixed-length BERT inputs.

    Args:
        texts: list of strings to encode.
        max_length: every sequence is padded or truncated to this many tokens.
        tokenizer: optional object exposing `batch_encode_plus`. When omitted,
            the pretrained 'bert-base-uncased' tokenizer is loaded. Pass an
            existing tokenizer to avoid reloading it on every call.

    Returns:
        Whatever `batch_encode_plus` returns for the arguments below: PyTorch
        tensors are requested, with an attention mask and without token type ids.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    return tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Encode the train and test texts to a common fixed length
max_length = 128
train_tokenized_texts = tokenize_texts(train_data['string_tokenized'].tolist(), max_length)
test_tokenized_texts = tokenize_texts(test_data['string_tokenized'].tolist(), max_length)

# Keep only the two tensors that the classifier's forward() takes
train_inputs = {key: train_tokenized_texts[key] for key in ('input_ids', 'attention_mask')}
test_inputs = {key: test_tokenized_texts[key] for key in ('input_ids', 'attention_mask')}

# Labels as float32 column vectors (one row per example)
train_labels = torch.tensor(train_data['oh_label'].values, dtype=torch.float32).unsqueeze(1)
test_labels = torch.tensor(test_data['oh_label'].values, dtype=torch.float32).unsqueeze(1)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
import torch.nn as nn


# Transformer model
class CyberBullyingClassifier(nn.Module):
    """
    Pretrained 'bert-base-uncased' encoder with a dropout layer and a
    single-unit linear head whose output is passed through a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        # one output unit, sized from the encoder's configured hidden size
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """
        Return the sigmoid of the linear head applied to BERT's pooled output.

        The result is a probability (not a logit), so it must be paired with
        a loss that expects probabilities.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoded.pooler_output)
        return self.sigmoid(self.fc(regularised))

In [None]:
# Create model instance
model = CyberBullyingClassifier()
# Move the model to the selected device. As the last expression of the cell,
# the returned module is also what the notebook displays.
model.to(device)

In [None]:
def calculate_class_weights(y):
    """
    Return one weight per distinct value in `y`, in sorted value order.

    Each weight is len(y) / (2 * count of that value), so rarer values get
    larger weights.
    """
    n_samples = len(y)
    # np.unique returns the sorted distinct values with their counts
    _, counts = np.unique(y, return_counts=True)
    return [n_samples / (2.0 * count) for count in counts]

In [None]:
# Per-class weights for the training labels (index 0 -> label 0, index 1 -> label 1)
weights = calculate_class_weights(train_labels.numpy())
print(weights)
# Ratio of the label-1 weight to the label-0 weight, as a one-element float32
# tensor created on the target device
class_weights = torch.tensor([weights[1] / weights[0]], dtype=torch.float32, device=device)
print(class_weights)

In [None]:
# Defining optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
epochs = 100

# The scheduler is stepped once per mini-batch, so the schedule length is
# (batches per epoch) * epochs, not (samples) * epochs. The batch size here
# must match the one used for training (train_model's default is 32).
train_batch_size = 32
steps_per_epoch = (len(train_inputs['input_ids']) + train_batch_size - 1) // train_batch_size
total_steps = steps_per_epoch * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


# Defining loss function
class PositiveWeightedBCELoss(nn.Module):
    """
    Binary cross-entropy on probabilities with the positive class up-weighted.

    nn.BCELoss(weight=w) with a one-element `w` broadcasts `w` over every
    element and so only rescales the whole loss. Here a per-sample weight is
    built instead: `pos_weight` for targets equal to 1 and 1.0 for targets
    equal to 0.
    """

    def __init__(self, pos_weight):
        super().__init__()
        # buffer so the weight follows the module across devices
        self.register_buffer('pos_weight', pos_weight)

    def forward(self, probabilities, targets):
        # targets are 0.0 or 1.0, so this evaluates to 1.0 or pos_weight
        sample_weights = 1.0 + (self.pos_weight - 1.0) * targets
        return nn.functional.binary_cross_entropy(probabilities, targets, weight=sample_weights)


criterion = PositiveWeightedBCELoss(class_weights)

In [None]:
def train_model(model, train_inputs, train_labels, criterion, optimizer, scheduler, device, batch_size=32, epochs=epochs):
    """
    Train `model` with mini-batches and stop early when the loss plateaus.

    Batches are taken in order from the front of the tensors. After every
    epoch the mean training loss per sample is printed. Training stops once
    that loss has failed to beat the best value by more than 1e-8 for three
    epochs in a row.

    Args:
        model: module called as model(**batch) with the keys of `train_inputs`.
        train_inputs: dict of tensors sharing the same first dimension;
            must contain 'input_ids'.
        train_labels: tensor of targets aligned with `train_inputs`.
        criterion: callable(outputs, labels) returning a scalar loss tensor.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device each batch is moved to.
        batch_size: number of examples per batch.
        epochs: maximum number of epochs. The default is the value of the
            notebook-level `epochs` at the time this function is defined.

    Returns:
        None.
    """
    model.train()
    best_loss = float('inf')
    min_delta = 1e-8
    patience = 3
    epochs_without_improvement = 0
    num_samples = len(train_inputs['input_ids'])

    for epoch in range(epochs):
        running_loss = 0.0
        for start in range(0, num_samples, batch_size):
            stop = start + batch_size
            inputs = {key: val[start:stop].to(device) for key, val in train_inputs.items()}
            labels = train_labels[start:stop].to(device)

            optimizer.zero_grad()

            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Weight by the real batch size: the final batch can be shorter
            # than `batch_size`, and using the nominal size would overstate it.
            running_loss += loss.item() * labels.size(0)

        epoch_loss = running_loss / len(train_labels)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}')

        if epoch_loss < best_loss - min_delta:
            best_loss = epoch_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f'Stopping early at epoch {epoch+1} due to insignificant loss change.')
                break

# Training the model
train_model(model, train_inputs, train_labels, criterion, optimizer, scheduler, device)

In [None]:
from sklearn.metrics import accuracy_score
def evaluate_model(model, test_inputs, test_labels, device, batch_size=32):
    """
    Return the accuracy of `model` on the test tensors.

    The model is switched to eval mode and run batch by batch without
    gradient tracking. Outputs of at least 0.5 count as the positive class,
    and the result is scored with sklearn's accuracy_score.
    """
    model.eval()
    predicted_probs, expected = [], []
    total = len(test_inputs['input_ids'])

    with torch.no_grad():
        for start in range(0, total, batch_size):
            window = slice(start, start + batch_size)
            batch = {name: tensor[window].to(device) for name, tensor in test_inputs.items()}
            batch_labels = test_labels[window].to(device)

            batch_outputs = model(**batch)
            predicted_probs.extend(batch_outputs.cpu().numpy())
            expected.extend(batch_labels.cpu().numpy())

    # threshold the probabilities into boolean class predictions
    predicted_classes = np.array(predicted_probs) >= 0.5
    return accuracy_score(expected, predicted_classes)

# Evaluating the model
accuracy = evaluate_model(model, test_inputs, test_labels, device)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# saving the model
import os
from pathlib import Path

# The default directory is the original hard-coded location. Set the
# BULLISHIELD_MODEL_DIR environment variable to save somewhere else, e.g. when
# running on another machine.
model_dir = Path(os.environ.get(
    'BULLISHIELD_MODEL_DIR',
    '/home/ara2/Desktop/Farhan_Bullishield_CSE498R/Created Models',
))
model_dir.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), str(model_dir / 'english_bert_class_model_weights.pth'))

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def sentence_preprocessing(sentence_list):
    '''
    Return one list of cleaned, lemmatized tokens per input sentence.

    Each sentence is converted to str, lowercased, tokenised with
    word_tokenize, stripped of English stop words, lemmatized, and finally
    filtered with remove_numbers and remove_urls.

    Args:
        sentence_list: iterable of sentences (non-strings are passed through str()).

    Returns:
        A list with one list of tokens per input sentence, in input order.
    '''
    # A set gives constant-time membership tests; the stop-word list and the
    # lemmatizer are the same for every sentence, so build them once.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    preprocessed_sentence_list = []
    for sentence in sentence_list:
        # keep the type str, lowercase, then tokenise
        words = word_tokenize(str(sentence).lower())

        # drop stop words and lemmatize what remains
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        # remove numbers first, then URLs
        clean_sentence = remove_numbers(word_list=lemmatized_words)
        preprocessed_sentence_list.append(remove_urls(word_list=clean_sentence))

    return preprocessed_sentence_list

In [None]:
def analyze_sentiment(preprocessed_sentence_list):
    """
    Return the VADER 'compound' score for each tokenised sentence.

    Every entry of `preprocessed_sentence_list` is a list of words; it is
    joined with single spaces before scoring. Scores are returned in input order.
    """
    intensity_analyser = SentimentIntensityAnalyzer()
    return [
        intensity_analyser.polarity_scores(" ".join(words))['compound']
        for words in preprocessed_sentence_list
    ]

In [None]:
# check score

sentence_list=["Alvi is really a good motherfucker","Fuck you motherfucker","I have loved you all my life"]

# Clean the sample sentences, then score each one
preprocessed_list = sentence_preprocessing(sentence_list)
sentiment_list = analyze_sentiment(preprocessed_list)

# Print every sentence next to its compound sentiment score
for sentence, score in zip(sentence_list, sentiment_list):
    print(f"Sentence:{sentence}\nSentiment Score:{score}\n\n")

In [None]:
def predict_custom_texts(model, tokenizer, texts, device, max_length=128):
    """
    Run `model` on raw text strings and return its outputs as a NumPy array.

    Args:
        model: module called as model(**encoded) on the tokenizer's output;
            it is switched to eval mode before use.
        tokenizer: object exposing `batch_encode_plus`.
        texts: list of strings to score. All of them are encoded and run as
            a single batch.
        device: device the encoded tensors are moved to.
        max_length: length every sequence is padded or truncated to. Defaults
            to 128; pass the value used during training if it differs.

    Returns:
        The model outputs moved to the CPU and converted with `.numpy()`.
    """
    model.eval()
    tokenized_texts = tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )
    inputs = {key: val.to(device) for key, val in tokenized_texts.items()}
    # inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.cpu().numpy()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
# Score the sample sentences with the trained classifier
custom_probabilities = predict_custom_texts(model, tokenizer, sentence_list, device)

# Threshold each probability at 0.5 to get a 0/1 label
custom_pred_labels = [1 if prob >= 0.5 else 0 for prob in custom_probabilities]

# Report the verdict for every sentence
for text, label in zip(sentence_list, custom_pred_labels):
    verdict = 'cyberbullying' if label == 1 else 'not cyberbullying'
    print(f'Text: "{text}" is predicted as {verdict}.')