In [44]:
import torch
import easyocr
import regex
from pathlib import Path
import torch.nn as nn
from transformers import BertTokenizer, BertModel, AdamW
import bangla_nlp
import english_nlp
import string


In [70]:
# OCR reader configured with the language codes 'en' and 'bn'; it is reused by
# the text-extraction cell below.
text_reader = easyocr.Reader(['en','bn'])

In [75]:
# Path of the image to run OCR on (relative path; change this to analyse another image).
filepath='Example Images/bengali-2.PNG'

In [76]:
# Result containers are created before the file check so that a missing image
# leaves them empty, instead of leaving them undefined or silently keeping the
# values from a previously processed image still living in the kernel.
full_text = []      # every recognised line, with a sentence terminator appended
english_texts = []  # lines routed to the English classifier
bangla_texts = []   # lines routed to the Bangla classifier

# A line is treated as Bangla when it contains at least one Bengali-script
# character and every letter in it belongs to the Bengali script; non-letter
# characters (spaces, digits, punctuation) may appear anywhere.
BANGLA_LINE_PATTERN = regex.compile(r'\P{L}*\p{Bengali}+(?:\P{L}+\p{Bengali}+)*\P{L}*')

file = Path(filepath)
if file.is_file():
    print(f"Got the image. Filepath: {filepath}")
    result_from_text = text_reader.readtext(filepath)
    for (_bbox, text, prob) in result_from_text:
        print(f'Text: {text}, Probability: {prob}')
        if BANGLA_LINE_PATTERN.fullmatch(text):
            # Bangla lines are terminated with the danda character.
            full_text.append(text + "। ")
            bangla_texts.append(text)
        else:
            # Everything that is not purely Bangla is handled as English.
            full_text.append(text + ". ")
            english_texts.append(text)
    print(f"Bangla Texts: {bangla_texts}")
    print(f"English Texts: {english_texts}")
else:
    print("There was no image found with the filepath")


Got the image. Filepath: Example Images/bengali-2.PNG
Text: শুয়োরের বাচ্চা তোকে একবার পেয়ে নেই খালি] দেখেনিব একদম], Probability: 0.2587815690071471
Bangla Texts: ['শুয়োরের বাচ্চা তোকে একবার পেয়ে নেই খালি] দেখেনিব একদম]']
English Texts: []


In [48]:
# Bangla Bert model
class CyberBullyingClassifierBangla(nn.Module):
    """Classifier head on top of the 'sagorsarker/bangla-bert-base' encoder.

    The encoder's pooled output is passed through dropout and a single-unit
    linear layer, and a sigmoid is applied to the result.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order and under the same names
        # as before so saved state dicts keep matching.
        self.bert = BertModel.from_pretrained('sagorsarker/bangla-bert-base')
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid of the linear head applied to the pooled BERT output.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            The sigmoid-activated output of the single-unit linear layer.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoded.pooler_output)
        return self.sigmoid(self.fc(regularised))

In [49]:
# English BERT model
class CyberBullyingClassifierEnglish(nn.Module):
    """Classifier head on top of the 'bert-base-uncased' encoder.

    The encoder's pooled output is passed through dropout and a single-unit
    linear layer, and a sigmoid is applied to the result.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order and under the same names
        # as before so saved state dicts keep matching.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid of the linear head applied to the pooled BERT output.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            The sigmoid-activated output of the single-unit linear layer.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoded.pooler_output)
        return self.sigmoid(self.fc(regularised))

In [50]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [51]:
# Build the Bangla classifier, move it to the selected device and restore the
# weights stored in 'Created Models/bangla_bert.pth'.
bangla_model = CyberBullyingClassifierBangla()
bangla_model.to(device=device)
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU can still be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
bangla_model.load_state_dict(
    torch.load('Created Models/bangla_bert.pth', map_location=device)
)
# Switch to inference mode; as the last expression this also displays the model.
bangla_model.eval()

CyberBullyingClassifierBangla(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(102025, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [52]:
# Build the English classifier, move it to the selected device and restore the
# weights stored in 'Created Models/english_bert.pth'.
english_model = CyberBullyingClassifierEnglish()
english_model.to(device=device)
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU can still be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
english_model.load_state_dict(
    torch.load('Created Models/english_bert.pth', map_location=device)
)
# Switch to inference mode; as the last expression this also displays the model.
english_model.eval()

CyberBullyingClassifierEnglish(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [53]:
def bangla_string_preprocessing(string):
    """Run a Bangla text through the `bangla_nlp` cleaning helpers.

    The helpers are applied in this order: punctuation, emoji, URL/e-mail, digits.

    Args:
        string: Raw text to clean. Inside this function the name shadows the
            imported `string` module.

    Returns:
        The value returned by the final `bangla_nlp.clean_digits` call.
    """
    # NOTE(review): punctuation is stripped before the URL/e-mail cleaner runs,
    # which may stop that cleaner from recognising links - confirm this matches
    # the preprocessing used when the model was trained before reordering.
    cleaned = bangla_nlp.clean_punctuations(text=string)
    cleaned = bangla_nlp.clean_emoji(text=cleaned)
    cleaned = bangla_nlp.clean_url_and_email(cleaned)
    return bangla_nlp.clean_digits(text=cleaned)

In [54]:
def english_string_preprocessing(text):
    """Clean an English text before it is tokenised.

    Steps, in order: `english_nlp.remove_emoji`, deletion of every character in
    `string.punctuation`, `english_nlp.remove_urls`, `english_nlp.remove_numbers`.

    Args:
        text: Raw text to clean.

    Returns:
        The value returned by the final `english_nlp.remove_numbers` call.
    """
    without_emoji = english_nlp.remove_emoji(text)
    # Translation table that deletes all ASCII punctuation characters.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_emoji.translate(punctuation_table)
    # NOTE(review): punctuation is already gone when remove_urls runs, which may
    # stop it from recognising links - confirm against the training pipeline.
    without_urls = english_nlp.remove_urls(without_punctuation)
    return english_nlp.remove_numbers(without_urls)

In [55]:
def predict_bangla_cyberbullying(model, texts, device):
    """Score Bangla texts with `model` and return the raw model outputs.

    Each text is cleaned with `bangla_string_preprocessing`, tokenised with the
    'sagorsarker/bangla-bert-base' tokenizer (padded/truncated to 128 tokens) and
    fed to the model without gradient tracking.

    Args:
        model: Model called with `input_ids` and `attention_mask` keyword
            arguments. It is switched to eval mode as a side effect.
        texts: Iterable of raw texts.
        device: Device the tokenised tensors are moved to.

    Returns:
        The model output moved to the CPU and converted to a NumPy array.
    """
    bangla_tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
    model.eval()

    cleaned = [bangla_string_preprocessing(raw) for raw in texts]
    encoding = bangla_tokenizer.batch_encode_plus(
        cleaned,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Move every tokenizer output onto the target device.
    model_inputs = {}
    for name, tensor in encoding.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        scores = model(**model_inputs)
    return scores.cpu().numpy()

In [56]:
def predict_english_cyberbullying(model, texts, device):
    """Score English texts with `model` and return the raw model outputs.

    Each text is cleaned with `english_string_preprocessing`, tokenised with the
    lower-casing 'bert-base-uncased' tokenizer (padded/truncated to 128 tokens)
    and fed to the model without gradient tracking.

    Args:
        model: Model called with `input_ids` and `attention_mask` keyword
            arguments. It is switched to eval mode as a side effect.
        texts: Iterable of raw texts.
        device: Device the tokenised tensors are moved to.

    Returns:
        The model output moved to the CPU and converted to a NumPy array.
    """
    english_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
    model.eval()

    cleaned = [english_string_preprocessing(raw) for raw in texts]
    encoding = english_tokenizer.batch_encode_plus(
        cleaned,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Move every tokenizer output onto the target device.
    model_inputs = {}
    for name, tensor in encoding.items():
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        scores = model(**model_inputs)
    return scores.cpu().numpy()

In [57]:
def _labels_from_probabilities(probabilities, threshold=0.5):
    """Turn an array of probabilities into a flat list of 0/1 labels."""
    return [1 if float(prob) >= threshold else 0 for prob in probabilities.reshape(-1)]


def checkCyberBullying(full_text,english_texts,bangla_texts):
    """Classify OCR text lines and print one cyberbullying verdict per line.

    English lines are scored with `english_model` through
    `predict_english_cyberbullying`; Bangla lines are scored with `bangla_model`
    through `predict_bangla_cyberbullying`. A probability of 0.5 or more is
    reported as cyberbullying. English verdicts are printed first, then Bangla.

    Args:
        full_text: Combined list of all lines. Kept so existing callers keep
            working; it is not passed to either model because it mixes both
            languages and would not line up with per-language predictions.
        english_texts: Lines to score with the English model.
        bangla_texts: Lines to score with the Bangla model.

    Returns:
        None. The verdicts are printed.
    """
    verdicts = []  # (text, label) pairs, each text paired with its own prediction

    if len(english_texts) > 0:
        # English lines need the English tokenizer/preprocessing and model.
        english_probabilities = predict_english_cyberbullying(english_model, english_texts, device)
        verdicts.extend(zip(english_texts, _labels_from_probabilities(english_probabilities)))

    if len(bangla_texts) > 0:
        # Only the Bangla lines go to the Bangla model, not the mixed full_text.
        bangla_probabilities = predict_bangla_cyberbullying(bangla_model, bangla_texts, device)
        verdicts.extend(zip(bangla_texts, _labels_from_probabilities(bangla_probabilities)))

    if not verdicts:
        # Nothing was recognised; report it instead of failing on missing predictions.
        print("No text was found to check for cyberbullying.")
        return

    for text, label in verdicts:
        if label == 1:
            print(f'Text: "{text}" is predicted as cyberbullying.')
        else:
            print(f'Text: "{text}" is predicted as not cyberbullying.')

In [77]:
# Run the classifiers on the lists produced by the OCR cell and print the verdicts.
checkCyberBullying(full_text=full_text,english_texts=english_texts,bangla_texts=bangla_texts)

Text: "শুয়োরের বাচ্চা তোকে একবার পেয়ে নেই খালি] দেখেনিব একদম]। " is predicted as cyberbullying.
