In [1]:
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import bangla_nlp
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Bangla Bert model
class CyberBullyingClassifierBangla(nn.Module):
    """Single-output classification head on top of a pretrained BERT encoder.

    The encoder is loaded from the 'sagorsarker/bangla-bert-base' checkpoint.
    Its pooled output goes through dropout, a linear layer with one output
    unit, and a sigmoid, so `forward` returns values in (0, 1).
    """

    def __init__(self):
        super().__init__()
        # Sub-module attribute names determine the state_dict keys, so they
        # must stay in sync with any saved checkpoint for this class.
        self.bert = BertModel.from_pretrained('sagorsarker/bangla-bert-base')
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid-activated score for each row of the batch.

        Args:
            input_ids: Token ids forwarded to the encoder as `input_ids`.
            attention_mask: Mask forwarded to the encoder as `attention_mask`.

        Returns:
            The output of the sigmoid applied to the linear head.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularised = self.dropout(encoded.pooler_output)
        return self.sigmoid(self.fc(regularised))

In [3]:
# Run on the GPU when CUDA is usable in this environment, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Build the classifier and restore its saved weights from disk.
bangla_model = CyberBullyingClassifierBangla()
bangla_model.to(device=device)
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# saved on a GPU machine still loads when this notebook runs on a CPU-only one.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
bangla_model.load_state_dict(
    torch.load('Created Models/bangla_bert.pth', map_location=device)
)

<All keys matched successfully>

In [5]:
def bangla_string_preprocessing(string):
    """Run `string` through the four `bangla_nlp` cleaners and return the result.

    The cleaners are applied in a fixed order, each receiving the previous
    one's output: `clean_punctuations`, `clean_emoji`, `clean_url_and_email`,
    then `clean_digits`.

    Args:
        string: The raw text to clean.

    Returns:
        Whatever `bangla_nlp.clean_digits` returns for the cleaned text.
    """
    text = bangla_nlp.clean_punctuations(text=string)
    text = bangla_nlp.clean_emoji(text=text)
    text = bangla_nlp.clean_url_and_email(text)
    return bangla_nlp.clean_digits(text=text)

In [6]:
# Tokenizers already loaded in this session, keyed by pretrained model name.
_BANGLA_TOKENIZER_CACHE = {}


def _get_bangla_tokenizer(name='sagorsarker/bangla-bert-base'):
    """Return the BertTokenizer for `name`, loading it only on first use."""
    if name not in _BANGLA_TOKENIZER_CACHE:
        _BANGLA_TOKENIZER_CACHE[name] = BertTokenizer.from_pretrained(name)
    return _BANGLA_TOKENIZER_CACHE[name]


def predict_bangla_cyberbullying(model, texts, device, max_length=128):
    """Score a batch of texts with `model` and return the scores as a NumPy array.

    Each text is cleaned with `bangla_string_preprocessing`, tokenized with
    the 'sagorsarker/bangla-bert-base' tokenizer (padded/truncated to
    `max_length` tokens), moved to `device`, and passed to `model` with
    gradients disabled.

    Args:
        model: Callable module accepting `input_ids` and `attention_mask`
            keyword arguments; it is switched to eval mode before inference.
        texts: An iterable of strings. A single bare string is treated as a
            one-element batch rather than being iterated character by
            character.
        device: Device the tokenized tensors are moved to; the caller is
            responsible for `model` already being on the same device.
        max_length: Token length every sequence is padded/truncated to.

    Returns:
        The model output moved to the CPU and converted with `.numpy()`.
    """
    # Iterating a bare string yields single characters, which would silently
    # score every character as its own "text".
    if isinstance(texts, str):
        texts = [texts]

    # Cached so repeated calls do not re-load the tokenizer files each time.
    tokenizer = _get_bangla_tokenizer()
    model.eval()

    clean_texts = [bangla_string_preprocessing(text) for text in texts]
    tokenized_texts = tokenizer.batch_encode_plus(
        clean_texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )
    inputs = {key: val.to(device) for key, val in tokenized_texts.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.cpu().numpy()

In [7]:
def checkCyberBullying(text):
    """Score `text` with the global Bangla model and print the verdict.

    The scores returned by `predict_bangla_cyberbullying` are printed first.
    The input as a whole is then reported as cyberbullying if any score is at
    least 0.5, and as not cyberbullying otherwise.

    Args:
        text: Passed to `predict_bangla_cyberbullying` as `texts` and echoed
            in the printed message.
    """
    scores = predict_bangla_cyberbullying(
        model=bangla_model, device=device, texts=text
    )
    print(scores)
    # any() stops at the first score that reaches the threshold.
    is_bullying = any(score >= 0.5 for score in scores)
    if is_bullying:
        print(f"Text: {text} is marked as cyberbullying")
    else:
        print(f"Text: {text} is not marked as cyberbullying")


In [9]:
# Smoke test: classify a one-element batch holding a single Bangla sentence
# (roughly "I love you"); the scores and the verdict are printed below.
text=["আমি তোমায় ভালোবাসি"]
checkCyberBullying(text=text)

[[0.0004358]]
Text: ['আমি তোমায় ভালোবাসি'] is not marked as cyberbullying
