In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [2]:
df = pd.read_csv('/kaggle/input/psudo-labeled-svc/psudo_labeled_svc.csv')
df.head()

Unnamed: 0,sentence,hate,category,token,vec,psudo_label
0,দিনের বাচ্চা কই বালই হয়ই,-1,religion,"['দিনের', 'বাচ্চা', 'কই', 'বালই', 'হয়ই']",[-2.70244231e-02 2.00643005e-01 3.61602905e-...,1
1,এইমাগিআগেথেকেবাইরেখাওয়ারঅববাস এককলায়মাগিরহয়না,-1,crime,"['এইমাগিআগেথেকেবাইরেখাওয়ারঅববাস', 'এককলায়মাগির...",[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0
2,মঝা দুইজনে করছো সাস্থি দুইজনে পেতে স্যারের দুশ...,-1,crime,"['মঝা', 'দুইজনে', 'করছো', 'সাস্থি', 'দুইজনে', ...",[-0.01142999 0.07491128 0.01610502 0.047364...,-1
3,ভিড়িও টা ভালো লাগল,0,religion,"['ভিড়িও', 'টা', 'ভালো', 'লাগল']",[-0.03091348 0.28345488 0.05813488 0.170068...,0
4,খেলোয়াড়দের বাংলাদেশের খেলোয়াড়দের সাদিনতা নাই!এ...,-1,sports,"['খেলোয়াড়দের', 'বাংলাদেশের', 'খেলোয়াড়দের', 'সা...",[-0.02181379 0.1577958 0.03383988 0.098523...,0


In [3]:
df.shape

(16262, 6)

In [4]:
df = df.drop(['hate', 'category', 'vec', 'token'], axis=1)
df.head()

Unnamed: 0,sentence,psudo_label
0,দিনের বাচ্চা কই বালই হয়ই,1
1,এইমাগিআগেথেকেবাইরেখাওয়ারঅববাস এককলায়মাগিরহয়না,0
2,মঝা দুইজনে করছো সাস্থি দুইজনে পেতে স্যারের দুশ...,-1
3,ভিড়িও টা ভালো লাগল,0
4,খেলোয়াড়দের বাংলাদেশের খেলোয়াড়দের সাদিনতা নাই!এ...,0


In [5]:
df.rename(columns={'psudo_label':'hate'}, inplace=True)
df.head()

Unnamed: 0,sentence,hate
0,দিনের বাচ্চা কই বালই হয়ই,1
1,এইমাগিআগেথেকেবাইরেখাওয়ারঅববাস এককলায়মাগিরহয়না,0
2,মঝা দুইজনে করছো সাস্থি দুইজনে পেতে স্যারের দুশ...,-1
3,ভিড়িও টা ভালো লাগল,0
4,খেলোয়াড়দের বাংলাদেশের খেলোয়াড়দের সাদিনতা নাই!এ...,0


In [6]:
df = df[df['hate']!=-1]

In [7]:
df.head()

Unnamed: 0,sentence,hate
0,দিনের বাচ্চা কই বালই হয়ই,1
1,এইমাগিআগেথেকেবাইরেখাওয়ারঅববাস এককলায়মাগিরহয়না,0
3,ভিড়িও টা ভালো লাগল,0
4,খেলোয়াড়দের বাংলাদেশের খেলোয়াড়দের সাদিনতা নাই!এ...,0
5,মাগিরে মাগির পুলারে দুইটারেই ডিম থ্যারাপি,1


In [8]:
df.shape

(14629, 2)

In [9]:
df['hate'].value_counts()

hate
0    8101
1    6528
Name: count, dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14629 entries, 0 to 16261
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  14629 non-null  object
 1   hate      14629 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 342.9+ KB


In [11]:
tokenizer = BertTokenizer.from_pretrained('Kowsher/bangla-bert')

vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

In [12]:
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [13]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['sentence'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [14]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [15]:
labels = np.zeros((len(df), 2))
labels.shape

(14629, 2)

In [16]:
labels[np.arange(len(df)), df['hate'].values] = 1 # one-hot encoded target tensor

In [17]:
labels

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [18]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(2,), dtype=tf.float64, name=None))>

In [19]:
def MapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [20]:
dataset = dataset.map(MapFunction) # converting to required format for tensorflow dataset

In [21]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(2,), dtype=tf.float64, name=None))>

In [22]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [23]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 2), dtype=tf.float64, name=None))>

In [24]:
p = 0.8
train_size = int((len(df)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.

In [25]:
train_size

731

In [26]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [27]:
len(train_dataset)

731

In [28]:
len(val_dataset)

183

In [29]:
from transformers import TFBertModel

In [30]:
model = TFBertModel.from_pretrained('Kowsher/bangla-bert', from_pt=True) # bert base model with pretrained weights

pytorch_model.bin:   0%|          | 0.00/658M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertModel were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.p

In [31]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(2, activation='sigmoid', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1643581   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   44         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                       

In [32]:
# optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)

optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.BinaryCrossentropy()
acc = tf.keras.metrics.BinaryAccuracy('accuracy')

In [33]:
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [34]:
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [38]:
model.save('hate_speech_detection_model') 

In [45]:
loaded_model = tf.keras.models.load_model('hate_speech_detection_model')

In [46]:
def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

In [47]:
input_text = ''

In [48]:
tokenizer = BertTokenizer.from_pretrained('Kowsher/bangla-bert')

In [49]:
tokenized_input_text = prepare_data(input_text, tokenizer)

In [50]:
probs = loaded_model.predict(tokenized_input_text)



In [51]:
probs

array([[0.13327132, 0.8667287 ]], dtype=float32)

In [52]:
probs[0]

array([0.13327132, 0.8667287 ], dtype=float32)

In [53]:
np.argmax(probs[0])

1

In [54]:
classes=['Not-Hate', 'Hate']

In [55]:
output = np.argmax(probs[0])
print(classes[output])

Hate


## Load Model

In [39]:
model = tf.keras.models.load_model('/kaggle/working/hate_speech_detection_model')

In [None]:
tokenizer = BertTokenizer.from_pretrained('Kowsher/bangla-bert')

In [40]:
def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['Not-Hate', 'Hate']):
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]

In [49]:
input_text = input('Enter text here: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(model, processed_data=processed_data)
print(f"Predicted: {result}")

Enter text here:  বেশ্যা মাগি


Predicted: Hate


In [69]:
%cd /kaggle/working

/kaggle/working


In [73]:
from IPython.display import FileLinks

In [74]:
FileLinks(r'hate_speech_detection_model')

In [75]:
model.save('hate_speech_detection_model.h5') 

  saving_api.save_model(


In [76]:
model.save('hate_speech_detection_model.keras') 

In [78]:
!zip -r file.zip /kaggle/working/

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/hate_speech_detection_model.keras (deflated 8%)
  adding: kaggle/working/hate_speech_detection_model/ (stored 0%)
  adding: kaggle/working/hate_speech_detection_model/assets/ (stored 0%)
  adding: kaggle/working/hate_speech_detection_model/keras_metadata.pb (deflated 96%)
  adding: kaggle/working/hate_speech_detection_model/fingerprint.pb (stored 0%)
  adding: kaggle/working/hate_speech_detection_model/saved_model.pb (deflated 92%)
  adding: kaggle/working/hate_speech_detection_model/variables/ (stored 0%)
  adding: kaggle/working/hate_speech_detection_model/variables/variables.data-00000-of-00001 (deflated 35%)
  adding: kaggle/working/hate_speech_detection_model/variables/variables.index (deflated 80%)
  adding: kaggle/working/hate_speech_detection_model.h5 (deflated 8%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)


In [81]:
!ls

file.zip		     hate_speech_detection_model.h5
hate_speech_detection_model  hate_speech_detection_model.keras


In [82]:
from IPython.display import FileLink
FileLink(r'file.zip')