In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Single seed shared by every stochastic step in this cell so the split is reproducible.
RANDOM_STATE = 8012023

# Load the data.
# The path is a raw string: "\D" and "\h" are not valid escape sequences, so a
# plain string literal triggers an invalid-escape warning on current Python
# versions. The resulting path value is exactly the same as before.
# NOTE(review): this is a machine-specific absolute path; consider making the
# data directory configurable.
data = pd.read_csv(r"D:\Downloads\hate_speech_dataset.csv")

# Shuffle the data
data = shuffle(data, random_state=RANDOM_STATE)

# Consider only 20 percent of the data
subset_data = data.sample(frac=0.2, random_state=RANDOM_STATE)

# Encode the labels as consecutive integers; the fitted encoder is kept so the
# original class values can be recovered later via label_encoder.classes_.
label_encoder = LabelEncoder()
subset_data['class'] = label_encoder.fit_transform(subset_data['class'])

# Split the data into train, validation, and test sets.
# First hold out 20% of the subset for testing, then take 12.5% of the remaining
# 80% for validation (0.125 * 0.8 = 10% of the subset), leaving 70% for training.
train_data, test_data = train_test_split(subset_data, test_size=0.2, random_state=RANDOM_STATE)
train_data, valid_data = train_test_split(train_data, test_size=0.125, random_state=RANDOM_STATE)

# Check the shapes of the resulting datasets
print("Train data shape:", train_data.shape)
print("Validation data shape:", valid_data.shape)
print("Test data shape:", test_data.shape)


Train data shape: (1734, 3)
Validation data shape: (248, 3)
Test data shape: (496, 3)


In [10]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, GlobalMaxPooling1D, Conv1D, Bidirectional, GRU
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Combined text of all three splits. It is no longer tokenized as a whole (see
# below); the name is kept so any other cell that refers to it keeps working.
all_texts = list(train_data['tweet']) + list(valid_data['tweet']) + list(test_data['tweet'])

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def encode_texts(texts, max_length=None):
    """Tokenize ``texts`` into fixed-width TensorFlow tensors.

    Args:
        texts: Iterable of strings to tokenize.
        max_length: Width to pad/truncate every sequence to. With ``None`` the
            tokenizer chooses the width itself (the model summary recorded in
            this notebook shows a width of 512 for this setup).

    Returns:
        The tokenizer output, containing ``'input_ids'`` and
        ``'attention_mask'`` tensors with one row per input text.
    """
    return tokenizer(list(texts),
                     add_special_tokens=True,
                     truncation=True,
                     max_length=max_length,
                     padding='max_length',
                     return_attention_mask=True,
                     return_tensors='tf')


# Tokenize and pad sequences for train data.
# Only the training texts are encoded here. Previously the validation and test
# texts were encoded along with them and then sliced away, which was wasted
# work and made `train_encoded_dict` hold more than its name says. Because
# every row is padded to the same fixed width, the training rows are unchanged.
train_encoded_dict = encode_texts(train_data['tweet'])

# Padded sequence width produced by the tokenizer; reused for the other splits
# so that all three splits share one input width.
max_length = train_encoded_dict['input_ids'].shape[1]

train_input_ids = train_encoded_dict['input_ids']
train_attention_masks = train_encoded_dict['attention_mask']

# Convert labels to TensorFlow tensors
train_labels = tf.convert_to_tensor(train_data['class'].values)

# Tokenize and pad sequences for validation data
valid_encoded_dict = encode_texts(valid_data['tweet'], max_length=max_length)
valid_input_ids = valid_encoded_dict['input_ids']
valid_attention_masks = valid_encoded_dict['attention_mask']

# Convert labels to TensorFlow tensors
valid_labels = tf.convert_to_tensor(valid_data['class'].values)

# Tokenize and pad sequences for test data
test_encoded_dict = encode_texts(test_data['tweet'], max_length=max_length)
test_input_ids = test_encoded_dict['input_ids']
test_attention_masks = test_encoded_dict['attention_mask']

# Convert labels to TensorFlow tensors
test_labels = tf.convert_to_tensor(test_data['class'].values)

# Display the shapes of the processed data
print("Train Input IDs shape:", train_input_ids.shape)
print("Train Attention Masks shape:", train_attention_masks.shape)
print("Train Labels shape:", train_labels.shape)

print("Validation Input IDs shape:", valid_input_ids.shape)
print("Validation Attention Masks shape:", valid_attention_masks.shape)
print("Validation Labels shape:", valid_labels.shape)

print("Test Input IDs shape:", test_input_ids.shape)
print("Test Attention Masks shape:", test_attention_masks.shape)
print("Test Labels shape:", test_labels.shape)

ModuleNotFoundError: No module named 'transformers'

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, GlobalMaxPooling1D, Conv1D, Bidirectional, GRU
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Load the pretrained BERT encoder here so this cell does not depend on a
# `bert_model` variable left over from another (or deleted) cell. The
# checkpoint name matches the one the tokenizer is loaded with.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Width of the padded token sequences produced by the tokenization cell.
sequence_length = train_encoded_dict['input_ids'].shape[1]

# Number of output units = number of distinct labels seen by the label encoder,
# instead of a hard-coded assumption about the dataset.
num_classes = len(label_encoder.classes_)

# Define BERT input layers
input_ids = Input(shape=(sequence_length,), dtype=tf.int32, name='input_ids')
attention_masks = Input(shape=(sequence_length,), dtype=tf.int32, name='attention_masks')

# BERT embedding layer: element [0] of the model output is used as the
# per-token sequence fed to both channels below.
bert_embedding = bert_model(input_ids, attention_mask=attention_masks)[0]

# Channel 1: CNN
cnn_layer = Conv1D(128, 3, activation='relu')(bert_embedding)
cnn_layer = GlobalMaxPooling1D()(cnn_layer)
cnn_layer = Dropout(0.2)(cnn_layer)

# Channel 2: biGRU
bigru_layer = Bidirectional(GRU(64, return_sequences=True))(bert_embedding)
bigru_layer = GlobalMaxPooling1D()(bigru_layer)
bigru_layer = Dropout(0.2)(bigru_layer)

# Concatenate the outputs from both channels
concatenated_features = Concatenate()([cnn_layer, bigru_layer])

# Fully connected layer
dense_layer = Dense(128, activation='relu')(concatenated_features)
dense_layer = Dropout(0.2)(dense_layer)

# Output layer: one softmax unit per encoded class
output_layer = Dense(num_classes, activation='softmax')(dense_layer)

# Build the model
model = Model(inputs=[input_ids, attention_masks], outputs=output_layer)

# Compile the model. Labels are integer class ids, hence the sparse loss.
# NOTE(review): 'adam' uses the optimizer's default learning rate while the
# BERT weights are part of the model; fine-tuning typically uses a much
# smaller rate -- confirm this is intended.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 512)]                0         []                            
                                                                                                  
 attention_masks (InputLaye  [(None, 512)]                0         []                            
 r)                                                                                               
                                                                                                  
 tf_bert_model (TFBertModel  multiple                     1094822   ['input_ids[0][0]',           
 )                                                        40         'attention_masks[0][0]']     
                                                                                            

In [None]:
import time
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

def calculate_f1(precision, recall):
    """Return the F1 score (harmonic mean) of ``precision`` and ``recall``.

    A small constant (1e-10) is added to the denominator so that the call
    returns 0 instead of dividing by zero when both inputs are 0.
    """
    numerator = 2 * (precision * recall)
    denominator = precision + recall + 1e-10
    return numerator / denominator

# Train the model, timing the whole fit call.
# The tensors prepared by the tokenization cell are used directly instead of
# re-slicing the encoding dictionaries.
start_time = time.time()
history = model.fit(
    x=[train_input_ids, train_attention_masks],
    y=train_labels,
    validation_data=([valid_input_ids, valid_attention_masks], valid_labels),
    epochs=5,  # Adjust the number of epochs as needed
    batch_size=32  # Adjust the batch size as needed
)
training_time = time.time() - start_time

# Evaluate the model on the test set; the elapsed time is averaged per sample.
start_time = time.time()
predictions = model.predict([test_input_ids, test_attention_masks])
inference_time = (time.time() - start_time) / len(test_data)

# Convert predictions (one probability per class) to integer class ids
predicted_labels = np.argmax(predictions, axis=1)

# Ground truth and the full set of encoded classes.
# Passing `labels` explicitly keeps the report and the confusion matrix aligned
# with `target_names` even when some class is absent from both the test labels
# and the predictions; without it classification_report raises on the size
# mismatch. Names are converted to str because the encoder's classes may not
# be strings.
true_labels = test_data['class'].values
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

# Calculate evaluation metrics
accuracy = np.sum(predicted_labels == true_labels) / len(test_data)
classification_report_result = classification_report(
    true_labels,
    predicted_labels,
    labels=class_ids,
    target_names=class_names,
    output_dict=True,
)
confusion_matrix_result = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Display evaluation metrics
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report_result)

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix_result)

# Display training and inference times
print("\nTraining Time:", training_time, "seconds")
print("Inference Time per Sample:", inference_time, "seconds")


Epoch 1/5


