In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import nltk
# NLTK data needed by word_tokenize and the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

# Experiment settings, kept together so they are easy to find and change.
DATA_PATH = "/content/hate_speech_dataset.csv"
SAMPLE_SIZE = 2000
SAMPLE_SEED = 40
TEST_FRACTION = 0.2
SPLIT_SEED = 42

df1 = pd.read_csv(DATA_PATH)
# Never ask for more rows than the file holds: sample() raises ValueError
# when n exceeds the population and replace is False.
df = df1.sample(n=min(SAMPLE_SIZE, len(df1)), random_state=SAMPLE_SEED)
train_df, test_df = train_test_split(df, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
# Columns are added to both frames later; independent copies make those
# assignments unambiguous and avoid SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()


# Function for text cleaning and preprocessing
# NLTK resources are built once, on first use, and shared by every call.
_STOP_WORDS = None
_STEMMER = None

# Matches http(s):// and www. links up to the next whitespace character.
_URL_PATTERN = re.compile(r'(?:https?://|www\.)\S+', re.IGNORECASE)


def _nlp_resources():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess_text(text):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps: decode HTML entities, drop links, strip every character that is
    not an ASCII letter, digit or whitespace, tokenize, lowercase, remove
    English stop words, and Porter-stem what is left.

    NOTE: this notebook defines preprocess_text in more than one cell; the
    definitions must stay identical so that training and test text are
    cleaned the same way.

    Args:
        text: Raw tweet text. Any non-string value (for example NaN from a
            missing cell) is treated as empty.

    Returns:
        str: The processed tokens joined by single spaces; '' for
        non-string input.
    """
    if not isinstance(text, str):
        return ''

    import html  # standard library; local so the cell needs no extra import line

    # Decode entities such as "&#8220;" first. Stripping "&#;" on its own
    # leaves the bare digits glued to the following word.
    text = html.unescape(text)
    # Remove links before punctuation is stripped, otherwise they collapse
    # into junk tokens such as "httptco...".
    text = _URL_PATTERN.sub(' ', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    stop_words, stemmer = _nlp_resources()
    lowered = (token.lower() for token in word_tokenize(text))
    return ' '.join(stemmer.stem(token) for token in lowered if token not in stop_words)

# Run the cleaning pipeline over every raw training tweet and store the
# result next to the original text.
cleaned_train_tweets = train_df['tweet'].apply(preprocess_text)
train_df['processed_tweet'] = cleaned_train_tweets

# Print the label column alongside the cleaned text.
print(train_df[['class', 'processed_tweet']])




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


       class                                    processed_tweet
18831      1  rt dakwhi kaewhy270 peopl forget logic behind ...
1132       1  8220beardedgemini 8220ginasanabria hoe yah nig...
14616      1  rt causewereguy your littl bitch httptco92flic...
848        2  mt commiss gouach tiki wahin monkey tube surf ...
11497      1  ever saw kendal jone person id kill sight hate...
...      ...                                                ...
6957       1  salome110thebe1 what good bitch ass nigga catc...
19067      2  rt grind2tim dont think even real relationship...
9948       1                     how bitch wear high heal right
2558       1          barbod6fcb u lame bitch get fuck gay shit
5216       2                            yellabeautykc home coon

[1600 rows x 2 columns]


BERT embeddings

In [None]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# One checkpoint name drives both the tokenizer and the encoder.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# Tokenize and get BERT embeddings
def get_bert_embeddings(text):
    """Encode one text with BERT and return its per-token hidden states.

    The text is truncated or padded to 128 tokens and sent through
    ``bert_model`` as a batch of one.

    NOTE: this notebook defines get_bert_embeddings in more than one cell;
    the definitions must stay identical so that training and test features
    are comparable.

    Args:
        text: The (already preprocessed) text to embed.

    Returns:
        The model's ``last_hidden_state`` tensor for the single padded
        sequence, with the batch axis of size one still in front.
    """
    # Call the tokenizer rather than .encode(): .encode() returns only the
    # input ids, whereas the call form also returns the attention mask that
    # tells BERT which positions are padding.
    encoded = tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

    # Without the mask the model attends to every [PAD] position, which
    # contaminates the hidden states of the real tokens.
    outputs = bert_model(encoded)

    return outputs['last_hidden_state']

# Embed every cleaned training tweet; each cell of the new column holds one
# tensor returned by get_bert_embeddings.
train_embeddings = train_df['processed_tweet'].apply(get_bert_embeddings)
train_df['bert_embeddings'] = train_embeddings

# Print labels, cleaned text and embeddings side by side.
print(train_df[['class', 'processed_tweet', 'bert_embeddings']])


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

       class                                    processed_tweet  \
18831      1  rt dakwhi kaewhy270 peopl forget logic behind ...   
1132       1  8220beardedgemini 8220ginasanabria hoe yah nig...   
14616      1  rt causewereguy your littl bitch httptco92flic...   
848        2  mt commiss gouach tiki wahin monkey tube surf ...   
11497      1  ever saw kendal jone person id kill sight hate...   
...      ...                                                ...   
6957       1  salome110thebe1 what good bitch ass nigga catc...   
19067      2  rt grind2tim dont think even real relationship...   
9948       1                     how bitch wear high heal right   
2558       1          barbod6fcb u lame bitch get fuck gay shit   
5216       2                            yellabeautykc home coon   

                                         bert_embeddings  
18831  (((tf.Tensor(-0.34693128, shape=(), dtype=floa...  
1132   (((tf.Tensor(-0.37257406, shape=(), dtype=floa...  
14616  (((tf.Tenso

Passing the BERT embeddings through the CNN and BiGRU channels

In [None]:
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional, GRU, Concatenate, Dense
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np

# Fix the Python, NumPy and TensorFlow seeds so that weight initialisation
# and batch shuffling repeat from run to run.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Width of the softmax layer; labels must be integers in [0, NUM_CLASSES).
NUM_CLASSES = 3

# Each stored embedding carries a leading batch axis; the remaining axes
# define the per-sample input shape of the classifier.
bert_embeddings_shape = train_df['bert_embeddings'].iloc[0].shape
bert_input = Input(shape=bert_embeddings_shape[1:], name="bert_input")

# Channel 1 (BERT+CNN): 1-D convolution over the token axis, max-pooled to a
# fixed-size vector.
conv_output = Conv1D(filters=64, kernel_size=3, activation='relu')(bert_input)
cnn_output = GlobalMaxPooling1D()(conv_output)

# Channel 2 (BERT+biGRU): bidirectional GRU returning every time step,
# max-pooled to a fixed-size vector.
bi_gru_output = Bidirectional(GRU(32, return_sequences=True))(bert_input)
gru_output = GlobalMaxPooling1D()(bi_gru_output)

# Merge both channels, then classify.
concatenated_features = Concatenate()([cnn_output, gru_output])
fc_layer = Dense(units=128, activation='relu')(concatenated_features)
output_layer = Dense(units=NUM_CLASSES, activation='softmax')(fc_layer)

model = Model(inputs=bert_input, outputs=output_layer)
model.summary()

# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stack the per-row tensors into one array and drop the per-row batch axis.
bert_embeddings_array = np.stack(train_df['bert_embeddings'].values)[:, 0, :, :]

# Fail early with a readable message instead of an opaque loss error when a
# label falls outside the softmax range.
train_labels = train_df['class'].to_numpy()
if train_labels.min() < 0 or train_labels.max() >= NUM_CLASSES:
    raise ValueError(
        f"'class' labels must lie in [0, {NUM_CLASSES - 1}]; "
        f"found range [{train_labels.min()}, {train_labels.max()}]"
    )

# Keep the History object so the training curves can be inspected afterwards.
history = model.fit(
    x=bert_embeddings_array,
    y=train_labels,
    epochs=5,
    batch_size=32,
    validation_split=0.2
)




Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 bert_input (InputLayer)     [(None, 128, 768)]           0         []                            
                                                                                                  
 conv1d_4 (Conv1D)           (None, 126, 64)              147520    ['bert_input[0][0]']          
                                                                                                  
 bidirectional_4 (Bidirecti  (None, 128, 64)              153984    ['bert_input[0][0]']          
 onal)                                                                                            
                                                                                                  
 global_max_pooling1d_8 (Gl  (None, 64)                   0         ['conv1d_4[0][0]']      

<keras.src.callbacks.History at 0x79778ff09540>

Test set preprocessing

In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import nltk
# NLTK data needed by word_tokenize and the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

# Experiment settings, kept together so they are easy to find and change.
DATA_PATH = "/content/hate_speech_dataset.csv"
SAMPLE_SIZE = 2000
SAMPLE_SEED = 40
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# Same file and the same two seeds as the first load cell of this notebook,
# so the split is reproduced deterministically. Note that this rebinds
# train_df and test_df to fresh frames without any derived columns.
df1 = pd.read_csv(DATA_PATH)
# Never ask for more rows than the file holds: sample() raises ValueError
# when n exceeds the population and replace is False.
df = df1.sample(n=min(SAMPLE_SIZE, len(df1)), random_state=SAMPLE_SEED)
train_df, test_df = train_test_split(df, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
# Columns are added to test_df later; independent copies make those
# assignments unambiguous and avoid SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()


# Function for text cleaning and preprocessing
# NLTK resources are built once, on first use, and shared by every call.
_STOP_WORDS = None
_STEMMER = None

# Matches http(s):// and www. links up to the next whitespace character.
_URL_PATTERN = re.compile(r'(?:https?://|www\.)\S+', re.IGNORECASE)


def _nlp_resources():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess_text(text):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps: decode HTML entities, drop links, strip every character that is
    not an ASCII letter, digit or whitespace, tokenize, lowercase, remove
    English stop words, and Porter-stem what is left.

    NOTE: this notebook defines preprocess_text in more than one cell; the
    definitions must stay identical so that training and test text are
    cleaned the same way.

    Args:
        text: Raw tweet text. Any non-string value (for example NaN from a
            missing cell) is treated as empty.

    Returns:
        str: The processed tokens joined by single spaces; '' for
        non-string input.
    """
    if not isinstance(text, str):
        return ''

    import html  # standard library; local so the cell needs no extra import line

    # Decode entities such as "&#8220;" first. Stripping "&#;" on its own
    # leaves the bare digits glued to the following word.
    text = html.unescape(text)
    # Remove links before punctuation is stripped, otherwise they collapse
    # into junk tokens such as "httptco...".
    text = _URL_PATTERN.sub(' ', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    stop_words, stemmer = _nlp_resources()
    lowered = (token.lower() for token in word_tokenize(text))
    return ' '.join(stemmer.stem(token) for token in lowered if token not in stop_words)

# Run the cleaning pipeline over every raw test tweet and store the result
# next to the original text.
cleaned_test_tweets = test_df['tweet'].apply(preprocess_text)
test_df['processed_tweet'] = cleaned_test_tweets

# Print the label column alongside the cleaned text.
print(test_df[['class', 'processed_tweet']])




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


       class                                    processed_tweet
24708      1                                   u sell dat pussi
9191       1                                 free hoe 1x 128520
5841       1                 denythepropheci that dragon retard
1532       1  8220stevestfl relationship end life stop whini...
22468      1  walk parti playin nelli air forc one walk outt...
...      ...                                                ...
20479      1                               readi bounc di bitch
12666      1       lo bitch told name start could fucka b c got
1202       1  8220condeezy3 gamehom alon noth dosomeon come ...
21126      1                   spell name right bitch well talk
13208      1            naw bitch goin monster lol lyric maniac

[400 rows x 2 columns]


Test set BERT embeddings

In [None]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# One checkpoint name drives both the tokenizer and the encoder.
# Running this cell rebinds `tokenizer` and `bert_model` to freshly loaded objects.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# Tokenize and get BERT embeddings
def get_bert_embeddings(text):
    """Encode one text with BERT and return its per-token hidden states.

    The text is truncated or padded to 128 tokens and sent through
    ``bert_model`` as a batch of one.

    NOTE: this notebook defines get_bert_embeddings in more than one cell;
    the definitions must stay identical so that training and test features
    are comparable.

    Args:
        text: The (already preprocessed) text to embed.

    Returns:
        The model's ``last_hidden_state`` tensor for the single padded
        sequence, with the batch axis of size one still in front.
    """
    # Call the tokenizer rather than .encode(): .encode() returns only the
    # input ids, whereas the call form also returns the attention mask that
    # tells BERT which positions are padding.
    encoded = tokenizer(
        text,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

    # Without the mask the model attends to every [PAD] position, which
    # contaminates the hidden states of the real tokens.
    outputs = bert_model(encoded)

    return outputs['last_hidden_state']

# Embed every cleaned test tweet; each cell of the new column holds one
# tensor returned by get_bert_embeddings.
test_embeddings = test_df['processed_tweet'].apply(get_bert_embeddings)
test_df['bert_embeddings'] = test_embeddings

# Print labels, cleaned text and embeddings side by side.
print(test_df[['class', 'processed_tweet', 'bert_embeddings']])


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Class ids the classifier can emit: one per softmax column, as produced by
# the argmax below.
class_labels = [0, 1, 2]

# Stack the per-row tensors from test_df['bert_embeddings'] into one array
# and drop the per-row batch axis.
bert_embeddings_array_test = np.stack(test_df['bert_embeddings'].values)[:, 0, :, :]

# Class probabilities per sample, then the most likely class id.
predictions = model.predict(bert_embeddings_array_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = test_df['class'].to_numpy()

# With average=None scikit-learn returns one score per label *present* in
# the data. Pinning labels= guarantees exactly one score per class id, in
# order, even when a class is missing from both the test sample and the
# predictions; zero_division=0 reports such classes as 0.0 without warnings.
metric_options = dict(labels=class_labels, average=None, zero_division=0)
precision_scores = precision_score(true_classes, predicted_classes, **metric_options)
recall_scores = recall_score(true_classes, predicted_classes, **metric_options)
f1_scores = f1_score(true_classes, predicted_classes, **metric_options)

# Per-class summary; scores and labels are aligned by the labels= argument above.
for class_label, precision, recall, f1 in zip(class_labels, precision_scores, recall_scores, f1_scores):
    print(f"Class {class_label}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1:.4f}")

# Full report, including support counts and averaged scores.
print("\nClassification Report:")
print(classification_report(true_classes, predicted_classes, labels=class_labels, zero_division=0))
