In [None]:
# pip install tensorflow==2.15.0

In [None]:
# pip install transformers==4.35.0

In [1]:
import transformers
print(transformers.__version__)

4.35.0


  _torch_pytree._register_pytree_node(


In [2]:
import tensorflow
print(tensorflow.__version__)

2.15.0


In [3]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer

  _torch_pytree._register_pytree_node(


In [4]:
df = pd.read_csv('/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv')

In [32]:
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,cleaned_tweet,label
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,rt mayasolovely as a woman you shouldnt compla...,2
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt mleew17 boy dats coldtyga dwn bad for cuffi...,1
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt urkindofbrand dawg rt 80sbaby4life you ever...,1
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt c_g_anderson viva_based she look like a tranny,1
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt shenikaroberts the shit you hear about me m...,1


In [34]:
df['class'].unique()

array([2, 1, 0])

In [31]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [7]:
def preprocess_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"&amp;", "&", text) 
    text = re.sub(r"[^\w\s]", "", text) 
    text = text.strip()  
    return text.lower()

df['cleaned_tweet'] = df['tweet'].apply(preprocess_text)

In [35]:
df['label'] = df['class']

In [36]:
X = df['cleaned_tweet']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [37]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data(texts, labels):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=128)
    return encodings, labels

train_encodings, train_labels = tokenize_data(X_train, y_train)
val_encodings, val_labels = tokenize_data(X_val, y_val)
test_encodings, test_labels = tokenize_data(X_test, y_test)

In [38]:
def create_tf_dataset(encodings, labels, batch_size=16):
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        labels
    ))

    dataset = dataset.shuffle(len(labels)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

train_dataset = create_tf_dataset(train_encodings, train_labels)
val_dataset = create_tf_dataset(val_encodings, val_labels)
test_dataset = create_tf_dataset(test_encodings, test_labels)

In [39]:
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [41]:
optimizer = Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [42]:
early_stopping = EarlyStopping(
    monitor='val_loss',  
    patience=3,          
    restore_best_weights=True
)

In [43]:
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20, 
    callbacks=[early_stopping]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [44]:
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.90498286485672


In [48]:
example_statements = [
    "You should go to hell.",  # Hate speech
    "What an idiot you are!",  # Offensive language
    "Have a nice day!",  # Neutral/Non-offensive
]

inputs = tokenizer(example_statements, padding=True, truncation=True, return_tensors="tf")
predictions = model(inputs).logits
predicted_classes = tf.argmax(predictions, axis=1)

for statement, pred_class in zip(example_statements, predicted_classes.numpy()):
    print(f"Statement: {statement}")
    print(f"Predicted class: {pred_class}")
    print()

Statement: You should go to hell.
Predicted class: 1

Statement: What an idiot you are!
Predicted class: 1

Statement: Have a nice day!
Predicted class: 2

