In [16]:
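# %pip install tensorflow==2.15.0  # NOTE(review): the recorded run below reports tensorflow 2.18.0, not this pin; confirm which version is intended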

In [17]:
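# %pip install transformers==4.35.0  # NOTE(review): the recorded run below reports transformers 4.44.2, not this pin; confirm which version is intended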

In [None]:
# Show which transformers release the kernel actually loaded.
import transformers

transformers_version = transformers.__version__
print(transformers_version)

4.44.2


In [None]:
# Show which tensorflow release the kernel actually loaded.
import tensorflow

tensorflow_version = tensorflow.__version__
print(tensorflow_version)

2.18.0


In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer

In [21]:
# Load the tweet dataset; the path is relative to the notebook's working directory.
DATA_PATH = '../Data/Dataset_2.csv'
df = pd.read_csv(DATA_PATH)

In [22]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [23]:
# Distinct values of the `class` column, in order of first appearance.
pd.unique(df['class'])

array([2, 1, 0], dtype=int64)

In [24]:
# Drop the 'Unnamed: 0' column. Re-binding `df` (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op instead of a KeyError on the already-removed column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [25]:
# Summarise columns, non-null counts and dtypes after the column drop above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   count               100 non-null    int64 
 1   hate_speech         100 non-null    int64 
 2   offensive_language  100 non-null    int64 
 3   neither             100 non-null    int64 
 4   class               100 non-null    int64 
 5   tweet               100 non-null    object
dtypes: int64(5), object(1)
memory usage: 4.8+ KB


In [None]:
def preprocess_text(text):
    """Normalise a raw tweet before tokenization.

    Steps, in order: remove URLs, decode HTML entities, delete every character
    that is neither a word character nor whitespace, trim both ends, lowercase.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The cleaned, lower-cased string. Inner whitespace is not collapsed.
    """
    import html  # local import keeps this cell self-contained

    text = re.sub(r"http\S+", "", text)
    # Decode every HTML entity, not only "&amp;": numeric entities such as
    # "&#128514;" would otherwise survive the punctuation strip below as stray
    # digits ("128514"), and "&lt;"/"&gt;" as "lt"/"gt".
    text = html.unescape(text)
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip().lower()

# Clean every tweet element-wise and store the result next to the raw text.
df['cleaned_tweet'] = df['tweet'].map(preprocess_text)

In [27]:
# Expose the `class` column under the name `label`, which the split cell reads as the target.
df['label'] = df['class']

In [None]:
# Features are the cleaned tweets; targets are the integer labels.
X, y = df['cleaned_tweet'], df['label']

# First hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# ... then carve 10% of the remaining training rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_data(texts, labels):
    """Tokenize ``texts`` and hand back the encodings together with ``labels``.

    The texts are materialised as a list and encoded in a single tokenizer call
    with truncation and padding enabled and ``max_length=128``. ``labels`` is
    returned unchanged.
    """
    text_list = list(texts)
    batch_encoding = tokenizer(
        text_list,
        truncation=True,
        padding=True,
        max_length=128,
    )
    return batch_encoding, labels


# Encode each split separately.
train_encodings, train_labels = tokenize_data(X_train, y_train)
val_encodings, val_labels = tokenize_data(X_val, y_val)
test_encodings, test_labels = tokenize_data(X_test, y_test)



In [None]:
def create_tf_dataset(encodings, labels, batch_size=16, shuffle=True):
    """Build a batched, prefetched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        encodings: Tokenizer output; converted with ``dict(...)`` so every key
            becomes one entry of the per-example feature dict.
        labels: Labels aligned one-to-one with the encoded examples.
        batch_size: Number of examples per batch.
        shuffle: When True (the default, which matches the previous behaviour)
            the examples are shuffled with a buffer covering the whole dataset.
            Pass False for validation/test data so the example order is stable
            and predictions can be matched back to their labels.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        labels
    ))

    if shuffle:
        # Buffer size equal to the dataset size shuffles across all examples.
        dataset = dataset.shuffle(len(labels))

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Only the training data needs shuffling; evaluation splits keep their order.
train_dataset = create_tf_dataset(train_encodings, train_labels)
val_dataset = create_tf_dataset(val_encodings, val_labels, shuffle=False)
test_dataset = create_tf_dataset(test_encodings, test_labels, shuffle=False)

In [31]:
# Pretrained DistilBERT with a 3-way classification head (one output per value of `class`).
MODEL_CHECKPOINT = "distilbert-base-uncased"
NUM_CLASSES = 3
model = TFDistilBertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_CLASSES,
)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [57]:
# Fine-tuning learning rate. The optimizer object must be passed to compile():
# the string 'adam' builds a fresh Adam with the default learning rate and
# silently ignores the value configured here.
# NOTE(review): the recorded run used TF 2.18; if compile() rejects this
# optimizer object there, the Keras used by the model probably differs from
# tf.keras. Confirm against the pinned versions at the top of the notebook.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    # The model outputs raw logits, hence from_logits=True.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [58]:
# Early-stopping callback: watches validation loss with a patience of 3 epochs
# and restores the best weights seen when it stops training.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [59]:
# Fine-tune for up to 20 epochs. `early_stopping` is the callback created in
# the previous cell; it was previously re-created here with identical
# arguments, which was redundant.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'EarlyStopping' object has no attribute '_implements_train_batch_hooks'".
# Presumably the callback and the model come from different Keras
# implementations under the installed TF/transformers versions; confirm and
# align the environment before re-running.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
    callbacks=[early_stopping]
)

AttributeError: 'EarlyStopping' object has no attribute '_implements_train_batch_hooks'

In [None]:
# Evaluate on the held-out test split; evaluate() yields the loss followed by
# the compiled accuracy metric.
evaluation = model.evaluate(test_dataset)
test_loss, test_accuracy = evaluation
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.90498286485672


In [56]:
# Hand-written sanity-check inputs; the trailing comments are the expected categories.
example_statements = [
    "You should go to hell.",  # Hate speech
    "What an idiot you are!",  # Offensive language
    "Have a nice day!",  # Neutral/Non-offensive
]

# Run the examples through the same cleaning that was applied to the training
# tweets, so inference inputs match what the model was fine-tuned on.
cleaned_statements = [preprocess_text(statement) for statement in example_statements]

# max_length mirrors the value used when tokenizing the training data.
inputs = tokenizer(
    cleaned_statements,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="tf",
)
predictions = model(inputs).logits
# Highest-scoring logit per example is the predicted class id.
predicted_classes = tf.argmax(predictions, axis=1)

# Report against the original (uncleaned) statement for readability.
for statement, pred_class in zip(example_statements, predicted_classes.numpy()):
    print(f"Statement: {statement}")
    print(f"Predicted class: {pred_class}")
    print()

Statement: You should go to hell.
Predicted class: 1

Statement: What an idiot you are!
Predicted class: 1

Statement: Have a nice day!
Predicted class: 1

