# Import Libs & Loading DataSet

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

In [27]:
# Read only the text and label columns; any other column in the CSV is skipped.
df = pd.read_csv(
    '/content/Sarcasm.csv',
    usecols=['tweet', 'sarcastic'],
)
# Show a few random rows as a quick sanity check of the loaded frame.
df.sample(5)

Unnamed: 0,tweet,sarcastic
401,If only people would care as much about povert...,1
2738,"hey twitter, how do you twitter now a days.",0
3107,Xmas shopping all done thank god 😅,0
3297,I am so ridiculously proud of the job I do!💜,0
3231,Hannah Montana pandora station is so underrated,0


# EDA

In [28]:
# Number of missing values in each column of the loaded frame.
missing_per_column = df.isnull().sum()
missing_per_column

Unnamed: 0,0
tweet,1
sarcastic,0


In [29]:
# How many rows carry each value of the 'sarcastic' label.
class_counts = df['sarcastic'].value_counts()
class_counts

Unnamed: 0_level_0,count
sarcastic,Unnamed: 1_level_1
0,2601
1,867


# Using Transformer Model

In [30]:
# Checkpoint name and training hyper-parameters shared by the cells below.
model_name = 'distilbert-base-uncased'
batch_size = 16    # examples per batch in the tf.data pipelines
num_epochs = 10    # upper bound on training epochs passed to model.fit
max_length = 128   # padding/truncation length handed to the tokenizer

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize Func

In [31]:
def tokenize_data(text_list, max_len):
    """Encode texts with the notebook-level ``tokenizer``.

    Args:
        text_list: Texts to encode; passed straight through to the tokenizer.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for these options (TensorFlow tensors
        are requested via ``return_tensors='tf'``).
    """
    encode_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='tf',
    )
    return tokenizer(text_list, **encode_options)

# The null check above reports one missing tweet. `astype(str)` would turn it
# into the literal text "nan" and train on it with a real label, so drop the
# row first. Texts and labels are both taken from the same cleaned frame to
# keep them aligned.
df_clean = df.dropna(subset=['tweet'])

# `.tolist()` hands the tokenizer a plain Python list of strings.
X_tokenize = tokenize_data(df_clean['tweet'].astype(str).tolist(), max_length)

y = df_clean['sarcastic'].values

# Data Splitting & Train_test_split

In [32]:
# `train_test_split` is already imported in the imports cell at the top.

# Convert the tokenizer output to NumPy arrays so scikit-learn can index them.
input_ids = np.array(X_tokenize['input_ids'])
attention_masks = np.array(X_tokenize['attention_mask'])

# The classes are imbalanced (see the value counts above), so stratify on the
# label to keep the same class ratio in the training and validation splits.
x_train_ids, x_val_ids, x_train_mask, x_val_mask, y_train, y_val = train_test_split(
    input_ids,
    attention_masks,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Train & Val DataSet Preparation

In [33]:
# Feed the attention mask together with the token ids. Every sequence is padded
# to `max_length`, and without the mask the model attends to the padding too.
train_features = {'input_ids': x_train_ids, 'attention_mask': x_train_mask}

train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, y_train))
    .shuffle(len(x_train_ids))  # buffer spans the whole split, so the shuffle is uniform
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [34]:
# Feed the attention mask together with the token ids so the padded positions
# are ignored during validation as well.
val_features = {'input_ids': x_val_ids, 'attention_mask': x_val_mask}

val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_features, y_val))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [35]:
# Dependencies for this notebook; uncomment and run once if they are not installed.
# !pip install transformers==4.49.0 tensorflow==2.20.0 tf-keras

In [36]:
# Pre-trained checkpoint (`model_name`) with a fresh 2-class
# sequence-classification head on top.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

# Model Compilation & Training

In [37]:
import tf_keras

# Take the optimizer AND the loss from `tf_keras`. The original mixed a
# `tf.keras` loss with a `tf_keras` optimizer; with the TensorFlow version
# pinned in this notebook those are different Keras implementations.
# NOTE(review): presumably `tf_keras` is the Keras backing the transformers
# TF model (hence the `tf-keras` install above) - confirm.
optimizer = tf_keras.optimizers.Adam(learning_rate=5e-5)
# The model outputs raw logits, hence from_logits=True; labels are integer ids.
loss = tf_keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

In [38]:
from tf_keras.callbacks import EarlyStopping

# Stop when validation accuracy has not improved for 3 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# Fine-tune for at most `num_epochs` epochs, validating after each one.
fit_options = dict(
    epochs=num_epochs,
    validation_data=val_dataset,
    callbacks=[early_stopping],
)
history = model.fit(train_dataset, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


# Sarcasm Detection Func

In [39]:
def sarcasm_detection(sentence, model, tokenizer, max_len=None):
    """Classify one sentence as sarcastic or not.

    Args:
        sentence: Text to classify.
        model: Callable classifier; ``model(encoding)[0]`` must hold the
            logits, one row per input and one column per class.
        tokenizer: Callable tokenizer used to encode ``sentence``.
        max_len: Padding/truncation length. When omitted, the notebook-level
            ``max_length`` is used so inference is tokenized like training.

    Returns:
        ``"Sarcastic"`` when class 1 scores highest, otherwise
        ``"Not Sarcastic"``. Only the first row of the output is inspected.
    """
    if max_len is None:
        max_len = max_length

    encoded = tokenizer(
        sentence,
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='tf',
    )

    logits = model(encoded)[0]
    # Softmax is monotonic, so the argmax of the logits is the predicted class;
    # no need to compute probabilities just to pick the winner.
    predicted_class = int(np.argmax(np.asarray(logits), axis=-1)[0])

    # The training label column 'sarcastic' is 1 for sarcastic tweets, so
    # class 1 - not class 0 - is the sarcastic class.
    return "Sarcastic" if predicted_class == 1 else "Not Sarcastic"

# Test Run

In [40]:
# Smoke-test the helper on a hand-written example sentence.
test1 = (
    "Oh great, another Monday! "
    "I just love waking up early after the weekend."
)
prediction = sarcasm_detection(test1, model, tokenizer)
print(prediction)

Sarcastic


# Saving Model

In [42]:
import pickle

# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the original left the handle from open(...) unclosed.
# NOTE(review): pickle files run arbitrary code when loaded, so only load this
# file from a trusted source. Consider the model's own `save_pretrained(...)`
# for a portable checkpoint - confirm which format downstream code expects.
with open('bert_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)