In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, LayerNormalization, MultiHeadAttention, Add, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Read the sarcasm CSV into a DataFrame; later cells use its 'text' and 'label' columns.
df = pd.read_csv(filepath_or_buffer="/content/unique_sarcasm_dataset.csv")

In [None]:
def preprocess_text(text):
    """Normalise one raw text sample before tokenization.

    Every character that is not an ASCII letter or whitespace is deleted
    (not replaced by a space), then the result is lower-cased and stripped
    of leading/trailing whitespace.

    Args:
        text: The raw sample. Expected to be a ``str``.

    Returns:
        The cleaned string. Non-string input (e.g. ``None`` or the float NaN
        that pandas uses for missing cells) yields ``''`` instead of raising
        ``TypeError`` from ``re.sub``.
    """
    # Missing values reach this function when it is applied over a DataFrame
    # column; treat them as empty text rather than crashing the whole apply.
    if not isinstance(text, str):
        return ''

    # Remove non-alphabetic characters and convert to lowercase
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower().strip()

In [None]:
# Clean every sample element-wise; the 'text' column is overwritten with the normalised strings.
df['text'] = df['text'].map(preprocess_text)

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.1,
    random_state=42,
)

In [None]:
# Tokenization parameters
(vocab_size, max_length, embedding_dim) = (
    10_000,  # vocabulary size: Embedding input_dim in the model cell
    32,      # every sequence is padded/truncated to this many token ids
    128,     # width of each embedding vector (Embedding output_dim)
)

In [None]:
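# NOTE: fitting a fresh tokenizer is disabled here; a later cell loads a previously
# pickled tokenizer from 'tokenizer.pkl' instead. Kept for reference.
# tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
# tokenizer.fit_on_texts(train_texts)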

In [None]:
import pickle

# Load the tokenizer
# NOTE(security): pickle.load executes arbitrary code embedded in the file, so
# only load a 'tokenizer.pkl' that you produced yourself or otherwise trust.
try:
    with open('tokenizer.pkl', 'rb') as file:
        tokenizer = pickle.load(file)
except FileNotFoundError:
    # Nothing else in this notebook creates 'tokenizer.pkl', so a fresh
    # environment would stop here. Fall back to fitting a tokenizer on the
    # training texts (same settings as the commented-out cell above) and save
    # it so the file exists for later runs and for inference code.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_texts)
    with open('tokenizer.pkl', 'wb') as file:
        pickle.dump(tokenizer, file)

In [None]:
# Tokenization and Padding
# Convert each split's texts to integer id sequences (train first, then test).
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, test_texts)
)
# Pad or cut every sequence at its end ('post') so each has exactly max_length ids.
train_padded, test_padded = (
    pad_sequences(split_sequences, maxlen=max_length, padding='post', truncating='post')
    for split_sequences in (train_sequences, test_sequences)
)

In [None]:
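# NOTE: unused Transformer encoder block, left commented out for reference;
# the model defined further down does not use it.
# class TransformerBlock(tf.keras.layers.Layer):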
#     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
#         super(TransformerBlock, self).__init__()
#         self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
#         self.ffn = Sequential([
#             Dense(ff_dim, activation="relu"),
#             Dense(embed_dim)
#         ])
#         self.layernorm1 = LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = LayerNormalization(epsilon=1e-6)
#         self.dropout1 = Dropout(rate)
#         self.dropout2 = Dropout(rate)

#     def call(self, inputs, training=False):
#         attn_output = self.att(inputs, inputs)
#         attn_output = self.dropout1(attn_output, training=training)
#         out1 = self.layernorm1(inputs + attn_output)
#         ffn_output = self.ffn(out1)
#         ffn_output = self.dropout2(ffn_output, training=training)
#         return self.layernorm2(out1 + ffn_output)

In [None]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and shuffling are repeatable between runs (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

# Model parameters
# embedding_dim is defined once, next to vocab_size and max_length, in the
# tokenization-parameters cell; it is not redefined here to avoid two sources of truth.
lstm_units = 128  # units per LSTM direction

# Embedding layer: token ids -> dense vectors of size embedding_dim.
# The sequence length is already fixed by Input(shape=(max_length,)), so the
# deprecated `input_length` argument is not passed.
inputs = Input(shape=(max_length,))
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

# CNN layer: 128 filters over windows of 5 tokens; 'same' padding keeps the
# sequence length, then max-pooling with pool_size=2 halves it.
x = Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(x)
x = MaxPooling1D(pool_size=2)(x)

# Bidirectional LSTM layer: only the final state of each direction is kept
# (return_sequences=False). With the wrapper's default merge mode the two
# directions are concatenated, so the output size is 2 * lstm_units = 256.
x = Bidirectional(LSTM(lstm_units, return_sequences=False))(x)

# Dense layer followed by dropout for regularisation
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)

# Output layer for binary classification (sarcastic or not): one sigmoid unit
outputs = Dense(1, activation='sigmoid')(x)

# Build and compile the model
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Train for 3 epochs with one sample per gradient step (batch_size=1).
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported val_* metrics are not an independent estimate — confirm this is intended.
history = model.fit(
    x=train_padded,
    y=train_labels,
    batch_size=1,
    epochs=3,
    validation_data=(test_padded, test_labels),
)

Epoch 1/3
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 38ms/step - accuracy: 0.5150 - loss: 0.6575 - val_accuracy: 1.0000 - val_loss: 0.0098
Epoch 2/3
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.9831 - loss: 0.0450 - val_accuracy: 1.0000 - val_loss: 0.0191
Epoch 3/3
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step - accuracy: 1.0000 - loss: 0.0037 - val_accuracy: 0.9500 - val_loss: 0.1834


In [None]:
def predict_sarcasm(sentences, as_labels=False, threshold=0.5):
    """Score sentences with the trained sarcasm model.

    Each sentence goes through the same pipeline as the training data:
    ``preprocess_text`` -> ``tokenizer.texts_to_sequences`` -> ``pad_sequences``
    (post-padding/truncation to ``max_length``) -> ``model.predict``.

    Args:
        sentences: An iterable of raw sentence strings. A single ``str`` is
            also accepted and treated as one sentence.
        as_labels: When False (default) the raw output of ``model.predict`` is
            returned, exactly as before. When True, each score is mapped to
            ``"Sarcastic"`` or ``"Not Sarcastic"``.
        threshold: Score strictly above which a sentence is labelled
            ``"Sarcastic"``. Only used when ``as_labels`` is True.

    Returns:
        The array returned by ``model.predict`` (one sigmoid score per
        sentence for the model built in this notebook), or a list of label
        strings when ``as_labels`` is True.
    """
    # A bare string would otherwise be iterated character by character and
    # produce one meaningless prediction per character.
    if isinstance(sentences, str):
        sentences = [sentences]

    # Preprocess and tokenize the input sentences
    cleaned = [preprocess_text(sentence) for sentence in sentences]
    sequences = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

    # Predicting sarcasm
    predictions = model.predict(padded)
    if not as_labels:
        return predictions

    # The original label mapping sat after an unconditional return and could
    # never run; it is now reachable through as_labels=True.
    return ["Sarcastic" if score > threshold else "Not Sarcastic" for score in predictions.ravel()]

# Example usage: score two hand-written sentences and print the raw model output
new_sentences = [
    "this video is fantastic and enjoyable",
    "Oh fanastic another Monday doing this. Just what I wanted to do",
]
predictions = predict_sarcasm(new_sentences)
print(predictions)

NameError: name 'preprocess_text' is not defined

Storing the Model Parameters

In [None]:
# Persist the trained model (architecture and weights) to an HDF5 (.h5) file.
# NOTE(review): recent Keras versions treat .h5 as a legacy format and suggest
# the native '.keras' format — confirm which one downstream loaders expect.
model.save(filepath='new_model_sarcasm.h5')

