In [1]:
!pip install datasets 'tensorflow==2.15'



In [None]:
import string

import numpy as np
import tensorflow as tf
from datasets import load_dataset
from gensim.models import Word2Vec
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, TextVectorization, GRU, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Choose the GRU keyword arguments once, depending on whether TensorFlow sees a GPU.
# GPU: sigmoid recurrent activation and no dropout arguments.
# No GPU: input dropout 0.15 and recurrent dropout 0.5.
if tf.config.list_physical_devices('GPU'):
    _GRU_KWARGS = {'recurrent_activation': 'sigmoid'}
else:
    _GRU_KWARGS = {'dropout': 0.15, 'recurrent_dropout': 0.5}


def GRU_LAYER(units, return_sequences=False):
    """Return a GRU layer with `units` cells and the device-dependent settings above."""
    return tf.keras.layers.GRU(units, return_sequences=return_sequences, **_GRU_KWARGS)

# Load the "imdb" dataset through Hugging Face `datasets`.
# The cells below read its 'train' and 'test' splits via the 'text' and 'label' columns.
dataset = load_dataset("imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:

# Hyperparameters shared by the preprocessing and model cells
max_vocab_size = 25000  # max_tokens passed to the TextVectorization layer
max_seq_len = 50       # output_sequence_length of the vectorizer (token ids per review)
embedding_dim = 150  # Word2Vec vector_size and width of the embedding matrix
latent_dim = 512  # Number of GRU units
output_dim = 2  # Units in the final softmax Dense layer

# Train Word2Vec embeddings using gensim.
# The reviews are normalised the same way the TextVectorization layer's default
# standardisation does it (lowercase, ASCII punctuation removed, whitespace
# split). A plain `text.split()` keeps tokens such as "Movie." or "great!",
# which never match the lowercased, punctuation-free vocabulary entries that
# are later looked up in the Word2Vec model.
_strip_punctuation = str.maketrans('', '', string.punctuation)
sentences = [
    text.lower().translate(_strip_punctuation).split()
    for text in dataset['train']['text']
]
word2vec_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4)

# Build the weight matrix for the embedding layer
def create_embedding_matrix(word_index, word2vec_model, embedding_dim):
    """Assemble a (len(word_index), embedding_dim) array of word vectors.

    Args:
        word_index: Mapping from token to its row number in the matrix.
        word2vec_model: Object whose `.wv` attribute supports `token in wv`
            and `wv[token]`.
        embedding_dim: Length of each word vector.

    Returns:
        A NumPy array in which row `i` is the vector of the token mapped to
        `i`; rows of tokens missing from `word2vec_model.wv` stay all-zero.
    """
    keyed_vectors = word2vec_model.wv
    matrix = np.zeros((len(word_index), embedding_dim))
    known_tokens = (
        (row, token) for token, row in word_index.items() if token in keyed_vectors
    )
    for row, token in known_tokens:
        matrix[row] = keyed_vectors[token]
    return matrix

# Layer that turns raw review text into fixed-length sequences of integer token ids
vectorizer = TextVectorization(
    output_sequence_length=max_seq_len,
    output_mode='int',
    max_tokens=max_vocab_size,
)

# Learn the vocabulary from the training reviews
train_reviews = dataset['train']['text']
vectorizer.adapt(train_reviews)

# Map every vocabulary token to its position, then look up its Word2Vec vector
vocab = vectorizer.get_vocabulary()
word_index = dict(zip(vocab, range(len(vocab))))
embedding_matrix = create_embedding_matrix(word_index, word2vec_model, embedding_dim)

In [None]:



# Convert the 'text' and 'label' columns of each split into tensors
train_split = dataset['train']
test_split = dataset['test']

train_texts, train_labels = (
    tf.convert_to_tensor(train_split[column]) for column in ('text', 'label')
)
test_texts, test_labels = (
    tf.convert_to_tensor(test_split[column]) for column in ('text', 'label')
)

# Map function applied to the (text, label) datasets
def preprocess_texts(text, label):
    """Vectorize `text` with the module-level `vectorizer` and one-hot encode `label`.

    Returns:
        A `(token_ids, one_hot_label)` tuple, where the label is encoded with depth 2.
    """
    return vectorizer(text), tf.one_hot(label, depth=2)

# Training pipeline.
# The shuffle buffer covers the whole split: tf.data only produces a uniform
# shuffle when the buffer is at least as large as the dataset. With a
# 10000-element buffer over 25000 examples, the first batches of every epoch
# come only from the head of the split, which is especially harmful if the
# split is stored ordered by label (NOTE(review): verify the ordering).
# Shuffling the raw examples first and vectorising whole batches afterwards
# also avoids one vectorizer call per review.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
    .shuffle(train_texts.shape[0])
    .batch(256)
    .map(preprocess_texts, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: same preprocessing, original order, no shuffling.
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_texts, test_labels))
    .batch(256)
    .map(preprocess_texts, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:

# Encoder-only architecture
class SentimentClassifier(tf.keras.Model):
    """Frozen pretrained embedding -> dropout -> GRU -> softmax classifier.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        latent_dim: Number of GRU units.
        output_dim: Number of softmax outputs.
        embedding_matrix: Array of shape (vocab_size, embedding_dim) used as
            the fixed (non-trainable) embedding weights.
    """

    def __init__(self, vocab_size, embedding_dim, latent_dim, output_dim, embedding_matrix):
        super().__init__()
        # The pretrained vectors are supplied through the documented
        # `embeddings_initializer` argument rather than the legacy `weights=`
        # keyword. mask_zero=True makes id 0 act as padding downstream.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
            mask_zero=True,
        )
        self.dropout = Dropout(0.25)
        self.gru = GRU_LAYER(latent_dim, return_sequences=False)
        self.dense = tf.keras.layers.Dense(output_dim, activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Integer token ids, one row per example.
            training: Optional flag forwarded to the dropout and GRU layers so
                their dropout is only active in training mode.

        Returns:
            The softmax output of the final Dense layer.
        """
        x = self.embedding(inputs)
        x = self.dropout(x, training=training)
        x = self.gru(x, training=training)
        return self.dense(x)

# Create the classifier from the hyperparameters and the Word2Vec matrix
sentiment_model = SentimentClassifier(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    latent_dim=latent_dim,
    output_dim=output_dim,
    embedding_matrix=embedding_matrix,
)

# Create the weights for batches of max_seq_len token ids
sentiment_model.build(input_shape=(None, max_seq_len))

# Configure training and print the layer overview
sentiment_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
sentiment_model.summary()


In [None]:

# Early stopping: stop after 3 epochs without val_loss improvement and restore
# the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# `batch_size` is intentionally not passed: train_ds and test_ds are already
# batched (256 per batch), and the argument does not apply to dataset inputs.
# NOTE(review): the test split doubles as validation data, so early stopping
# selects weights on the same data later used to judge the model. Consider
# holding out part of the training split for validation instead.
history = sentiment_model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=20,
    callbacks=[early_stopping],
)


In [None]:

# Manual testing
sample_positive = "This was the best movie I have ever seen."
sample_negative = "This was the worst movie I have ever watched."
sample_neutral = "The movie was okay, not great but not terrible."
sample_sarcasm = "Wow, this was such a masterpiece... the actors, the screenplay, I could stay for hours if it wasn't for how bad it was."
# NOTE(review): the typographic apostrophe below is not ASCII punctuation, so
# the vectorizer's default standardisation presumably leaves it in the token.
# Confirm that "couldn’t" is not simply mapped to the out-of-vocabulary id.
sample_irony = "The plot was so riveting, I couldn’t stop yawning."

# Label -> review text, in the order the results are printed
samples = {
    "Positive": sample_positive,
    "Negative": sample_negative,
    "Neutral": sample_neutral,
    "Sarcasm": sample_sarcasm,
    "Irony": sample_irony,
}

# Vectorize and score all samples as one batch instead of repeating the
# vectorize/predict/print sequence once per sample.
sample_vectors = vectorizer(tf.convert_to_tensor(list(samples.values())))
sample_predictions = sentiment_model.predict(sample_vectors)

# Each row holds the softmax outputs for one sample; column i corresponds to
# dataset label i. Slicing with row:row + 1 keeps the (1, output_dim) shape in
# the printed result.
for row, label in enumerate(samples):
    print(f"{label} Prediction:", sample_predictions[row:row + 1])
