<a href="https://colab.research.google.com/github/B-Sumanth/2203A51551_NLP/blob/main/2203A51551_NLP_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
from keras.datasets import imdb
from keras.preprocessing import sequence
# Load the dataset
vocab_size = 20000  # Consider top 20,000 words in the vocabulary
maxlen = 200  # Limit the sequence length to 200

# Load data from the IMDB dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

# Preprocess the input: pad the sequences to ensure uniform input size
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (25000, 200)
X_test shape: (25000, 200)


In [3]:
# Further split the training set into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

Training data shape: (20000, 200)
Validation data shape: (5000, 200)


In [4]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Dropout

# Define the GRU model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen))
model.add(GRU(units=128, return_sequences=False))  # One GRU layer
model.add(Dropout(0.5))  # Regularization to prevent overfitting
model.add(Dense(units=1, activation='sigmoid'))  # Binary classification output

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()



In [5]:
# Train the model
batch_size = 64
epochs = 5

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val))

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 574ms/step - accuracy: 0.6496 - loss: 0.5919 - val_accuracy: 0.8500 - val_loss: 0.3487
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 588ms/step - accuracy: 0.8991 - loss: 0.2621 - val_accuracy: 0.8718 - val_loss: 0.3143
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 583ms/step - accuracy: 0.9535 - loss: 0.1362 - val_accuracy: 0.8754 - val_loss: 0.3532
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 577ms/step - accuracy: 0.9755 - loss: 0.0753 - val_accuracy: 0.8740 - val_loss: 0.3757
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 611ms/step - accuracy: 0.9883 - loss: 0.0414 - val_accuracy: 0.8574 - val_loss: 0.5242


## Text generation would require an alternative dataset and architecture (sequence-to-sequence or character model)



In [6]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 65ms/step - accuracy: 0.8491 - loss: 0.5770
Test Accuracy: 84.80%
