In [1]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematical operations
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split  # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences  # for padding
from tensorflow.keras.models import Sequential, load_model  # for building and loading models
from tensorflow.keras.layers import Embedding, LSTM, Dense  # for defining layers
from tensorflow.keras.callbacks import ModelCheckpoint  # to save the model during training
import nltk
import re
import os

# Download NLTK stopwords
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

# Load the IMDB dataset
def load_dataset():
    # Read the CSV file
    df = pd.read_csv('/content/IMDB Dataset.csv')  # Adjust path if not on Colab

    # Ensure the dataset has the correct columns
    if 'review' not in df.columns or 'sentiment' not in df.columns:
        raise ValueError("The dataset must contain 'review' and 'sentiment' columns.")

    # Extract reviews and sentiments
    x_data = df['review']  # This will be a Pandas Series
    y_data = df['sentiment']  # This will be a Pandas Series

    # Preprocess reviews
    x_data = x_data.str.replace(r'<.*?>', '', regex=True)  # Remove HTML tags
    x_data = x_data.str.replace(r'[^A-Za-z\s]', '', regex=True)  # Remove non-alphabet characters
    x_data = x_data.str.lower()  # Convert to lowercase
    x_data = x_data.apply(lambda review: ' '.join([w for w in review.split() if w not in english_stops]))  # Remove stopwords

    # Encode sentiments: 'positive' -> 1, 'negative' -> 0
    y_data = y_data.replace({'positive': 1, 'negative': 0})

    return x_data, y_data

# Load and preprocess data
x_data, y_data = load_dataset()

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Function to compute the maximum sequence length
def get_max_length(data):
    # Ensure each review is a string
    if isinstance(data, pd.Series):
        review_lengths = data.apply(lambda x: len(str(x).split()))  # Convert each review to string before split
    else:
        # Handle the case when data is a list, using list comprehension
        review_lengths = [len(str(x).split()) for x in data]

    return int(np.ceil(np.mean(review_lengths)))

# Now call this function using x_train (which should be a Pandas Series)
max_length = get_max_length(x_train)

# Encode reviews using a tokenizer
tokenizer = Tokenizer(num_words=5000, lower=True)  # Use only top 5000 words
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Pad sequences to ensure consistent input length
max_length = get_max_length(x_train)
x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

# Get vocabulary size for the embedding layer
total_words = len(tokenizer.word_index) + 1  # Include padding token

# Define the model
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=EMBED_DIM, input_length=max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display model summary
print(model.summary())

# Ensure the 'models' directory exists
os.makedirs('models', exist_ok=True)

# Define a checkpoint callback to save the best model
checkpoint = ModelCheckpoint(
    filepath='models/LSTM.keras',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

# Train the model
model.fit(
    x_train,
    y_train,
    validation_split=0.1,  # Use 10% of training data for validation
    batch_size=128,
    epochs=5,
    callbacks=[checkpoint]
)

# Evaluate the model
y_pred = (model.predict(x_test, batch_size=128) > 0.5).astype("int32")

# Calculate accuracy
correct_predictions = np.sum(y_pred.flatten() == y_test.values)
accuracy = correct_predictions / len(y_test) * 100

print(f'Correct Predictions: {correct_predictions}')
print(f'Wrong Predictions: {len(y_test) - correct_predictions}')
print(f'Accuracy: {accuracy:.2f}%')

# Load the best saved model
loaded_model = load_model('models/LSTM.keras')

# Function for predicting sentiment of a new review
def predict_sentiment(review):
    # Pre-process input
    review = re.sub(r'<.*?>', '', review)  # Remove HTML tags
    review = re.sub(r'[^a-zA-Z\s]', '', review)  # Remove non-alphabet characters
    review = ' '.join([w for w in review.split() if w.lower() not in english_stops])  # Remove stopwords
    review = review.lower()  # Convert to lowercase

    print('Cleaned Review:', review)

    # Tokenize and pad the review
    tokenize_words = tokenizer.texts_to_sequences([review])
    tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')

    # Predict sentiment
    result = loaded_model.predict(tokenize_words)[0][0]
    print('Prediction Probability:', result)

    if result >= 0.5:
        print('Sentiment: Positive')
    else:
        print('Sentiment: Negative')

# Test the function
review = input('Enter a movie review: ')
predict_sentiment(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  y_data = y_data.replace({'positive': 1, 'negative': 0})


None
Epoch 1/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step - accuracy: 0.6145 - loss: 0.6166
Epoch 1: val_accuracy improved from -inf to 0.81275, saving model to models/LSTM.keras
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 200ms/step - accuracy: 0.6149 - loss: 0.6162 - val_accuracy: 0.8127 - val_loss: 0.4227
Epoch 2/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step - accuracy: 0.8751 - loss: 0.3138
Epoch 2: val_accuracy improved from 0.81275 to 0.86650, saving model to models/LSTM.keras
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 208ms/step - accuracy: 0.8751 - loss: 0.3138 - val_accuracy: 0.8665 - val_loss: 0.3418
Epoch 3/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step - accuracy: 0.8931 - loss: 0.2758
Epoch 3: val_accuracy improved from 0.86650 to 0.87250, saving model to models/LSTM.keras
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [