In [2]:
# Cell 1: Initial Setup and Imports

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

import numpy as np
import pandas as pd
import re
import string
import pickle
import nltk

# Download NLTK data directly.
# Colab environments are often fresh, so direct download is usually best.
print("Downloading NLTK resources...")
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') # This one was problematic for you locally, ensure it's downloaded

# ... (rest of your Cell 1 code, like GPU verification) ...
# Verify GPU
print("TensorFlow Version:", tf.__version__)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        print("GPU is available and being used.")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected. Falling back to CPU. Ensure GPU runtime is selected.")

Downloading NLTK resources...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


TensorFlow Version: 2.18.0
1 Physical GPUs, 1 Logical GPUs
GPU is available and being used.


In [3]:
# Cell 2: Data Preprocessing Functions (from sentiment_analysis/data_preprocessing.py)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    """
    Applies a series of text preprocessing steps.
    """
    text = text.lower() # Lowercasing
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text) # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    # Ensure word_tokenize is correctly handling potential non-string inputs if any
    if not isinstance(text, str):
        text = str(text) # Convert to string if not already
    words = word_tokenize(text)
    text = ' '.join([word for word in words if word not in stopwords.words('english')]) # Remove stopwords and tokenize
    return text

def load_and_preprocess_imdb_dataset(num_words=10000, max_len=256):
    """
    Loads the IMDb dataset, preprocesses text, and prepares sequences.
    """
    # Load IMDb dataset directly from Keras
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=num_words)

    word_index = tf.keras.datasets.imdb.get_word_index()
    # Invert word_index to map integers back to words for preprocessing
    # Need to adjust for padding (0), start (1), unknown (2) tokens
    index_to_word = {value: key for key, value in word_index.items()}
    index_to_word[0] = "<pad>"
    index_to_word[1] = "<start>"
    index_to_word[2] = "<unk>" # Used for out-of-vocabulary words
    index_to_word[3] = "<unused>" # Original IMDb dataset mapping

    # Convert integer sequences back to text for preprocessing
    # Adjusting indices by -3 to match standard IMDb dataset token mapping
    train_texts = [" ".join([index_to_word.get(i, "?") for i in review]) for review in x_train]
    test_texts = [" ".join([index_to_word.get(i, "?") for i in review]) for review in x_test]

    print("Preprocessing training texts...")
    # Use a progress bar for long operations
    from tqdm.notebook import tqdm
    processed_train_texts = [preprocess_text(text) for text in tqdm(train_texts)]
    print("Preprocessing testing texts...")
    processed_test_texts = [preprocess_text(text) for text in tqdm(test_texts)]

    tokenizer = Tokenizer(num_words=num_words, oov_token="<unk>")
    tokenizer.fit_on_texts(processed_train_texts)

    train_sequences = tokenizer.texts_to_sequences(processed_train_texts)
    test_sequences = tokenizer.texts_to_sequences(processed_test_texts)

    train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
    test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

    return train_padded, y_train, test_padded, y_test, tokenizer, max_len

In [4]:
# Cell 3: Model Definition Functions (from sentiment_analysis/model.py)

def build_lstm_model(vocab_size, embedding_dim=100, max_len=256):
    """
    Builds and compiles an LSTM model for sentiment classification.
    """
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
        Bidirectional(LSTM(units=128, return_sequences=True)), # Using Bidirectional LSTM for better context
        Dropout(0.3),
        LSTM(units=64),
        Dropout(0.3),
        Dense(1, activation='sigmoid') # Sigmoid for binary classification
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_model(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=32):
    """
    Trains the provided LSTM model.
    """
    print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val)
    )
    return history

def evaluate_model(model, X_test, y_test):
    """
    Evaluates the trained model on the test set.
    """
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"Test Loss: {loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")
    return loss, accuracy

In [6]:
# Cell 4: Model Training and Saving

# Configuration
VOCAB_SIZE = 10000
MAX_LEN = 256
EMBEDDING_DIM = 100
EPOCHS = 10
BATCH_SIZE = 64

# Define paths for saving model and tokenizer to Google Drive
# You'll need to mount Google Drive first (see next step)
MODEL_SAVE_PATH = '/content/gdrive/MyDrive/sentiment_lstm_model.h5'
TOKENIZER_SAVE_PATH = '/content/gdrive/MyDrive/tokenizer.pkl'

# Mount Google Drive (Run this cell first if you want to save/load from Drive)
from google.colab import drive
drive.mount('/content/gdrive')

print("Loading and preprocessing IMDb dataset...")
X_train, y_train, X_test, y_test, tokenizer, max_len = load_and_preprocess_imdb_dataset(
    num_words=VOCAB_SIZE, max_len=MAX_LEN
)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

print("Building LSTM model...")
model = build_lstm_model(VOCAB_SIZE, EMBEDDING_DIM, max_len)
model.summary()

print("Training model...")
history = train_model(model, X_train, y_train, X_test, y_test, epochs=EPOCHS, batch_size=BATCH_SIZE)

print("Evaluating model...")
evaluate_model(model, X_test, y_test)

print(f"Saving model to {MODEL_SAVE_PATH}...")
model.save(MODEL_SAVE_PATH)

print(f"Saving tokenizer to {TOKENIZER_SAVE_PATH}...")
with open(TOKENIZER_SAVE_PATH, 'wb') as f:
    pickle.dump(tokenizer, f)

print("Training and saving complete.")

Mounted at /content/gdrive
Loading and preprocessing IMDb dataset...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step
Preprocessing training texts...


  0%|          | 0/25000 [00:00<?, ?it/s]

Preprocessing testing texts...


  0%|          | 0/25000 [00:00<?, ?it/s]

X_train shape: (25000, 256)
y_train shape: (25000,)
X_test shape: (25000, 256)
y_test shape: (25000,)
Building LSTM model...




Training model...
Training on 25000 samples, validating on 25000 samples.
Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 66ms/step - accuracy: 0.4954 - loss: 0.6937 - val_accuracy: 0.5068 - val_loss: 0.6903
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 52ms/step - accuracy: 0.5296 - loss: 0.6832 - val_accuracy: 0.5172 - val_loss: 0.6890
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 64ms/step - accuracy: 0.5359 - loss: 0.6607 - val_accuracy: 0.7291 - val_loss: 0.5926
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 53ms/step - accuracy: 0.8025 - loss: 0.4583 - val_accuracy: 0.8209 - val_loss: 0.4058
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 53ms/step - accuracy: 0.9102 - loss: 0.2424 - val_accuracy: 0.8615 - val_loss: 0.3585
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 64ms/step - accuracy: 0.9477



Test Loss: 0.5944
Test Accuracy: 0.8406
Saving model to /content/gdrive/MyDrive/sentiment_lstm_model.h5...
Saving tokenizer to /content/gdrive/MyDrive/tokenizer.pkl...
Training and saving complete.


In [7]:
# Cell 5: Prediction Functions (from sentiment_analysis/predict.py)

try:
    model = load_model(MODEL_SAVE_PATH)
    with open(TOKENIZER_SAVE_PATH, 'rb') as f:
        tokenizer = pickle.load(f)
    print("Model and tokenizer loaded successfully from Google Drive.")
except Exception as e:
    print(f"Could not load model/tokenizer from Drive: {e}")
    print("Assuming model and tokenizer are still in memory from training.")
    # If not loaded from Drive, ensure 'model' and 'tokenizer' variables exist from Cell 4

MAX_SEQUENCE_LENGTH = MAX_LEN # Use the same max_len from training configuration

def predict_sentiment(text, model, tokenizer, max_len=MAX_SEQUENCE_LENGTH):
    """
    Predicts the sentiment of a given text.
    """
    if model is None or tokenizer is None:
        print("Model or tokenizer not loaded. Cannot predict.")
        return "Error: Model not loaded", 0.0

    # Preprocess the input text consistently
    processed_text = preprocess_text(text)

    # Convert text to sequence
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    # Predict
    prediction = model.predict(padded_sequence)[0][0]

    # Interpret prediction
    if prediction >= 0.5:
        return "Positive", prediction
    else:
        return "Negative", prediction



Model and tokenizer loaded successfully from Google Drive.


In [8]:
# Cell 6: Test Predictions

print("\n--- Sentiment Prediction Examples ---")

test_reviews = [
    "This movie was an absolute masterpiece! I loved every single moment.",
    "Utterly boring and a complete waste of my time. Don't bother watching.",
    "It had some good moments, but overall it was just okay.",
    "The acting was superb, but the plot was a bit confusing.",
    "Worst film of the year. Avoid at all costs."
]

for review in test_reviews:
    sentiment, probability = predict_sentiment(review, model, tokenizer)
    print(f"Review: \"{review}\"")
    print(f"Predicted Sentiment: {sentiment} (Probability: {probability:.4f})\n")


--- Sentiment Prediction Examples ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 277ms/step
Review: "This movie was an absolute masterpiece! I loved every single moment."
Predicted Sentiment: Positive (Probability: 0.9017)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Review: "Utterly boring and a complete waste of my time. Don't bother watching."
Predicted Sentiment: Positive (Probability: 0.9800)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Review: "It had some good moments, but overall it was just okay."
Predicted Sentiment: Positive (Probability: 0.9898)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Review: "The acting was superb, but the plot was a bit confusing."
Predicted Sentiment: Negative (Probability: 0.4797)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Review: "Worst film of the year. Avoid at all costs."
Predicted Sentiment: Positive (Probabilit