In [None]:
import re

import numpy as np
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from nltk.tokenize import word_tokenize
import nltk

# Fix random seeds for reproducibility.
# Seeding NumPy alone leaves TensorFlow's own generator (weight initialisation,
# dropout masks, batch shuffling) unseeded. set_random_seed seeds Python's
# `random`, NumPy and TensorFlow in a single call.
keras.utils.set_random_seed(7)


In [None]:
# Cap the vocabulary size handed to the IMDB loader, then unpack the two
# (reviews, labels) splits it returns.
top_words = 10000  # Adjust as needed
train_split, test_split = imdb.load_data(num_words=top_words)
X_train, y_train = train_split
X_test, y_test = test_split


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
# Inspect a sample review and its label
sample_index = 6
print('---review---')
print(X_train[sample_index])
print('---label---')
print(y_train[sample_index])

# Get word-to-index and index-to-word dictionaries
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}

# imdb.load_data() shifts every word index by `index_from` (default 3) and
# reserves the low ids for special tokens (0 = padding, 1 = start of review,
# 2 = out-of-vocabulary). Looking encoded ids up in the raw word index without
# undoing that shift decodes every token to the wrong word.
index_from = 3
special_tokens = {0: '<PAD>', 1: '<START>', 2: '<UNK>'}
decoded_review = [
    special_tokens[i] if i in special_tokens else id2word.get(i - index_from, '<UNK>')
    for i in X_train[sample_index]
]

# Print a review with words
print('---review with words---')
print(decoded_review)
print('---label---')
print(y_train[sample_index])

# Determine maximum and minimum review lengths
# NOTE(review): if X_train and X_test are NumPy object arrays of Python lists
# (what imdb.load_data appears to return -- confirm), `X_train + X_test` adds
# element-wise, i.e. it concatenates the i-th train review with the i-th test
# review instead of stacking the two splits. The values below would then be
# lengths of those combined pairs, not of individual reviews.
# Changing this alters the padded length the model is built with, so any
# inference code must be updated to pad to the same length.
max_review_length = max(len(seq) for seq in (X_train + X_test))
min_review_length = min(len(seq) for seq in (X_train + X_test))

print('Maximum review length:', max_review_length)
print('Minimum review length:', min_review_length)

# Pad sequences to a fixed length
# Both names are rebound to their padded versions, so re-running this cell
# operates on already-padded data rather than on the raw reviews.
X_train = pad_sequences(X_train, maxlen=max_review_length)
X_test = pad_sequences(X_test, maxlen=max_review_length)


---review---
[1, 6740, 365, 1234, 5, 1156, 354, 11, 14, 5327, 6638, 7, 1016, 2, 5940, 356, 44, 4, 1349, 500, 746, 5, 200, 4, 4132, 11, 2, 9363, 1117, 1831, 7485, 5, 4831, 26, 6, 2, 4183, 17, 369, 37, 215, 1345, 143, 2, 5, 1838, 8, 1974, 15, 36, 119, 257, 85, 52, 486, 9, 6, 2, 8564, 63, 271, 6, 196, 96, 949, 4121, 4, 2, 7, 4, 2212, 2436, 819, 63, 47, 77, 7175, 180, 6, 227, 11, 94, 2494, 2, 13, 423, 4, 168, 7, 4, 22, 5, 89, 665, 71, 270, 56, 5, 13, 197, 12, 161, 5390, 99, 76, 23, 2, 7, 419, 665, 40, 91, 85, 108, 7, 4, 2084, 5, 4773, 81, 55, 52, 1901]
---label---
1
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
---review with words---
['the', 'boiled', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'murdering', 'naschy', 'br', 'villain', 'and', 'suggestion', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'and', 'concentrates', 'concept', 'issue', 'skeptical', 'to', "god's", 'he'

In [None]:
# Assemble the network one layer at a time:
# embedding -> dropout -> LSTM -> dropout -> single sigmoid unit
embedding_vec_length = 128  # Adjust as needed
model = Sequential()
model.add(Embedding(top_words, embedding_vec_length, input_length=max_review_length))
model.add(Dropout(0.2))
model.add(LSTM(128))  # Adjust units as needed
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss, Adam optimizer, accuracy reported as the metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Fit for 20 epochs in batches of 64; the test split doubles as validation data
model.fit(X_train, y_train, batch_size=64, epochs=20, validation_data=(X_test, y_test))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2697, 128)         1280000   
                                                                 
 dropout (Dropout)           (None, 2697, 128)         0         
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1411713 (5.39 MB)
Trainable params: 1411713 (5.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/20
Epoch 2

<keras.src.callbacks.History at 0x7cc66191cd00>

In [None]:
# Score the model on the test split; with the compile() settings used here
# evaluate() yields the loss followed by the accuracy metric.
scores = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = scores
print("Accuracy: %.2f%%" % (test_accuracy * 100))



Accuracy: 85.22%


In [None]:
# Mount Google Drive so the notebook can read and write files under it
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Write the trained model to an HDF5 file under the mounted Drive folder
model_path = '/content/drive/MyDrive/Senti/sentiment_analysis_model_new.h5'
model.save(model_path)
print("Saved model to disk")

Saved model to disk


  saving_api.save_model(


In [None]:
# Read the saved model back from disk (rebinding `model`) and show its layers
model_path = '/content/drive/MyDrive/Senti/sentiment_analysis_model_new.h5'
model = load_model(model_path)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2697, 128)         1280000   
                                                                 
 dropout (Dropout)           (None, 2697, 128)         0         
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1411713 (5.39 MB)
Trainable params: 1411713 (5.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
def preprocess_text(text):
  """Normalise raw review text so its tokens can be looked up in the vocabulary.

  The text is lowercased, punctuation is replaced by spaces and runs of
  whitespace are collapsed to single spaces.

  Args:
    text: Raw review text.

  Returns:
    The cleaned text as a single space-separated string ("" for empty input).
  """
  # Lowercase the text
  text = text.lower()

  # Replace punctuation with spaces so "fantastic!" becomes "fantastic" and an
  # HTML "<br />" becomes the bare token "br". Substituting a space (rather
  # than deleting) keeps "well-made" as two words instead of fusing them.
  # Apostrophes are kept because the IMDB vocabulary holds words like "god's".
  text = re.sub(r"[^\w\s']", " ", text)

  # Collapse the whitespace left behind by the substitution
  return " ".join(text.split())


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_sentiment(text):
  """Classify a raw review string as "Positive" or "Negative".

  The text is encoded the same way imdb.load_data() encodes the training
  reviews with its default settings: a start token (1) is prepended, every
  word rank is shifted by 3, and words that are unknown or fall outside the
  model's vocabulary become the out-of-vocabulary token (2).

  Args:
    text: Raw review text.

  Returns:
    "Positive" if the model's output is above 0.5, otherwise "Negative".
  """
  # Defaults used by imdb.load_data(); ids 0-2 are reserved for special tokens.
  start_id, oov_id, index_from = 1, 2, 3

  processed_text = preprocess_text(text)
  word_to_id = imdb.get_word_index()

  # Read the vocabulary size and sequence length from the model itself so the
  # encoding always matches what the model was built with.
  vocab_size = model.layers[0].input_dim  # first layer is the Embedding
  max_review_length = model.input_shape[1]

  # Convert the processed text into a list of word IDs
  encoded = [start_id]
  for word in processed_text.split():
    rank = word_to_id.get(word)
    word_id = oov_id if rank is None else rank + index_from
    # Ids at or above the vocabulary cap have no embedding row
    if word_id >= vocab_size:
      word_id = oov_id
    encoded.append(word_id)

  # Pad the single sequence to the length the model expects
  x_test = pad_sequences([encoded], maxlen=max_review_length)

  # One sample in, one sigmoid score out
  prediction = model.predict(x_test)[0][0]

  return "Positive" if prediction > 0.5 else "Negative"


# Example usage
# Runs one hand-written review through predict_sentiment and prints the label
# it returns next to the input text.
text_to_review = "This movie was fantastic!"
sentiment = predict_sentiment(text_to_review)
print(f"Sentiment for '{text_to_review}': {sentiment}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
Sentiment for 'This movie was fantastic!': Negative
