In [None]:
import nltk
import numpy as np
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [None]:
# Fetch the NLTK movie_reviews corpus only when a local copy cannot be found.
corpus_name = 'movie_reviews'
try:
    nltk.data.find(f'corpora/{corpus_name}')
except LookupError:
    # The lookup failed, so download the corpus by its package name.
    nltk.download(corpus_name)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [None]:
# Echo the corpus reader object as the cell output to confirm it is available.
movie_reviews

<CategorizedPlaintextCorpusReader in '/root/nltk_data/corpora/movie_reviews'>

In [None]:
# One (token_list, category) tuple per review file, iterating category by category.
documents = [
    (list(movie_reviews.words(file_id)), label)
    for label in movie_reviews.categories()
    for file_id in movie_reviews.fileids(label)
]

In [None]:
# Preview one review: its first 20 tokens together with its category label.
# Slicing keeps the cell output short instead of echoing every token of the review.
(documents[2][0][:20], documents[2][1])

(['it',
  'is',
  'movies',
  'like',
  'these',
  'that',
  'make',
  'a',
  'jaded',
  'movie',
  'viewer',
  'thankful',
  'for',
  'the',
  'invention',
  'of',
  'the',
  'timex',
  'indiglo',
  'watch',
  '.',
  'based',
  'on',
  'the',
  'late',
  '1960',
  "'",
  's',
  'television',
  'show',
  'by',
  'the',
  'same',
  'name',
  ',',
  'the',
  'mod',
  'squad',
  'tells',
  'the',
  'tale',
  'of',
  'three',
  'reformed',
  'criminals',
  'under',
  'the',
  'employ',
  'of',
  'the',
  'police',
  'to',
  'go',
  'undercover',
  '.',
  'however',
  ',',
  'things',
  'go',
  'wrong',
  'as',
  'evidence',
  'gets',
  'stolen',
  'and',
  'they',
  'are',
  'immediately',
  'under',
  'suspicion',
  '.',
  'of',
  'course',
  ',',
  'the',
  'ads',
  'make',
  'it',
  'seem',
  'like',
  'so',
  'much',
  'more',
  '.',
  'quick',
  'cuts',
  ',',
  'cool',
  'music',
  ',',
  'claire',
  'dane',
  "'",
  's',
  'nice',
  'hair',
  'and',
  'cute',
  'outfits',
  ',',
  '

In [None]:
import random

# Shuffle in place with a fixed seed so the document order -- and therefore the
# seeded train/test split performed later -- is the same on every fresh run.
# A dedicated Random instance is used so the global generator is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)

In [None]:
# Re-join each review's tokens into one space-separated string.
texts = [" ".join(tokens) for tokens, _ in documents]
# Binary target: 1 for the 'pos' category, 0 for anything else.
labels = [int(sentiment == 'pos') for _, sentiment in documents]

In [None]:
# Show only the opening of the first few reviews; printing the whole list
# would dump the full text of every review into the cell output.
for preview in texts[:3]:
    print(preview[:200], '...')

In [None]:
# Show a short slice of the labels plus the class balance instead of
# printing every label in the dataset.
print(labels[:20])
print(f"positive: {sum(labels)} / {len(labels)}")

[1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 

In [None]:
# Hold out 20% of the reviews for testing; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Vocabulary cap passed to the tokenizer and fixed sequence length used for padding.
max_words = 10000
maxlen = 256

# The vocabulary is fitted on the training texts only; "<oov>" is the
# out-of-vocabulary token handed to the tokenizer.
tokenizer = Tokenizer(num_words=max_words, oov_token="<oov>")
tokenizer.fit_on_texts(x_train)

# Text -> integer id sequences for both splits.
x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad and truncate at the end of each sequence so every row has length maxlen.
pad_options = dict(maxlen=maxlen, padding='post', truncating='post')
x_train_padded = pad_sequences(x_train_sequences, **pad_options)
x_test_padded = pad_sequences(x_test_sequences, **pad_options)

# Convert the label lists to NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
embedding_dim = 64

# Embedding -> LSTM -> single-unit classifier for binary sentiment.
model = Sequential([
    # mask_zero=True marks index 0 (the value pad_sequences fills with) as padding,
    # so the LSTM skips the padded tail instead of processing it as real input.
    Embedding(max_words, embedding_dim, input_length=maxlen, mask_zero=True),
    LSTM(128),
    # Sigmoid keeps the output in (0, 1), which is what binary_crossentropy expects.
    # A ReLU output is unbounded above and exactly 0 below, which breaks that loss.
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split is also passed as validation data here, so the
# per-epoch val_* metrics are measured on the same rows that get evaluated afterwards.
history = model.fit(
    x_train_padded,
    y_train,
    epochs=5,
    validation_data=(x_test_padded, y_test),
)

Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 342ms/step - accuracy: 0.4889 - loss: 1.4669 - val_accuracy: 0.5025 - val_loss: 0.6973
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 336ms/step - accuracy: 0.6341 - loss: 0.6282 - val_accuracy: 0.5275 - val_loss: 0.6961
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 360ms/step - accuracy: 0.7468 - loss: 0.4985 - val_accuracy: 0.6200 - val_loss: 1.2794
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 331ms/step - accuracy: 0.9328 - loss: 0.3935 - val_accuracy: 0.6000 - val_loss: 5.3844
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 334ms/step - accuracy: 0.7863 - loss: 0.9234 - val_accuracy: 0.6075 - val_loss: 1.8381


In [None]:
# Score the trained model on the held-out split; verbose=0 silences the progress bar.
loss, accuracy = model.evaluate(x_test_padded, y_test, verbose=0)
# Report both metrics to four decimal places, one per line.
print(f"Test Loss: {loss:.4f}\nTest Accuracy: {accuracy:.4f}")

Test Loss: 1.8381
Test Accuracy: 0.6075


In [None]:
sample_text = "This movie is fantastic!"

# Encode and pad the sentence with the same arguments used for the train/test data.
sample_sequence = tokenizer.texts_to_sequences([sample_text])
sample_padded = pad_sequences(sample_sequence, maxlen=maxlen, padding='post', truncating='post')
prediction = model.predict(sample_padded)

# One input text was passed, so [0][0] picks out its single score.
score = prediction[0][0]
# 0.5 is the conventional midpoint cut-off for a binary score.
# NOTE(review): assumes the model emits a probability-like value in [0, 1] -- confirm.
sentiment = "Positive" if score > 0.5 else "Negative"

print(f"\nSample Text: '{sample_text}'")
print(f"Predicted Sentiment: {sentiment} (Confidence: {score:.4f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step

Sample Text: 'This movie is fantastic!'
Predicted Sentiment: Positive (Confidence: 6.8950)


In [None]:
sample_text = "This movie is worst"

# Encode and pad the sentence with the same arguments used for the train/test data.
sample_sequence = tokenizer.texts_to_sequences([sample_text])
sample_padded = pad_sequences(sample_sequence, maxlen=maxlen, padding='post', truncating='post')
prediction = model.predict(sample_padded)

# One input text was passed, so [0][0] picks out its single score.
score = prediction[0][0]
# 0.5 is the conventional midpoint cut-off for a binary score.
# NOTE(review): assumes the model emits a probability-like value in [0, 1] -- confirm.
sentiment = "Positive" if score > 0.5 else "Negative"

print(f"\nSample Text: '{sample_text}'")
print(f"Predicted Sentiment: {sentiment} (Confidence: {score:.4f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step

Sample Text: 'This movie is worst'
Predicted Sentiment: Positive (Confidence: 6.8950)
