In [1]:
import numpy as np
import pandas as pd
import re
import nltk

In [2]:
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [5]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [7]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
x_data = data['review']
y_data = data['sentiment']

In [9]:
x_data

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...
49995,I thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...


In [10]:
x_data = x_data.replace({'<.*>':""}, regex = True)        # Remove html tags from review
tokenizer = RegexpTokenizer(r'\w+')                       # Tokenizing only words
x_lower = x_data.apply(lambda x : x.lower())
x_lower = x_data.apply(lambda x : re.sub(r"\d+","",x))    # Remove all numerical values from review
x_tokens = x_lower.apply(lambda x : tokenizer.tokenize(x))

In [11]:
x_tokens = x_tokens.apply(lambda tokens : [token for token in tokens if token.isalpha()])

In [12]:
stop_words = stopwords.words('english')
x_clean_tokens = x_tokens.apply(lambda tokens : [token for token in tokens if token not in stop_words])

In [13]:
x_clean_tokens

Unnamed: 0,review
0,"[One, reviewers, mentioned, watching, Oz, epis..."
1,"[A, wonderful, little, production, The, realis..."
2,"[I, thought, wonderful, way, spend, time, hot,..."
3,"[Basically, family, little, boy, Jake, thinks,..."
4,"[Petter, Mattei, Love, Time, Money, visually, ..."
...,...
49995,"[I, thought, movie, right, good, job, It, crea..."
49996,"[Bad, plot, bad, dialogue, bad, acting, idioti..."
49997,"[I, Catholic, taught, parochial, elementary, s..."
49998,"[I, going, disagree, previous, comment, side, ..."


In [14]:
y_data = y_data.replace('positive', 1)
y_data = y_data.replace('negative', 0)

  y_data = y_data.replace('negative', 0)


#### Split Data into Train - Test

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_clean_tokens, y_data, test_size = 0.2, random_state = 2)

#### Encoding Review

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import ModelCheckpoint

In [17]:
token = Tokenizer(lower = False)
token.fit_on_texts(x_train)
# Embeddings from input text data
x_train_seq = token.texts_to_sequences(x_train)
x_test_seq = token.texts_to_sequences(x_test)

In [18]:
# create a function for getting maximum review length
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))
    return int(np.ceil(np.mean(review_length)))

In [19]:
max_length = get_max_length()
x_train_final = pad_sequences(x_train_seq, maxlen = max_length, padding='post', truncating='post')
x_test_final = pad_sequences(x_test_seq, maxlen = max_length, padding='post', truncating='post')

In [20]:
x_train_final

array([[ 3739,   122,     4, ..., 15820,  8637,   133],
       [  515,   390,  1055, ...,     0,     0,     0],
       [12267, 12663,   190, ...,   366,   794,  4552],
       ...,
       [ 1256,   131,   636, ...,    76,   506,  1311],
       [ 9276,  3738, 32518, ..., 29424,    75, 27184],
       [    1,   201,  3968, ...,   304,  8356,  2085]], dtype=int32)

In [21]:
total_words = len(token.word_index) + 1

In [22]:
print(max_length)

77


#### Build LSTM Model

In [23]:
EMBED_DIM = 32      # Embedding size (Each setence will be having 32 length of vector)
LSTM_OUT = 64       # No. of Layers of LSTM Model
model = Sequential()
# Text data to vectorized form of data
model.add(Embedding(total_words, EMBED_DIM))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
print(model.summary())

None


In [24]:
filepath = "/content/LSTM.keras"
checkpoint = ModelCheckpoint(filepath, monitor = "accuracy", save_best_only = True, verbose = 1)
callbacks_list = [checkpoint]

In [25]:
model.fit(x_train_final, y_train, batch_size = 64, epochs = 20, callbacks = [checkpoint])

Epoch 1/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.6839 - loss: 0.5557
Epoch 1: accuracy improved from -inf to 0.77463, saving model to /content/LSTM.keras
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 75ms/step - accuracy: 0.6840 - loss: 0.5555
Epoch 2/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.9080 - loss: 0.2453
Epoch 2: accuracy improved from 0.77463 to 0.90425, saving model to /content/LSTM.keras
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 76ms/step - accuracy: 0.9080 - loss: 0.2453
Epoch 3/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.9510 - loss: 0.1419
Epoch 3: accuracy improved from 0.90425 to 0.94073, saving model to /content/LSTM.keras
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 78ms/step - accuracy: 0.9509 - loss: 0.1419
Epoch 4/20
[1m625/625[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7be02d01db50>

In [26]:
model.load_weights('/content/LSTM.keras')

In [27]:
y_pred = model.predict(x_test_final, batch_size = 64)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step


In [28]:
yhat = np.round(y_pred)

#### Model Evaluation

In [30]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_test, yhat)

array([[3946, 1033],
       [ 767, 4254]])

In [32]:
print(classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.84      0.79      0.81      4979
           1       0.80      0.85      0.83      5021

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000



#### Make New Prediction

In [33]:
review = str(input("Movie Review : "))

Movie Review : "Eternal Horizon" is a breathtaking sci-fi masterpiece that blends stunning visuals with a deeply emotional storyline. The film follows Dr. Lila Carter, a brilliant but conflicted scientist who embarks on a journey through time to prevent a cosmic catastrophe. The cinematography is mesmerizing, capturing both the vastness of space and the intimacy of human emotion. The performances, particularly by Emma Clarke in the lead role, are outstanding. The script is intelligent and thought-provoking, making you question the nature of destiny and sacrifice. With a gripping score and seamless special effects, "Eternal Horizon" is a must-watch for sci-fi lovers.


In [36]:
def review_sentiment(review):
    review_series = pd.Series(review)
    input_data = review_series.replace({'<.*>':""}, regex = True)       # remove html tags from review
    tokenizer = RegexpTokenizer(r"\w+")
    input_lower = input_data.apply(lambda x : x.lower())
    input_tokens = input_lower.apply(lambda x : re.sub(r"\d+","",x))     # remove all numerical value from reviews
    input_tokens = input_tokens.apply(lambda x : tokenizer.tokenize(x))
    input_tokens = input_tokens.apply(lambda tokens : [token for token in tokens if token.isalpha()])
    clean_tokens = input_tokens.apply(lambda tokens : [token for token in tokens if token not in stop_words])
    input_tokens_seq = token.texts_to_sequences(input_tokens)
    final_input = pad_sequences(input_tokens_seq, maxlen = max_length, padding = 'post', truncating = 'post')
    pred_output = model.predict(final_input)
    pred_output = np.round(pred_output)
    return pred_output

In [37]:
review_sentiment(review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step


array([[1.]], dtype=float32)

In [38]:
review = str(input("Movie Review : "))

Movie Review : "Shadow's Revenge" tries to be a gripping thriller but ends up being a convoluted mess. The plot revolves around a detective investigating a series of supernatural murders, but the story is riddled with clichés and incoherent twists. The pacing is sluggish, making it hard to stay engaged, and the dialogue feels unnatural. Despite an interesting premise, the film fails to build tension or deliver any real surprises. The visual effects are unimpressive, and the acting feels forced, with the protagonist lacking any real charisma. Overall, "Shadow's Revenge" is a forgettable film that fails to leave an impact.


In [39]:
review_sentiment(review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step


array([[0.]], dtype=float32)