In [1]:
import pandas as pd # to load and process dataset
import numpy as np #for mathematic equation
from nltk.corpus import stopwords # to get collection of stopwords
from sklearn.model_selection import train_test_split # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer # encode text to  int
from tensorflow.keras.preprocessing.sequence import pad_sequences #to do padding or truncating
from tensorflow.keras.models import Sequential #the linking of layers in model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of architecture
from tensorflow.keras.callbacks import ModelCheckpoint # save model
from tensorflow.keras.models import load_model #load saved model
import re # for using regular expression


In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [4]:
english_stops = set(stopwords.words('english'))

In [5]:
def load_dataset(x_data,y_data):

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data


In [6]:
x_data, y_data = load_dataset(df['review'],df['sentiment'])

  y_data = y_data.replace('negative', 0)


In [7]:
print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [8]:
x_train , x_test, y_train, y_test = train_test_split(x_data,y_data,test_size=0.2)
print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
20209    [real, life, husband, wife, paul, bettany, jen...
31071    [first, i, like, say, i, love, ladies, man, sk...
38516    [after, book, i, became, sad, i, watching, mov...
23406    [being, i, fan, snoop, dogg, actor, made, even...
47238    [these, days, asian, horror, films, among, bes...
                               ...                        
27390    [mel, brooks, really, outdid, hilarious, stand...
3425     [this, movie, good, great, good, it, based, on...
4794     [the, acting, sub, par, you, costas, mandalar,...
36786    [i, surprised, soderbergh, pressured, avoid, m...
32091    [whoever, likened, one, raiders, of, the, lost...
Name: review, Length: 40000, dtype: object 

25584    [oscar, caliber, performance, peter, falk, osc...
29693    [g, m, started, odd, couple, downstairs, man, ...
33819    [this, grainy, film, cult, following, one, wor...
28549    [the, plot, straightforward, old, man, living,...
24988    [lotsa, action, cheesy, love, story, unexpecte...
 

In [9]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))
    return int(np.ceil(np.mean(review_length)))

In [10]:
# encode review 
token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train,maxlen=max_length,padding='post', truncating='post')
x_test = pad_sequences(x_test,maxlen=max_length,padding='post', truncating='post')

total_words = len(token.word_index)+1

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   64    43   527 ...   116  4101 11542]
 [   23     1     6 ...   650   567  6841]
 [  298   173     1 ...     0     0     0]
 ...
 [    2    44  1300 ...     0     0     0]
 [    1   662  6967 ...     0     0     0]
 [ 2449 18418     5 ...    36 16487    11]] 

Encoded X Test
 [[  697  5106   147 ...     0     0     0]
 [ 1066  1630   565 ...     0     0     0]
 [    8  5185     4 ...     0     0     0]
 ...
 [  405 91149  3987 ...     0     0     0]
 [   55     1    40 ...   314  1673  1759]
 [  970   380     8 ...  1133   417    94]] 

Maximum review length:  130


In [11]:

# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=EMBED_DIM,input_shape = (max_length,)))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

  super().__init__(**kwargs)


None


In [12]:
checkpoint = ModelCheckpoint(
    'models/LSTM.keras',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [16]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step - accuracy: 0.5720 - loss: 0.6807
Epoch 1: accuracy did not improve from 0.70702
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 175ms/step - accuracy: 0.5720 - loss: 0.6806
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step - accuracy: 0.6609 - loss: 0.6119
Epoch 2: accuracy did not improve from 0.70702
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 188ms/step - accuracy: 0.6608 - loss: 0.6120
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.6617 - loss: 0.5907
Epoch 3: accuracy improved from 0.70702 to 0.72080, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 129ms/step - accuracy: 0.6619 - loss: 0.5905
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.7811 - loss: 0.4719


<keras.src.callbacks.history.History at 0x1970a6b6b70>

In [17]:
y_pred = (np.round(model.predict(x_test, batch_size = 128))).astype(int)
true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step
Correct Prediction: 8222
Wrong Prediction: 1778
Accuracy: 82.22


In [18]:
loaded_model = load_model('models/LSTM.keras', compile=False)

In [19]:
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
review = str(input('Movie Review: '))
review

'After a long time, Kannada FILM industry found a comedy family entertainer.Assurance made by Ganesh that, industry still alive. Thanks to team for making good movie.please see in theatre for beautiful music'

In [21]:

# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  After a long time Kannada FILM industry found a comedy family entertainerAssurance made by Ganesh that industry still alive Thanks to team for making good movieplease see in theatre for beautiful music
Filtered:  ['after long time kannada film industry found comedy family entertainerassurance made ganesh industry still alive thanks team making good movieplease see theatre beautiful music']


In [22]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[  298   104    10 67214     4  1395   162   109   136    24 55506  1395
     58  1035  1111   602   137     9    15  1602   221   115     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]


In [23]:
result = loaded_model.predict(tokenize_words)
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step
[[0.8806467]]


In [24]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
