In [18]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
import nltk
from nltk.corpus import stopwords # to get collection of stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# importing the dataset
data = pd.read_csv(r"C:\Users\darsh\OneDrive\Desktop\Uni\Projects\AI\Sentimental-analysis-using-LSTM-master\IMDB-Dataset.csv")
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [20]:
english_stops = set(stopwords.words('english'))

In [21]:
def load_dataset():
    df = pd.read_csv(r"C:\Users\darsh\OneDrive\Desktop\Uni\Projects\AI\Sentimental-analysis-using-LSTM-master\IMDB-Dataset.csv")
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [22]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
10606    [tim, krabbe, praised, author, het, gouden, ei...
39200    [i, action, movie, fan, today, i, never, seen,...
4578     [i, loved, thing, the, wonderful, thing, pink,...
43343    [harem, suare, best, film, i, saw, year, bravo...
44342    [grabbed, attention, netflix, instant, play, h...
                               ...                        
23482    [sarah, plain, tall, winters, end, best, movie...
18017    [i, may, biased, i, author, novel, the, hungry...
7755     [beat, strong, deaf, mutants, like, rex, voorh...
21570    [dracula, epitome, painfully, cheesy, cinema, ...
22407    [recap, doctor, markov, developed, new, theory...
Name: review, Length: 40000, dtype: object 

9968     [i, saw, film, premier, sundance, since, ameri...
45430    [shocking, in, i, saw, jury, gagarin, alive, h...
8233     [this, movie, starts, showing, map, explaining...
41914    [the, book, gets, stars, probably, contains, s...
34620    [death, camp, opera, right, here, right, now, ...
 

In [23]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [24]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 1587 19462  5837 ...   594  1262 15804]
 [    1   115     3 ...     0     0     0]
 [    1   341    66 ...     0     0     0]
 ...
 [ 1351   469  5199 ...   140   209    44]
 [ 2397  9145  2179 ...     0     0     0]
 [11657   734 19402 ...   215   118 13888]] 

Encoded X Test
 [[   1  120    4 ... 9510 2967  121]
 [1451   50    1 ...  389   25  123]
 [   8    3  438 ...    0    0    0]
 ...
 [   2  754    3 ...    0    0    0]
 [   2  125 4135 ...  407  277  649]
 [   1   38   17 ...    0    0    0]] 

Maximum review length:  130


In [25]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2957888   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,982,785
Trainable params: 2,982,785
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [27]:
model.fit(x_train, y_train, batch_size = 128, epochs = 6, callbacks=[checkpoint])

Epoch 1/6
Epoch 1: accuracy improved from -inf to 0.75912, saving model to models\LSTM.h5
Epoch 2/6
Epoch 2: accuracy improved from 0.75912 to 0.92343, saving model to models\LSTM.h5
Epoch 3/6
Epoch 3: accuracy improved from 0.92343 to 0.95830, saving model to models\LSTM.h5
Epoch 4/6
Epoch 4: accuracy improved from 0.95830 to 0.97505, saving model to models\LSTM.h5
Epoch 5/6
Epoch 5: accuracy improved from 0.97505 to 0.98408, saving model to models\LSTM.h5
Epoch 6/6
Epoch 6: accuracy improved from 0.98408 to 0.98867, saving model to models\LSTM.h5


<keras.callbacks.History at 0x2498302ee80>

In [28]:
#evaluate our model
result = model.evaluate(x_train, y_train)



In [29]:
loaded_model = load_model('models/LSTM.h5')

In [30]:
review = str(input('Movie Review: '))

Movie Review: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is 

In [31]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  One of the other reviewers has mentioned that after watching just  Oz episode youll be hooked They are right as this is exactly what happened with mebr br The first thing that struck me about Oz was its brutality and unflinching scenes of violence which set in right from the word GO Trust me this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs sex or violence Its is hardcore in the classic use of the wordbr br It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary It focuses mainly on Emerald City an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda Em City is home to manyAryans Muslims gangstas Latinos Christians Italians Irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awaybr br I would say the main appeal of the show is due to the fact that it goes where other shows wouldnt

In [32]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[    5  1918  1003    65  2842   295  3155   110   111   541   519 36865
      2    23    66  3125  2842  5148 14821    60   476   172   111   558
     62  1672    46  7648  2231 10714     8    46  2410  5611  5134  1431
    278   476   744  3771   264   261 36865     7   337  2842 11463   259
  13730  6915  2484   954 52679     7  2712  1278 26245   424  4976  2557
   1116  6788  2872 12599   296 71373 16801   204  4936  2945   424   249
   8577 44404 13041  4889  7377  2322 22080 43537   229  8966  6975 12201
   8305 31266    43   132 36865     1    12    58   188  1231    46   590
    102   174   183 24964  2976   737    90  1238  3968  2424  1157   737
   1357   737 15769   867    93     2    23   295     1    51   120  3125
   1538  2184     1 20786    58     1  1472     1   194     1]]


In [33]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.97369325]]


In [34]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
