#### HABUMUGISHA Emmanuel
###### 225229109

### PDL Lab16: Design of LSTM and GRU RNN for classification of IMDB reviews

In [1]:
#Step-1

In [2]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re
from tensorflow.keras.layers import Bidirectional

In [3]:
data = pd.read_csv('IMDB Dataset.csv')

print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [4]:
#Step-2

In [5]:
english_stops = set(stopwords.words('english'))

In [6]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [7]:
#Step-3

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
7484     [far, sprightly, less, stage, set, bound, gene...
11986    [a, strange, relationship, middle, aged, woman...
48721    [says, andy, nobody, gets, hurt, everybody, wi...
27173    [years, ago, i, caught, fairly, well, made, tv...
38456    [when, i, started, watching, i, instantly, not...
                               ...                        
4290     [i, found, little, gem, extra, feature, dvd, v...
13124    [a, year, old, kid, fed, parents, arguing, dec...
27777    [although, never, say, never, again, nsna, wea...
11601    [yes, i, watch, show, because, girlfriend, wat...
2468     [unbelievably, close, real, life, feelings, em...
Name: review, Length: 40000, dtype: object 

4871     [a, antique, shop, owner, nyc, played, joanne,...
4030     [watching, film, action, rather, waste, time, ...
22854    [a, wonderful, movie, anyone, growing, italian...
4805     [i, happened, den, morning, scene, ed, engaged...
13926    [this, interesting, project, could, quite, bri...
 

In [9]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [10]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[  131 18203   247 ...     4  1416   151]
 [   39   585   543 ...   993     8  3505]
 [  452  2012  1132 ...   415  1716  1518]
 ...
 [  164    42    57 ...     0     0     0]
 [  320     1    33 ...  1118     1    31]
 [ 3809   402    64 ...     0     0     0]] 

Encoded X Test
 [[   39 16916  1814 ...     0     0     0]
 [   65     4   114 ...     0     0     0]
 [   39   297     3 ...     0     0     0]
 ...
 [    8   975  6949 ...  2677   113    14]
 [ 3837   722     3 ...     0     0     0]
 [ 1820  1694   183 ...     0     0     0]] 

Maximum review length:  130


In [11]:
#Step-4

In [12]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2963616   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2992673 (11.42 MB)
Trainable params: 2992673 (11.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [13]:
model.fit(x_train, y_train, batch_size = 128, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1f318c39af0>

In [14]:
model.evaluate(x_test,y_test)



[0.5403304696083069, 0.8611999750137329]

In [15]:
#Step-5

In [16]:
# ARCHITECTURE
EMBED_DIM = 32

model1 = Sequential()
model1.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model1.add(LSTM(32))
model1.add(Dense(64, activation='relu'))
model1.add(Dense(64, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 130, 32)           2963616   
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense_2 (Dense)             (None, 64)                2112      
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2978273 (11.36 MB)
Trainable params: 2978273 (11.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [17]:
model1.fit(x_train, y_train, batch_size = 128, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1f318931c40>

In [None]:
#Step-6

In [20]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model2 = Sequential()
model2.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model2.add(Bidirectional(LSTM(LSTM_OUT)))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model2.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 130, 32)           2963616   
                                                                 
 bidirectional (Bidirection  (None, 128)               49664     
 al)                                                             
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3021601 (11.53 MB)
Trainable params: 3021601 (11.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [21]:
model2.fit(x_train, y_train, batch_size = 128)



<keras.src.callbacks.History at 0x1f32f00b250>