In [None]:
pip install tensorflow

In [1]:
import tensorflow as tf




**TensorFlow** is a deep learning/machine learning library developed by Google

In [2]:
import pandas as pd
import glob

import numpy as np                  # Library for mathematical operations
from nltk.corpus import stopwords   # Import list of English stopwords (common words like 'the', 'is')

from tensorflow.keras.preprocessing.text import Tokenizer  # Tool to convert text to integer indices
from tensorflow.keras.preprocessing.sequence import pad_sequences   # Padding/truncating function to match sequence lengths
from tensorflow.keras.models import Sequential      # Create deep learning model structure
from tensorflow.keras.layers import Embedding, LSTM, Dense # Embedding, LSTM, Dense layers used in the model
from tensorflow.keras.callbacks import ModelCheckpoint    # Callback to save the model during training
from tensorflow.keras.models import load_model    # Load a saved model
import re    # Library for processing regular expressions

# Data Preparation

### Positive Data

In [3]:
# train
pos_train_review=(glob.glob(r"...\aclImdb\train\pos\*.txt"))
# test
pos_test_review=(glob.glob(r"...\aclImdb\test\pos\*.txt"))

In [4]:
# train
lines_train_pos = []
for i in pos_train_review:
    try:
        f = open(i, 'r', encoding='latin-1')
        temp = f.readlines()[0]
        lines_train_pos.append(temp)
        f.close()
    except Exception as e:
        continue            
        
# test
lines_test_pos = []
for i in pos_test_review:
    try:
        f = open(i, 'r', encoding='latin-1')
        temp = f.readlines()[0]
        lines_test_pos.append(temp)
        f.close()
    except Exception as e:
        continue    

In [5]:
# Convert list to DataFrame
lines_train_pos = pd.DataFrame(lines_train_pos, columns=['content'])
lines_test_pos = pd.DataFrame(lines_test_pos, columns=['content'])

In [6]:
lines_train_pos

Unnamed: 0,content
0,Bromwell High is a cartoon comedy. It ran at t...
1,Homelessness (or Houselessness as George Carli...
2,Brilliant over-acting by Lesley Ann Warren. Be...
3,This is easily the most underrated film inn th...
4,This is not the typical Mel Brooks film. It wa...
...,...
12495,"Seeing as the vote average was pretty low, and..."
12496,"The plot had some wretched, unbelievable twist..."
12497,I am amazed at how this movie(and most others ...
12498,A Christmas Together actually came before my t...


### Labeling Positive Data

In [7]:
# train
lines_train_pos['label'] = 1 # Here, 1 represents Positive

# test
lines_test_pos['label'] = 1

In [8]:
lines_train_pos

Unnamed: 0,content,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
12495,"Seeing as the vote average was pretty low, and...",1
12496,"The plot had some wretched, unbelievable twist...",1
12497,I am amazed at how this movie(and most others ...,1
12498,A Christmas Together actually came before my t...,1


### Negative data

In [9]:
#train
neg_train_review=(glob.glob(r"...\aclImdb\train\neg\*.txt"))
#test
neg_test_review=(glob.glob(r"...\aclImdb\test\neg\*.txt"))

In [10]:
# train
lines_train_neg = []
for i in neg_train_review:
    try:
        f = open(i, 'r', encoding='latin-1')
        temp = f.readlines()[0]
        lines_train_neg.append(temp)
        f.close()
    except Exception as e:
        continue        
        
        
# test
lines_test_neg = []
for i in neg_test_review:
    try:
        f = open(i, 'r', encoding='latin-1')
        temp = f.readlines()[0]
        lines_test_neg.append(temp)
        f.close()
    except Exception as e:
        continue   

In [11]:
lines_train_neg = pd.DataFrame(lines_train_neg, columns=['content'])
lines_test_neg = pd.DataFrame(lines_test_neg, columns=['content'])

In [12]:
# train
lines_train_neg['label'] = 0 # Here, 0 represents Negative

# test
lines_test_neg['label'] = 0

In [13]:
lines_train_neg

Unnamed: 0,content,label
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0
...,...,...
12495,"Towards the end of the movie, I felt it was to...",0
12496,This is the kind of movie that my enemies cont...,0
12497,I saw 'Descent' last night at the Stockholm Fi...,0
12498,Some films that you pick up for a pound turn o...,0


### Merge positive/negative data

In [14]:
total_train_text=pd.concat([lines_train_pos,lines_train_neg],axis=0)
total_test_text=pd.concat([lines_test_pos,lines_test_neg],axis=0)

- axis=0 → combine DataFrames vertically by adding more rows.
This stacks one dataset under the other.

- axis=1 → combine DataFrames horizontally by adding more columns.

So here, using axis=0 means:
We will place the negative reviews below the positive reviews, forming one unified dataset.

In [15]:
print(total_train_text.head(5))
print(total_test_text.tail(5))

                                             content  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1
                                                 content  label
12495  I occasionally let my kids watch this garbage ...      0
12496  When all we have anymore is pretty much realit...      0
12497  The basic genre is a thriller intercut with an...      0
12498  Four things intrigued me as to this film - fir...      0
12499  David Bryce's comments nearby are exceptionall...      0


### Data Preprocessing

In [16]:
# Define Stopwords
english_stops = set(stopwords.words('english'))

In [17]:
# Split Input (X) and Target (y)

# train
X_train_data = total_train_text['content']
y_train_data = total_train_text['label']

# test
X_test_data = total_test_text['content']
y_test_data = total_test_text['label']

In [18]:
# Text Cleaning

# train 
X_train_data = X_train_data.replace({'<.*?>': ''}, regex=True)          # Remove HTML tags
X_train_data = X_train_data.replace({'[^A-Za-z]': ' '}, regex=True)     # Remove non-alphabet characters (numbers, special chars)
X_train_data = X_train_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # Remove stopwords
X_train_data = X_train_data.apply(lambda review: [w.lower() for w in review])   # Convert to lowercase

# test 
X_test_data = X_test_data.replace({'<.*?>': ''}, regex=True)          
X_test_data = X_test_data.replace({'[^A-Za-z]': ' '}, regex=True)      
X_test_data = X_test_data.apply(lambda review: [w for w in review.split() if w not in english_stops]) 
X_test_data = X_test_data.apply(lambda review: [w.lower() for w in review])

- Garbage In, Garbage Out: If data is not clean, model performance drops.
- HTML tags and special characters are noise irrelevant to sentiment analysis, so they must be removed.

In [20]:
print(X_train_data.head(5))
print(X_test_data.head(5))

0    [bromwell, high, cartoon, comedy, it, ran, tim...
1    [homelessness, houselessness, george, carlin, ...
2    [brilliant, acting, lesley, ann, warren, best,...
3    [this, easily, underrated, film, inn, brooks, ...
4    [this, typical, mel, brooks, film, it, much, l...
Name: content, dtype: object
0    [i, went, saw, movie, last, night, coaxed, fri...
1    [actor, turned, director, bill, paxton, follow...
2    [as, recreational, golfer, knowledge, sport, h...
3    [i, saw, film, sneak, preview, delightful, the...
4    [bill, paxton, taken, true, story, us, golf, o...
Name: content, dtype: object


In [21]:
print(y_train_data.tail(5))
print(y_test_data.tail(5))

12495    0
12496    0
12497    0
12498    0
12499    0
Name: label, dtype: int64
12495    0
12496    0
12497    0
12498    0
12499    0
Name: label, dtype: int64


**Calculate Review Max Length**: To feed data into a Deep Learning model, all reviews must be the same length

In [22]:
# Function to calculate average review length from train data -> inputs must be identical in size

def get_max_length():
    review_length = []
    for review in X_train_data:
        review_length.append(len(review))

    # If we set it to the longest review, we get too many 0s (sparse), reducing efficiency. 
    # So we use the average (np.mean) and round up (np.ceil).
    return int(np.ceil(np.mean(review_length)))

### Tokenization and Padding

In [23]:
# ENCODE REVIEW
token = Tokenizer(lower=False) # Create TensorFlow Tokenizer object -> lower=False because we already lowercased in preprocessing
 
token.fit_on_texts(X_train_data) # Scan all words in training data and assign unique indices based on frequency 

X_train_data = token.texts_to_sequences(X_train_data) # Convert text to integer sequences. Words are mapped to numbers.
X_test_data = token.texts_to_sequences(X_test_data)

max_length = get_max_length() # Call the function created above to get the standard length

X_train_data = pad_sequences(X_train_data, maxlen=max_length, padding='post', truncating='post')
X_test_data = pad_sequences(X_test_data, maxlen=max_length, padding='post', truncating='post')

- pad_sequences settings summary:
- maxlen: Unify all sequences to this length.
- padding='post': Fill 0s at the END if the sequence is shorter.
- truncating='post': Cut off the END if the sequence is longer.

In [24]:
total_words = len(token.word_index) + 1   # Calculate total word count. Add 1 to account for the '0' padding index.

print('Encoded X Train\n', X_train_data, '\n')
print('Encoded X Test\n', X_test_data, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[21002   215   972 ...     0     0     0]
 [22360 46344   620 ...   247  8782  6754]
 [  433    42 16351 ...     0     0     0]
 ...
 [    1   124  4647 ...  1207  1200   228]
 [  394    36  1163 ...     0     0     0]
 [    8     5  6926 ...     0     0     0]] 

Encoded X Test
 [[    1   336   124 ...     0     0     0]
 [  177   581    71 ...  1935  4943   173]
 [  110 30661 45975 ...     0     0     0]
 ...
 [    2  1019   410 ... 34388  7464  1852]
 [  586    90  3620 ...     0     0     0]
 [  507   691  3928 ...     1   166    14]] 

Maximum review length:  132


### Encoded X Train / Encoded X Test
- Each row = One review
- Each number = Unique word Index -> Tokenizer scanned data and assigned IDs
- Why so many 0s at the end?

0 is the PAD token: Added to the end to make all lengths equal.

### Maximum review length: 132 -> Calculated by our get_max_length() function

- ! WARNING: Do not run this cell twice.
- The first run overwrites X_train_data (strings) with a padded integer array.
- If you run it again, Tokenizer.fit_on_texts() will fail because it expects strings, not integers.

# LSTM (Long Short-Term Memory)

LSTM is a type of Recurrent Neural Network (RNN) designed to learn long-range dependencies in sequence data.

In [26]:
import tensorflow as tf # Required for tf.where

In [27]:
EMBED_DIM = 32 # Dimension of the embedding vector (size of the meaning vector for each word)

LSTM_OUT = 64  # Number of neurons (memory cells) responsible for memory inside the LSTM layer

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length=max_length)) 
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid')) # Sigmoid: Outputs a value between 0~1, used for binary classification (probability)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 132, 32)           2374880   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2399777 (9.15 MB)
Trainable params: 2399777 (9.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [29]:
# Model Checkpoint: Save model during training
# save_best_only=True: Only overwrite the saved file if the model improves (higher accuracy)
checkpoint = ModelCheckpoint('models/LSTM2.h5', monitor='accuracy', save_best_only=True, verbose=1)

In [30]:
# Train the model
model.fit(X_train_data, y_train_data, batch_size=128, epochs=3, callbacks=[checkpoint])

Epoch 1/3


Epoch 1: accuracy improved from -inf to 0.64972, saving model to models\LSTM2.h5
Epoch 2/3
  2/196 [..............................] - ETA: 10s - loss: 0.4196 - accuracy: 0.8516

  saving_api.save_model(


Epoch 2: accuracy improved from 0.64972 to 0.90060, saving model to models\LSTM2.h5
Epoch 3/3
Epoch 3: accuracy improved from 0.90060 to 0.95756, saving model to models\LSTM2.h5


<keras.src.callbacks.History at 0x25ef5557bb0>

In [31]:
# Prediction and Result Verification
predictions = model.predict(X_test_data, batch_size=128)
threshold = 0.5 # Threshold line separating Positive and Negative
binary_predictions = tf.where(predictions > threshold, 1, 0) # If probability > 0.5, classify as 1 (Pos), else 0 (Neg)



In [32]:
# Create a table to visually compare Prediction vs Real Answer (Ground Truth)
binary_predictions = pd.DataFrame(binary_predictions.numpy(), columns=['predict'])
binary_predictions['real'] = y_test_data.values

print(binary_predictions.head()) # Display the comparison table

   predict  real
0        1     1
1        1     1
2        0     1
3        1     1
4        1     1


In [33]:
# Final Accuracy Evaluation
accuracy = model.evaluate(X_test_data, y_test_data, batch_size=128)
print(f"Test Accuracy: {accuracy[1]}")

Test Accuracy: 0.8368399739265442


# 3. GRU (Gated Recurrent Unit)

GRU is an RNN-based model. It is a Recurrent Neural Network structure 
that is simpler than LSTM but delivers similar performance.
- Gated: Uses a gate structure to control information flow
- Recurrent: Part of the RNN family processing sequential data (text, audio, time series)
- Unit: The basic repeating computational unit

In [34]:
from tensorflow.keras.layers import GRU, SimpleRNN
import tensorflow as tf
import pandas as pd

In [35]:
# Define GRU Model
model_gru = Sequential()
model_gru.add(Embedding(total_words, EMBED_DIM, input_length=max_length)) 
model_gru.add(tf.keras.layers.GRU(LSTM_OUT))  # Add GRU layer (This is the only line changed from the LSTM code)
model_gru.add(Dense(1, activation='sigmoid'))

model_gru.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model_gru.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 132, 32)           2374880   
                                                                 
 gru (GRU)                   (None, 64)                18816     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2393761 (9.13 MB)
Trainable params: 2393761 (9.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [36]:
# Train GRU
checkpoint_gru = ModelCheckpoint('models/GRU.h5', monitor='accuracy', save_best_only=True, verbose=1)
model_gru.fit(X_train_data, y_train_data, batch_size=128, epochs=5, callbacks=[checkpoint_gru])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.51588, saving model to models\GRU.h5
Epoch 2/5
  3/196 [..............................] - ETA: 9s - loss: 0.6893 - accuracy: 0.5469

  saving_api.save_model(


Epoch 2: accuracy improved from 0.51588 to 0.66088, saving model to models\GRU.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.66088 to 0.89044, saving model to models\GRU.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.89044 to 0.94888, saving model to models\GRU.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.94888 to 0.97664, saving model to models\GRU.h5


<keras.src.callbacks.History at 0x25efa6dd490>

In [37]:
# Predict with GRU
predictions_gru = model_gru.predict(X_test_data, batch_size=128)
binary_predictions_gru = tf.where(predictions_gru > threshold, 1, 0)



In [38]:
# Create comparison table
binary_predictions_gru = pd.DataFrame(binary_predictions_gru.numpy(), columns=['predict'])
binary_predictions_gru['real'] = y_test_data.values

In [39]:
# Display a slice of the data (middle section where positive meets negative)
print(binary_predictions_gru[len(lines_test_pos) - 5 : -len(lines_test_neg) + 5])

       predict  real
12495        1     1
12496        1     1
12497        1     1
12498        1     1
12499        1     1
12500        0     0
12501        0     0
12502        0     0
12503        1     0
12504        1     0


In [40]:
# Evaluate GRU
accuracy_gru = model_gru.evaluate(X_test_data, y_test_data, batch_size=128)
print(f"GRU Test Accuracy: {accuracy_gru[1]}")

GRU Test Accuracy: 0.8388400077819824


# 4. Simple RNN (Simple Recurrent Neural Network)

Simple RNN is the most basic form of a Recurrent Neural Network.
- Recurrent: Passes the hidden state from the previous time step to the next time step.
- Neural Network: Neural network-based structure.

In [41]:
# Define RNN Model
model_rnn = Sequential()
model_rnn.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model_rnn.add(tf.keras.layers.SimpleRNN(LSTM_OUT))  # Add Simple RNN layer
model_rnn.add(Dense(1, activation='sigmoid'))

In [42]:
model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model_rnn.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 132, 32)           2374880   
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                6208      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2381153 (9.08 MB)
Trainable params: 2381153 (9.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [43]:
# Train RNN
checkpoint_rnn = ModelCheckpoint('models/RNN.h5', monitor='accuracy', save_best_only=True, verbose=1)
model_rnn.fit(X_train_data, y_train_data, batch_size=128, epochs=5, callbacks=[checkpoint_rnn])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.50468, saving model to models\RNN.h5
Epoch 2/5
  7/196 [>.............................] - ETA: 3s - loss: 0.6739 - accuracy: 0.5904

  saving_api.save_model(


Epoch 2: accuracy improved from 0.50468 to 0.59424, saving model to models\RNN.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.59424 to 0.68904, saving model to models\RNN.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.68904 to 0.74268, saving model to models\RNN.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.74268 to 0.78000, saving model to models\RNN.h5


<keras.src.callbacks.History at 0x25ef58660a0>

In [44]:
# Predict with RNN
predictions_rnn = model_rnn.predict(X_test_data, batch_size=128)
binary_predictions_rnn = tf.where(predictions_rnn > threshold, 1, 0)



In [45]:
# Create comparison table
binary_predictions_rnn = pd.DataFrame(binary_predictions_rnn.numpy(), columns=['predict'])
binary_predictions_rnn['real'] = y_test_data.values

In [46]:
# Display slice at the boundary of positive and negative reviews
print(binary_predictions_rnn[len(lines_test_pos) - 5 : -len(lines_test_neg) + 5])

       predict  real
12495        1     1
12496        0     1
12497        0     1
12498        1     1
12499        0     1
12500        0     0
12501        0     0
12502        1     0
12503        0     0
12504        0     0


In [47]:
# Evaluate RNN
accuracy_rnn = model_rnn.evaluate(X_test_data, y_test_data, batch_size=128)
print(f"RNN Test Accuracy: {accuracy_rnn[1]}")

RNN Test Accuracy: 0.5064799785614014
