# Lab Assignment 4 - Text Classification with Deep Learning
# Author : Gahanesh Raavi
# ASU ID : 1234497630
# File Creation Date : 02/22/2025


In [19]:
import pandas as pd
import numpy as np
import os
import urllib.request
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
!pip install tensorflow
# Check if TensorFlow is installed
try:
    import tensorflow as tf
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, GRU, LSTM, Dense, Dropout
    from tensorflow.keras.optimizers import Adam
    print(f"TensorFlow version: {tf.__version__}")
except ModuleNotFoundError:
    print("TensorFlow is not installed. Please install it using 'pip install tensorflow'")

# Load dataset
file_path = "restaurant_reviews_az.csv"
df = pd.read_csv(file_path)

# Display first few rows
print(df.head())


TensorFlow version: 2.18.0
                review_id                 user_id             business_id  \
0  IVS7do_HBzroiCiymNdxDg  fdFgZQQYQJeEAshH4lxSfQ  sGy67CpJctjeCWClWqonjA   
1  QP2pSzSqpJTMWOCuUuyXkQ  JBLWSXBTKFvJYYiM-FnCOQ  3w7NRntdQ9h0KwDsksIt5Q   
2  oK0cGYStgDOusZKz9B1qug  2_9fKnXChUjC5xArfF8BLg  OMnPtRGmbY8qH_wIILfYKA   
3  E_ABvFCNVLbfOgRg3Pv1KQ  9MExTQ76GSKhxSWnTS901g  V9XlikTxq0My4gE8LULsjw   
4  Rd222CrrnXkXukR2iWj69g  LPxuausjvDN88uPr-Q4cQA  CA5BOxKRDPGJgdUQ8OUOpw   

   stars  useful  funny  cool  \
0      3       1      1     0   
1      5       1      1     1   
2      5       1      0     0   
3      5       0      0     0   
4      4       1      0     0   

                                                text                 date  
0  OK, the hype about having Hatch chili in your ...  2020-01-27 22:59:06  
1  Pandemic pit stop to have an ice cream.... onl...  2020-04-19 05:33:16  
2  I was lucky enough to go to the soft opening a...  2020-02-29 19:43:44  
3  I've

In [21]:
# Remove 3-star reviews
df = df[df['stars'] != 3]

# Create Sentiment column
df['Sentiment'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)

# Display updated dataset
print(df.head())

                review_id                 user_id             business_id  \
1  QP2pSzSqpJTMWOCuUuyXkQ  JBLWSXBTKFvJYYiM-FnCOQ  3w7NRntdQ9h0KwDsksIt5Q   
2  oK0cGYStgDOusZKz9B1qug  2_9fKnXChUjC5xArfF8BLg  OMnPtRGmbY8qH_wIILfYKA   
3  E_ABvFCNVLbfOgRg3Pv1KQ  9MExTQ76GSKhxSWnTS901g  V9XlikTxq0My4gE8LULsjw   
4  Rd222CrrnXkXukR2iWj69g  LPxuausjvDN88uPr-Q4cQA  CA5BOxKRDPGJgdUQ8OUOpw   
5  kx6O_lyLzUnA7Xip5wh2NA  YsINprB2G1DM8qG1hbrPUg  rViAhfKLKmwbhTKROM9m0w   

   stars  useful  funny  cool  \
1      5       1      1     1   
2      5       1      0     0   
3      5       0      0     0   
4      4       1      0     0   
5      1       0      0     0   

                                                text                 date  \
1  Pandemic pit stop to have an ice cream.... onl...  2020-04-19 05:33:16   
2  I was lucky enough to go to the soft opening a...  2020-02-29 19:43:44   
3  I've gone to claim Jumpers all over the US and...  2020-03-14 21:47:07   
4  If you haven't been  to May

In [23]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['review_id'], df['Sentiment'], test_size=0.2, random_state=42, stratify=df['Sentiment'])

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 35274, Testing samples: 8819


In [25]:
# Download GloVe embeddings
glove_file = "glove.6B.100d.txt"
glove_zip = "glove.6B.zip"

if not os.path.exists(glove_file):
    print("Downloading GloVe embeddings...")
    urllib.request.urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", glove_zip)

    # Extract the zip file
    with zipfile.ZipFile(glove_zip, "r") as zip_ref:
        zip_ref.extractall()

# Load GloVe embeddings
print("Loading GloVe embeddings...")
embeddings_index = {}
with open(glove_file, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Ensure Tokenizer is imported correctly
try:
    tokenizer = Tokenizer()
except NameError:
    from tensorflow.keras.preprocessing.text import Tokenizer
    tokenizer = Tokenizer()

# Tokenization and Padding
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(seq) for seq in X_train_seq)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Create embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print("GloVe embeddings and Tokenizer setup completed successfully.")


Loading GloVe embeddings...
GloVe embeddings and Tokenizer setup completed successfully.


In [27]:
# Build GRU Model
model_gru = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    GRU(128, return_sequences=True),
    GRU(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model_gru.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
history_gru = model_gru.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluate model
test_loss, test_acc = model_gru.evaluate(X_test_pad, y_test)
print(f"GRU Model Test Accuracy: {test_acc:.4f}")

Epoch 1/5




[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7163 - loss: 0.6060 - val_accuracy: 0.7207 - val_loss: 0.5933
Epoch 2/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7219 - loss: 0.5949 - val_accuracy: 0.7207 - val_loss: 0.5933
Epoch 3/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7175 - loss: 0.5968 - val_accuracy: 0.7207 - val_loss: 0.5927
Epoch 4/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7198 - loss: 0.5941 - val_accuracy: 0.7207 - val_loss: 0.5941
Epoch 5/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7203 - loss: 0.5926 - val_accuracy: 0.7207 - val_loss: 0.5934
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7211 - loss: 0.5930
GRU Model Test Accuracy: 0.7207


In [29]:
# Build LSTM Model
model_lstm = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model_lstm.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
history_lstm = model_lstm.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluate model
test_loss, test_acc = model_lstm.evaluate(X_test_pad, y_test)
print(f"LSTM Model Test Accuracy: {test_acc:.4f}")

Epoch 1/5




[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7161 - loss: 0.6055 - val_accuracy: 0.7207 - val_loss: 0.5928
Epoch 2/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7213 - loss: 0.5951 - val_accuracy: 0.7207 - val_loss: 0.5929
Epoch 3/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7216 - loss: 0.5942 - val_accuracy: 0.7207 - val_loss: 0.5932
Epoch 4/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7217 - loss: 0.5928 - val_accuracy: 0.7207 - val_loss: 0.5926
Epoch 5/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7194 - loss: 0.5938 - val_accuracy: 0.7207 - val_loss: 0.5922
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7211 - loss: 0.5920
LSTM Model Test Accuracy: 0.7207


In [31]:
# Build trainable GRU Model
model_gru_trainable = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=True),
    GRU(128, return_sequences=True),
    GRU(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile and train
model_gru_trainable.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
history_gru_trainable = model_gru_trainable.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluate model
test_loss, test_acc = model_gru_trainable.evaluate(X_test_pad, y_test)
print(f"Trainable GRU Model Test Accuracy: {test_acc:.4f}")

Epoch 1/5




[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.7179 - loss: 0.6038 - val_accuracy: 0.7207 - val_loss: 0.5923
Epoch 2/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.9115 - loss: 0.2230 - val_accuracy: 0.6858 - val_loss: 0.7181
Epoch 3/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.9890 - loss: 0.0413 - val_accuracy: 0.7157 - val_loss: 1.9025
Epoch 4/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.9995 - loss: 0.0029 - val_accuracy: 0.6932 - val_loss: 1.0821
Epoch 5/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.9998 - loss: 0.0013 - val_accuracy: 0.7186 - val_loss: 3.4729
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7195 - loss: 3.4556  
Trainable GRU Model Test Accuracy: 0.7186


In [33]:
# Build trainable LSTM Model
model_lstm_trainable = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=True),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile and train
model_lstm_trainable.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
history_lstm_trainable = model_lstm_trainable.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

# Evaluate model
test_loss, test_acc = model_lstm_trainable.evaluate(X_test_pad, y_test)
print(f"Trainable LSTM Model Test Accuracy: {test_acc:.4f}")

Epoch 1/5




[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.7200 - loss: 0.6028 - val_accuracy: 0.7207 - val_loss: 0.5931
Epoch 2/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 13ms/step - accuracy: 0.9228 - loss: 0.1883 - val_accuracy: 0.7128 - val_loss: 2.2569
Epoch 3/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.9892 - loss: 0.0424 - val_accuracy: 0.2882 - val_loss: 1.5612
Epoch 4/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 0.9998 - loss: 0.0017 - val_accuracy: 0.7172 - val_loss: 2.0573
Epoch 5/5
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 12ms/step - accuracy: 1.0000 - loss: 1.9603e-04 - val_accuracy: 0.7045 - val_loss: 2.6228
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7045 - loss: 2.6191
Trainable LSTM Model Test Accuracy: 0.7045


In [35]:
# Select the best model (based on accuracy)
best_model = max(
    [("GRU", model_gru), ("LSTM", model_lstm), 
     ("Trainable GRU", model_gru_trainable), ("Trainable LSTM", model_lstm_trainable)], 
    key=lambda x: x[1].evaluate(X_test_pad, y_test)[1]
)

print(f"Best Model: {best_model[0]}")

[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7211 - loss: 0.5930 
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7211 - loss: 0.5920
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7195 - loss: 3.4556  
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7045 - loss: 2.6191
Best Model: GRU


# Comparision 
Comparing LSTM and GRU Models: Observations & Insights

When comparing LSTM and GRU models for sentiment classification, a few key differences and patterns emerge. Both models are widely used for text-based tasks, but their performance can vary based on different factors.

Performance and Accuracy

GRU and LSTM often achieve similar accuracy, but the LSTM model is better at handling longer text dependencies due to its more complex structure.

GRU, being a simpler model, tends to converge faster and performs just as well on smaller datasets.

Training Speed and Efficiency

GRU trains faster since it has fewer parameters, making it a good choice when computational efficiency is important.

LSTM can take longer to train, but it might generalize slightly better, especially for datasets with longer sequences.

Effect of Pre-Trained GloVe Embeddings

Using pre-trained embeddings like GloVe significantly improves both models by helping them understand the semantic meaning of words right from the start.

While trainable embeddings give the model more flexibility, they require more data and training time to be effective.

Final Thoughts: Which One to Choose?

If speed and efficiency matter more, GRU is the better option.

If understanding long-term dependencies is crucial, LSTM might perform better.

The best choice ultimately depends on the size of the dataset, available computational resources, and the specific nature of the task.

Both models have their strengths, and experimenting with both often helps in choosing the right one for a particular project!



# Acknowledgement 
I acknowledge that I have taken the help of ChatGPT in completing this assignment. I confirm that I have not used any other generative AI tools or external assistance beyond this interaction. All the work and content presented are my own, with guidance from ChatGPT where necessary.

In [42]:
import nbformat
from nbconvert import HTMLExporter

# Convert the current Jupyter Notebook to HTML
notebook_filename = "LA4_Raavi_Gahanesh.ipynb"  # Update with the actual notebook filename
html_filename = "LA4_Raavi_Gahanesh_output.html"

# Load the notebook
with open(notebook_filename, "r", encoding="utf-8") as f:
    notebook_content = nbformat.read(f, as_version=4)

# Convert to HTML
html_exporter = HTMLExporter()
html_data, _ = html_exporter.from_notebook_node(notebook_content)

# Save the HTML output
with open(html_filename, "w", encoding="utf-8") as f:
    f.write(html_data)

print(f"Notebook successfully converted to HTML: {html_filename}")


Notebook successfully converted to HTML: LA4_Raavi_Gahanesh_output.html
