In [None]:
# Colab-only step: attach Google Drive so files under it can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Path of the review dump on the mounted Drive — replace with your actual file path.
file_path = '/content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Health_and_Personal_Care.jsonl'

# The file is JSON Lines (one JSON record per line), hence lines=True.
df = pd.read_json(path_or_buf=file_path, lines=True)
print("DataFrame Loaded Successfully!")

# Preview the first 5 rows; as the last expression it gets the rich table display.
df.head(n=5)


DataFrame Loaded Successfully!


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True


In [None]:
# Row count = number of reviews
print(df.shape[0])

# Column names available in the raw data
print(df.columns.tolist())

494121
['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


In [None]:
# How many reviews carry each star rating (most frequent first) — used to spot class imbalance.
rating_counts = df['rating'].value_counts()
rating_counts

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,301713
1,69564
4,57000
3,36949
2,28895


In [None]:
# Number of missing values per column.
df.isna().sum()

Unnamed: 0,0
rating,0
title,0
text,0
images,0
asin,0
parent_asin,0
user_id,0
timestamp,0
helpful_vote,0
verified_purchase,0


In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Preprocessing

In [None]:
# Keep only rows that have both a review body and a rating.
df = df.dropna(subset=['text', 'rating'])
print(f"\nDataFrame shape after dropping rows with missing text/rating: {df.shape}")

# Build a single text field per review: title, one space, body.
# Missing titles become '' first so the concatenation never yields NaN.
df = (
    df
    .assign(title=lambda frame: frame['title'].fillna(''))
    .assign(review_full=lambda frame: frame['title'] + ' ' + frame['text'])
)

# NLTK resources used by the cleaning step (tokenizer models, stop-word list, WordNet).
for nltk_resource in ('punkt_tab', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Shared by clean_text(): English stop-word set and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Normalise one raw review string into space-separated, lemmatized tokens.

    Steps:
      1. lowercase
      2. replace HTML tags with a space
      3. delete apostrophes (straight and curly) so contractions stay one token
      4. replace every remaining character that is not a-z or whitespace with a space
      5. tokenize with NLTK's word_tokenize
      6. drop English stop words and 1-character tokens, lemmatize the rest

    Tags and punctuation are replaced by a space rather than removed outright;
    removing them fused neighbouring words ("great.Love" -> "greatlove",
    "well-made" -> "wellmade"), producing tokens with no pre-trained vector.

    Args:
        text: raw review text.

    Returns:
        The cleaned text as a single string; '' when `text` is not a string
        (e.g. NaN), so such rows are caught by the later empty-review filter.
    """
    if not isinstance(text, str):
        return ''

    # 1. Lowercasing
    text = text.lower()
    # 2. HTML tags (if any) become a word boundary
    text = re.sub(r'<.*?>', ' ', text)
    # 3. Drop apostrophes without splitting the word: "didn't" -> "didnt"
    text = re.sub(r"['’]", '', text)
    # 4. Any other non-letter (digits, punctuation, symbols) becomes a word boundary
    text = re.sub(r'[^a-z\s]', ' ', text)
    # 5. Tokenization
    tokens = word_tokenize(text)
    # 6. Remove stop words / single characters, then lemmatize.
    # NOTE(review): the NLTK English stop-word list removes negations such as
    # "not" (visible in the sample output, where "Not for me" loses "not");
    # consider keeping them for rating prediction.
    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1
    ]
    # 7. Join back into string
    return ' '.join(cleaned_tokens)

print("\nStarting text cleaning (this may take a while)...")
# One Python call per review; if this is too slow, a parallel apply
# (e.g. pandarallel's parallel_apply) is a drop-in alternative.
df['review_cleaned'] = df['review_full'].map(clean_text)
print("Text cleaning completed.")

# Discard reviews with nothing left after cleaning: missing values first,
# then strings that are empty once surrounding whitespace is stripped.
df = df.dropna(subset=['review_cleaned'])
has_content = df['review_cleaned'].str.strip().ne('')
df = df[has_content]
print(f"DataFrame shape after cleaning and removing empty reviews: {df.shape}")


DataFrame shape after dropping rows with missing text/rating: (494121, 10)

Starting text cleaning (this may take a while)...
Text cleaning completed.
DataFrame shape after cleaning and removing empty reviews: (493820, 12)


In [None]:
# Spot-check the new review_full / review_cleaned columns on the first 5 rows.
df.head(n=5)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,review_full,review_cleaned
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True,12 mg is 12 on the periodic table people! Mg f...,mg periodic table people mg magnesium review c...
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True,Save the lanet using less plastic. Love these ...,save lanet using less plastic love easy multit...
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True,Fantastic I have been suffering a couple month...,fantastic suffering couple month heel pain pla...
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True,It holds the water and makes bubbles. That's ...,hold water make bubble thats bought cheap want...
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True,Not for me Didn't do a thing for me. Not sayin...,didnt thing saying dont


# Download GloVe embeddings if not already present

In [None]:
# Download and unzip GloVe if not already present
if not os.path.exists('glove.6B.zip'):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

--2025-04-12 20:46:43--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-04-12 20:46:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-04-12 20:46:44--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

# Load the GloVe embeddings into a dictionary

In [None]:
def load_glove_embeddings(glove_file_path):
    """
    Loads GloVe embeddings from a text file into a dictionary.

    Every non-blank line is expected to look like ``<word> <v1> <v2> ...``
    with whitespace-separated fields. Blank or whitespace-only lines (for
    example a trailing empty line) are skipped instead of raising IndexError.

    Args:
        glove_file_path: path of the embeddings file; it is read as UTF-8.

    Returns:
        A dict mapping 'word' -> embedding (as a float32 NumPy array).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be converted to a float.
    """
    embeddings_dict = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line — no word to index.
                continue
            word, *coefficients = values
            embeddings_dict[word] = np.asarray(coefficients, dtype='float32')
    return embeddings_dict

# Read the 200-dimensional GloVe vectors into memory.
# Point glove_file at another glove.6B.*d.txt file to use a different dimension.
glove_file = 'glove.6B.200d.txt'
embeddings_index = load_glove_embeddings(glove_file_path=glove_file)
print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")


Loaded 400000 word vectors from GloVe.


#  Train/Test Split

In [None]:
# Inputs are the cleaned review strings; targets are the star ratings (1-5).
X_texts = df['review_cleaned'].values
y = df['rating'].values

# 80/20 hold-out split, stratified on the rating so both parts keep the same
# class proportions; the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train_texts, X_test_texts, y_train_orig, y_test_orig = train_test_split(
    X_texts, y, **split_options
)

print("Number of training samples:", len(X_train_texts))
print("Number of testing samples:", len(X_test_texts))
# y_train_orig / y_test_orig are still on the original 1-5 scale at this point.

Number of training samples: 395056
Number of testing samples: 98764


# Import Keras/TensorFlow Components

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np # Ensure numpy is imported if not already done comprehensively

# Tokenization and Padding

In [None]:
# Vocabulary cap passed to the Keras Tokenizer (tune after inspecting vocabulary size).
MAX_NUM_WORDS = 20000
# Fixed number of tokens per review after padding/truncation (tune from review lengths).
MAX_SEQUENCE_LENGTH = 150

# Learn the word -> index mapping from the training split only, so no
# vocabulary information from the test split is used.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train_texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Texts -> lists of word indices.
X_train_sequences = tokenizer.texts_to_sequences(X_train_texts)
X_test_sequences = tokenizer.texts_to_sequences(X_test_texts)

# --- Padding ---
# Both splits share the same settings: pad and truncate at the end of the sequence.
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_train_padded = pad_sequences(X_train_sequences, **pad_options)
X_test_padded = pad_sequences(X_test_sequences, **pad_options)

print('Shape of training data tensor:', X_train_padded.shape)
print('Shape of testing data tensor:', X_test_padded.shape)

Found 149458 unique tokens.
Shape of training data tensor: (395056, 150)
Shape of testing data tensor: (98764, 150)


#  Create GloVe Embedding Matrix

In [None]:
# --- Prepare Embedding Matrix ---
# Must equal the dimension of the GloVe file that was loaded (glove.6B.200d.txt -> 200).
EMBEDDING_DIM = 200
# Number of rows in the matrix: +1 because tokenizer indices start from 1.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)

# Row i holds the GloVe vector of the word with tokenizer index i.
# Rows stay all-zero for words that GloVe does not contain.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

# Count hits while filling the matrix instead of inferring them afterwards from
# row sums (a real vector whose components sum to zero would be miscounted).
found_count = 0
for word, i in word_index.items():
    if i >= num_words:
        continue  # Index falls outside the matrix (word beyond the vocabulary limit)
    embedding_vector = embeddings_index.get(word)  # None when the word is not in GloVe
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        found_count += 1

print(f"Embedding Matrix shape: {embedding_matrix.shape}")
print(f"Found GloVe embeddings for {found_count} out of {num_words} words in the tokenizer vocabulary.")

Embedding Matrix shape: (20000, 200)
Found GloVe embeddings for 17205 out of 20000 words in the tokenizer vocabulary.


# Adjust Labels and Define num_classes

In [None]:
# Shift the ratings from 1-5 down to 0-4: the network works with
# zero-based class indices.
y_train, y_test = y_train_orig - 1, y_test_orig - 1

# The output layer gets one unit per distinct rating found in the training
# labels (5 for 1-5 star ratings). Override num_classes only if a different
# number of outputs is genuinely required.
num_classes = len(np.unique(y_train_orig))
print(f"Number of unique classes: {num_classes}")
print("Labels adjusted to 0-based indexing.")

# Side-by-side sanity check of the first few labels before/after the shift.
print("Sample original labels:", y_train_orig[:5])
print("Sample adjusted labels:", y_train[:5])

Number of unique classes: 5
Labels adjusted to 0-based indexing.
Sample original labels: [5 5 2 1 4]
Sample adjusted labels: [4 4 1 0 3]


# Define BiLSTM Model Architecture

In [None]:
# --- Build the BiLSTM Model ---

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Explicit input spec: replaces the deprecated Embedding(input_length=...)
    # argument and lets model.summary() report shapes before training.
    tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'),

    # Embedding layer initialised with the GloVe matrix and frozen.
    # A Constant initializer is used instead of the legacy `weights=[...]`
    # keyword, which is not accepted uniformly across Keras versions.
    Embedding(input_dim=num_words,        # Size of the vocabulary (rows of embedding_matrix)
              output_dim=EMBEDDING_DIM,   # Dimension of each word vector
              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
              trainable=False),           # Keep the pre-trained vectors fixed
    # NOTE(review): padding index 0 is not masked (mask_zero is left at its
    # default), so the LSTMs also step over the trailing padding — confirm
    # whether masking is wanted.

    # BiLSTM layer 1: 100 units per direction, full sequence passed on to the next LSTM.
    Bidirectional(LSTM(100, return_sequences=True)),
    Dropout(0.1),

    # BiLSTM layer 2: 200 units per direction, only the final output is returned.
    Bidirectional(LSTM(200)),
    Dropout(0.1),

    # Output layer: one softmax unit per rating class (num_classes, i.e. 5 for 1-5 stars).
    Dense(num_classes, activation='softmax'),
])

# Print model summary
model.summary()



# Compile the Model

In [None]:
# --- Compile the Model ---

# Adam with a fixed learning rate of 1e-4.
# The `decay` argument is intentionally not passed: it is deprecated for Adam in
# newer Keras versions, where decay is expressed through a learning-rate
# schedule instead. On an older TF/Keras where it still works, the equivalent
# would be Adam(learning_rate=1e-04, decay=0.01).
optimizer = Adam(learning_rate=1e-04)

# Sparse categorical cross-entropy because the labels are integer class ids (0-4),
# not one-hot vectors.
compile_options = dict(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)
model.compile(**compile_options)

print("Model compiled successfully.")

Model compiled successfully.


# Train the Model

In [None]:
# --- Train the Model ---

BATCH_SIZE = 200
EPOCHS = 10
# Share of the training data held back for per-epoch validation.
VALIDATION_FRACTION = 0.1

# Validation uses a slice of the *training* data, not the test split: the test
# set must stay unseen until the final evaluation, otherwise decisions made
# while watching val_loss/val_accuracy make the reported test metrics optimistic.
# Keras takes the validation slice from the end of the arrays before shuffling;
# the arrays come from train_test_split, which already shuffled them.
print("Starting model training...")
history = model.fit(X_train_padded, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=VALIDATION_FRACTION,
                    verbose=1)  # verbose=1 shows a progress bar per epoch, 2 one line per epoch

print("Model training completed.")

Starting model training...
Epoch 1/10
[1m1976/1976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 103ms/step - accuracy: 0.6771 - loss: 0.9291 - val_accuracy: 0.7425 - val_loss: 0.7196
Epoch 2/10
[1m1976/1976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 106ms/step - accuracy: 0.7471 - loss: 0.7045 - val_accuracy: 0.7566 - val_loss: 0.6779
Epoch 3/10
[1m1976/1976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 104ms/step - accuracy: 0.7577 - loss: 0.6710 - val_accuracy: 0.7615 - val_loss: 0.6574
Epoch 4/10
[1m1976/1976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 104ms/step - accuracy: 0.7656 - loss: 0.6472 - val_accuracy: 0.7631 - val_loss: 0.6564
Epoch 5/10
[1m1976/1976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 106ms/step - accuracy: 0.7698 - loss: 0.6328 - val_accuracy: 0.7668 - val_loss: 0.6439
Epoch 6/10
[1m1976/1976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 106ms/step - accuracy: 0.7713 - loss: 0.6272 - val_a

# Evaluate the Model

In [None]:
# --- Evaluate the Model ---

print("\nEvaluating model on the test set...")
loss, accuracy = model.evaluate(X_test_padded, y_test, batch_size=BATCH_SIZE, verbose=0)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

# --- Get Predictions and Detailed Report ---
print("\nGenerating predictions and classification report...")
y_pred_probs = model.predict(X_test_padded, batch_size=BATCH_SIZE)
y_pred_classes = np.argmax(y_pred_probs, axis=1)  # Class id with the highest probability

# y_test and y_pred_classes are both 0-based class ids; the report shows the
# original star ratings, i.e. class id + 1.
# Passing `labels` explicitly keeps the report rows and the matrix shape fixed
# at num_classes even if some class never occurs in y_test or the predictions
# (without it, classification_report raises when the number of classes present
# differs from len(target_names)).
class_ids = list(range(num_classes))
target_names = [str(class_id + 1) for class_id in class_ids]

print('\nClassification Report:')
print(classification_report(y_test, y_pred_classes,
                            labels=class_ids,
                            target_names=target_names,
                            zero_division=0))

print('\nConfusion Matrix:')
# Row/column k corresponds to class id k, i.e. rating k + 1 (rows: true, columns: predicted).
print(confusion_matrix(y_test, y_pred_classes, labels=class_ids))

# Cross-check: accuracy computed by scikit-learn should match model.evaluate.
sklearn_accuracy = accuracy_score(y_test, y_pred_classes)
print(f'\nSklearn Accuracy Score: {sklearn_accuracy:.4f}')


Evaluating model on the test set...
Test Loss: 0.6209
Test Accuracy: 0.7730

Generating predictions and classification report...
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 35ms/step

Classification Report:
              precision    recall  f1-score   support

           1       0.67      0.83      0.74     13909
           2       0.58      0.19      0.28      5779
           3       0.49      0.36      0.42      7387
           4       0.61      0.31      0.41     11395
           5       0.84      0.95      0.89     60294

    accuracy                           0.77     98764
   macro avg       0.64      0.53      0.55     98764
weighted avg       0.75      0.77      0.75     98764


Confusion Matrix:
[[11570   314   544   117  1364]
 [ 2651  1071   856   207   994]
 [ 1492   295  2655   853  2092]
 [  501   103   810  3571  6410]
 [ 1096    63   509  1145 57481]]

Sklearn Accuracy Score: 0.7730
