In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Replace with your actual file path
file_path = '/content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Health_and_Personal_Care.jsonl'

# Load the .jsonl file into a pandas DataFrame
df = pd.read_json(file_path, lines=True)

# Display the first 5 rows of the DataFrame
print("DataFrame Loaded Successfully!")
df.head()


DataFrame Loaded Successfully!


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True


In [3]:
# number of reviews
print(len(df))

print(list(df.columns))

494121
['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


In [4]:
# count of all ratings to check if there's imbalance
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,301713
1,69564
4,57000
3,36949
2,28895


In [5]:
df.isnull().sum()

Unnamed: 0,0
rating,0
title,0
text,0
images,0
asin,0
parent_asin,0
user_id,0
timestamp,0
helpful_vote,0
verified_purchase,0


In [6]:
from google.colab import drive
import pandas as pd
import numpy as np
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Preprocessing

In [7]:
df.dropna(subset=['text', 'rating'], inplace=True)
print(f"\nDataFrame shape after dropping rows with missing text/rating: {df.shape}")

# Combine 'title' and 'text'
# Fill missing titles with an empty string BEFORE concatenation
df['title'] = df['title'].fillna('')
df['review_full'] = df['title'] + ' ' + df['text']

# Text Cleaning Setup
nltk.download('punkt_tab', quiet=True) # Download the specific resource needed
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # 1. Lowercasing
    text = text.lower()
    # 2. Remove HTML tags (if any)
    text = re.sub(r'<.*?>', '', text)
    # 3. Remove punctuation and special characters (keeping only letters and whitespace)
    text = re.sub(r'[^a-z\s]', '', text)
    # 4. Tokenization
    tokens = word_tokenize(text)
    # 5. Remove Stop Words and Lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1] # Keep words with length > 1
    # 6. Join back into string
    return ' '.join(cleaned_tokens)

print("\nStarting text cleaning (this may take a while)...")
# Apply cleaning function - Monitor progress if needed for large datasets
# Consider df['review_full'].parallel_apply(clean_text) using libraries like pandarallel if speed is critical
df['review_cleaned'] = df['review_full'].apply(clean_text)
print("Text cleaning completed.")

# Drop rows where cleaning might have resulted in empty strings
df.dropna(subset=['review_cleaned'], inplace=True)
df = df[df['review_cleaned'].str.strip() != ''] # Ensure no empty strings after cleaning
print(f"DataFrame shape after cleaning and removing empty reviews: {df.shape}")


DataFrame shape after dropping rows with missing text/rating: (494121, 10)

Starting text cleaning (this may take a while)...
Text cleaning completed.
DataFrame shape after cleaning and removing empty reviews: (493820, 12)


In [8]:
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,review_full,review_cleaned
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True,12 mg is 12 on the periodic table people! Mg f...,mg periodic table people mg magnesium review c...
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True,Save the lanet using less plastic. Love these ...,save lanet using less plastic love easy multit...
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True,Fantastic I have been suffering a couple month...,fantastic suffering couple month heel pain pla...
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True,It holds the water and makes bubbles. That's ...,hold water make bubble thats bought cheap want...
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True,Not for me Didn't do a thing for me. Not sayin...,didnt thing saying dont


# Download GloVe embeddings if not already present

In [9]:
# Download and unzip GloVe if not already present
if not os.path.exists('glove.6B.zip'):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

--2025-04-12 13:59:43--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-04-12 13:59:44--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-04-12 13:59:44--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

# Load the GloVe embeddings into a dictionary

In [10]:
def load_glove_embeddings(glove_file_path):
    """
    Loads GloVe embeddings from a file into a dictionary.
    Returns a dict mapping 'word' -> embedding (as a NumPy array).
    """
    embeddings_dict = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_dict[word] = vector
    return embeddings_dict

# Load GloVe 300d
glove_file = 'glove.6B.300d.txt'  # adjust if you want a different dimension
embeddings_index = load_glove_embeddings(glove_file)
print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")


Loaded 400000 word vectors from GloVe.


# Define a function to get the average embedding of each review

In [11]:
from nltk.tokenize import word_tokenize

def get_average_embedding(text, embeddings_dict, embedding_dim=300):
    """
    Compute the average word embedding for the given text.
    """
    tokens = word_tokenize(text.lower())
    valid_vectors = []

    for token in tokens:
        if token in embeddings_dict:  # If the token has a GloVe embedding
            valid_vectors.append(embeddings_dict[token])

    if len(valid_vectors) > 0:
        # Average of all word embeddings in the review
        return np.mean(valid_vectors, axis=0)
    else:
        # If none of the tokens have embeddings, return a zero vector
        return np.zeros(embedding_dim, dtype='float32')

def convert_texts_to_embeddings(texts, embeddings_dict, embedding_dim=300):
    """
    Converts a list/array of texts into a 2D NumPy array of shape (num_texts, embedding_dim).
    """
    embeddings = []
    for text in texts:
        emb = get_average_embedding(text, embeddings_dict, embedding_dim)
        embeddings.append(emb)
    return np.array(embeddings, dtype='float32')


# Create features (X) and labels (y), then split into train/test sets

In [12]:
# Extract features and labels
X_texts = df['review_cleaned'].values  # the preprocessed reviews
y = df['rating'].values       # the corresponding labels (sentiment or classes)

# Split the data
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X_texts,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Number of training samples:", len(X_train_texts))
print("Number of testing samples:", len(X_test_texts))


Number of training samples: 395056
Number of testing samples: 98764


# Convert train/test reviews to embeddings

In [13]:
# Convert text data to embedding vectors
X_train_embeddings = convert_texts_to_embeddings(X_train_texts, embeddings_index, 300)
X_test_embeddings  = convert_texts_to_embeddings(X_test_texts,  embeddings_index, 300)

print("X_train_embeddings shape:", X_train_embeddings.shape)
print("X_test_embeddings shape:", X_test_embeddings.shape)


X_train_embeddings shape: (395056, 300)
X_test_embeddings shape: (98764, 300)


# Train an LR model

In [15]:
from sklearn.linear_model import LogisticRegression
import time
from tqdm.notebook import tqdm

# Check if embeddings are available before training
if X_train_embeddings is not None and y_train is not None:
    # Define the Logistic Regression model with improved parameters for large datasets
    lr_model = LogisticRegression(
        C=10,                # Regularization strength (use 'l2' penalty by default for better performance on large datasets)
        penalty='l2',        # Use 'l2' regularization which is more stable and scalable for large datasets
        max_iter=1000,       # Maximum iterations for solver to converge
        solver='saga',       # 'saga' is efficient for large datasets and sparse matrices
        multi_class='auto',  # Automatically handle multiclass classification
        random_state=42,     # For reproducibility
        verbose=1,           # Show progress information from the solver
        n_jobs=-1            # Use all available CPU cores to speed up computation
    )

    print("Starting Logistic Regression model training...")
    start_time = time.time()

    # Train the model
    lr_model.fit(X_train_embeddings, y_train)

    end_time = time.time()
    training_time = end_time - start_time
    print(f"\nLogistic Regression training completed in {training_time:.2f} seconds.")

else:
    print("Cannot train model: Embeddings or labels are missing.")


Starting Logistic Regression model training...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 50 epochs took 258 seconds

Logistic Regression training completed in 258.63 seconds.


# Evaluate the model

In [16]:
# Evaluate the Model
if 'lr_model' in locals() and X_test_embeddings is not None and y_test is not None:
    print("\nEvaluating the Logistic Regression model...")

    # Predict on the test set
    y_pred = lr_model.predict(X_test_embeddings)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, zero_division=0) # Added zero_division
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"\nAccuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(class_report)
    print("\nConfusion Matrix:")
    print(conf_matrix)

else:
    print("\nCannot evaluate model: Model not trained or test data missing.")


Evaluating the Logistic Regression model...

Accuracy: 0.7080

Classification Report:
              precision    recall  f1-score   support

           1       0.60      0.66      0.63     13909
           2       0.53      0.12      0.19      5779
           3       0.42      0.14      0.21      7387
           4       0.54      0.12      0.20     11395
           5       0.75      0.95      0.84     60294

    accuracy                           0.71     98764
   macro avg       0.57      0.40      0.41     98764
weighted avg       0.67      0.71      0.65     98764


Confusion Matrix:
[[ 9224   255   336   162  3932]
 [ 2120   672   332   198  2457]
 [ 1542   183  1042   302  4318]
 [  798    60   389  1408  8740]
 [ 1679    95   386   555 57579]]
