In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Replace with your actual file path
file_path = '/content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Health_and_Personal_Care.jsonl'

# Load the .jsonl file into a pandas DataFrame
df = pd.read_json(file_path, lines=True)

# Display the first 5 rows of the DataFrame
print("DataFrame Loaded Successfully!")
df.head()


DataFrame Loaded Successfully!


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True


In [3]:
# number of reviews
print(len(df))

print(list(df.columns))

494121
['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


In [4]:
# count of all ratings to check if there's imbalance
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,301713
1,69564
4,57000
3,36949
2,28895


In [5]:
df.isnull().sum()

Unnamed: 0,0
rating,0
title,0
text,0
images,0
asin,0
parent_asin,0
user_id,0
timestamp,0
helpful_vote,0
verified_purchase,0


In [6]:
from google.colab import drive
import pandas as pd
import numpy as np
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Preprocessing

In [7]:
df.dropna(subset=['text', 'rating'], inplace=True)
print(f"\nDataFrame shape after dropping rows with missing text/rating: {df.shape}")

# Combine 'title' and 'text'
# Fill missing titles with an empty string BEFORE concatenation
df['title'] = df['title'].fillna('')
df['review_full'] = df['title'] + ' ' + df['text']

# Text Cleaning Setup
nltk.download('punkt_tab', quiet=True) # Download the specific resource needed
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # 1. Lowercasing
    text = text.lower()
    # 2. Remove HTML tags (if any)
    text = re.sub(r'<.*?>', '', text)
    # 3. Remove punctuation and special characters (keeping only letters and whitespace)
    text = re.sub(r'[^a-z\s]', '', text)
    # 4. Tokenization
    tokens = word_tokenize(text)
    # 5. Remove Stop Words and Lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1] # Keep words with length > 1
    # 6. Join back into string
    return ' '.join(cleaned_tokens)

print("\nStarting text cleaning (this may take a while)...")
# Apply cleaning function - Monitor progress if needed for large datasets
# Consider df['review_full'].parallel_apply(clean_text) using libraries like pandarallel if speed is critical
df['review_cleaned'] = df['review_full'].apply(clean_text)
print("Text cleaning completed.")

# Drop rows where cleaning might have resulted in empty strings
df.dropna(subset=['review_cleaned'], inplace=True)
df = df[df['review_cleaned'].str.strip() != ''] # Ensure no empty strings after cleaning
print(f"DataFrame shape after cleaning and removing empty reviews: {df.shape}")


DataFrame shape after dropping rows with missing text/rating: (494121, 10)

Starting text cleaning (this may take a while)...
Text cleaning completed.
DataFrame shape after cleaning and removing empty reviews: (493820, 12)


In [8]:
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,review_full,review_cleaned
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True,12 mg is 12 on the periodic table people! Mg f...,mg periodic table people mg magnesium review c...
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True,Save the lanet using less plastic. Love these ...,save lanet using less plastic love easy multit...
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True,Fantastic I have been suffering a couple month...,fantastic suffering couple month heel pain pla...
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True,It holds the water and makes bubbles. That's ...,hold water make bubble thats bought cheap want...
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True,Not for me Didn't do a thing for me. Not sayin...,didnt thing saying dont


# Download GloVe embeddings if not already present

In [9]:
# Download and unzip GloVe if not already present
if not os.path.exists('glove.6B.zip'):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

--2025-04-12 11:25:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-04-12 11:25:13--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-04-12 11:25:13--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

# Load the GloVe embeddings into a dictionary

In [25]:
def load_glove_embeddings(glove_file_path):
    """
    Loads GloVe embeddings from a file into a dictionary.
    Returns a dict mapping 'word' -> embedding (as a NumPy array).
    """
    embeddings_dict = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_dict[word] = vector
    return embeddings_dict

# Load GloVe 300d
glove_file = 'glove.6B.300d.txt'  # adjust if you want a different dimension
embeddings_index = load_glove_embeddings(glove_file)
print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")


Loaded 400000 word vectors from GloVe.


# Define a function to get the average embedding of each review

In [26]:
from nltk.tokenize import word_tokenize

def get_average_embedding(text, embeddings_dict, embedding_dim=300):
    """
    Compute the average word embedding for the given text.
    """
    tokens = word_tokenize(text.lower())
    valid_vectors = []

    for token in tokens:
        if token in embeddings_dict:  # If the token has a GloVe embedding
            valid_vectors.append(embeddings_dict[token])

    if len(valid_vectors) > 0:
        # Average of all word embeddings in the review
        return np.mean(valid_vectors, axis=0)
    else:
        # If none of the tokens have embeddings, return a zero vector
        return np.zeros(embedding_dim, dtype='float32')

def convert_texts_to_embeddings(texts, embeddings_dict, embedding_dim=300):
    """
    Converts a list/array of texts into a 2D NumPy array of shape (num_texts, embedding_dim).
    """
    embeddings = []
    for text in texts:
        emb = get_average_embedding(text, embeddings_dict, embedding_dim)
        embeddings.append(emb)
    return np.array(embeddings, dtype='float32')


# Create features (X) and labels (y), then split into train/test sets

In [27]:
# Extract features and labels
X_texts = df['review_cleaned'].values  # the preprocessed reviews
y = df['rating'].values       # the corresponding labels (sentiment or classes)

# Split the data
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X_texts,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Number of training samples:", len(X_train_texts))
print("Number of testing samples:", len(X_test_texts))


Number of training samples: 395056
Number of testing samples: 98764


# Convert train/test reviews to embeddings

In [28]:
# Convert text data to embedding vectors
X_train_embeddings = convert_texts_to_embeddings(X_train_texts, embeddings_index, 300)
X_test_embeddings  = convert_texts_to_embeddings(X_test_texts,  embeddings_index, 300)

print("X_train_embeddings shape:", X_train_embeddings.shape)
print("X_test_embeddings shape:", X_test_embeddings.shape)


X_train_embeddings shape: (395056, 300)
X_test_embeddings shape: (98764, 300)


# Train an RF model

In [29]:
# !pip install tqdm
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib
# from tqdm.auto import tqdm # You likely won't use tqdm directly here

def build_random_forest_model(n_estimators=100, max_depth=None, random_state=42, verbose_level=1): # Added verbose_level
    """
    Creates and returns a Random Forest classifier with some
    general-purpose hyperparameters for speed and performance.
    Includes verbosity setting.
    """
    print(f"Building RandomForestClassifier with n_estimators={n_estimators}, max_depth={max_depth}, verbose={verbose_level}")
    return RandomForestClassifier(
        n_estimators=n_estimators,  # Number of trees in the forest
        criterion='gini',           # Function to measure the quality of a split
        max_depth=max_depth,        # Expand nodes until leaves are pure (or max_depth is reached)
        random_state=random_state,  # For reproducibility
        n_jobs=-1,                  # Use all available CPU cores
        class_weight='balanced',
        verbose=verbose_level
    )


def train_and_save_model(X_train_embeddings, y_train, model_path="rf_model.pkl"):
    """
    Builds, trains, and saves a Random Forest model.

    Args:
        X_train_embeddings (np.ndarray): Training data features (embeddings).
        y_train (np.ndarray): Training labels.
        model_path (str): File path to save the trained model.
    """
    # Set verbose level here if you want (e.g., verbose_level=2 for more output)
    rf_model = build_random_forest_model(verbose_level=1)

    print("\nStarting Random Forest training (verbose output below)...") # Added newline for clarity
    rf_model.fit(X_train_embeddings, y_train)
    print("Training finished.")

    # Ensure joblib dump is uncommented if you want to save
    joblib.dump(rf_model, model_path)
    print(f"Random Forest model trained and saved to: {model_path}")

In [30]:
# Define the path where you want to save the model
model_save_path = "/content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Saurav/Main_Assignment/Models Saved/rf_glove_model.pkl"

# Call the training function using the embeddings and labels you prepared
train_and_save_model(X_train_embeddings, y_train, model_path=model_save_path)

Building RandomForestClassifier with n_estimators=100, max_depth=None, verbose=1

Starting Random Forest training (verbose output below)...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 24.9min finished


Training finished.
Random Forest model trained and saved to: /content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Saurav/Main_Assignment/Models Saved/rf_glove_model.pkl


# Evaluate the model

In [31]:
import numpy as np
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def load_and_evaluate_model(X_test, y_test, model_path="rf_model.pkl"):
    """
    Loads a trained Random Forest model, predicts on the test set,
    and prints out evaluation metrics.

    Args:
        X_test (np.ndarray): Test data features (embeddings).
        y_test (np.ndarray): Test labels.
        model_path (str): Path to the saved model file (.pkl).
    """
    # 1. Load the trained model
    rf_model = joblib.load(model_load_path)
    print(f"Model loaded from: {model_load_path}")

    # 2. Predict on test data
    y_pred = rf_model.predict(X_test)

    # 3. Print evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    print("\nAccuracy:", accuracy)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Ensure you use the same path used for saving the model
model_load_path = "/content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Saurav/Main_Assignment/Models Saved/rf_glove_model.pkl"

# Call the evaluation function using the test embeddings and labels
load_and_evaluate_model(X_test_embeddings, y_test, model_path=model_load_path)

Model loaded from: /content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Saurav/Main_Assignment/Models Saved/rf_glove_model.pkl


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    5.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    9.3s finished



Accuracy: 0.6668725446518975

Classification Report:
              precision    recall  f1-score   support

           1       0.76      0.36      0.49     13909
           2       0.77      0.04      0.07      5779
           3       0.70      0.04      0.08      7387
           4       0.72      0.06      0.11     11395
           5       0.66      0.99      0.79     60294

    accuracy                           0.67     98764
   macro avg       0.72      0.30      0.31     98764
weighted avg       0.69      0.67      0.57     98764

Confusion Matrix:
[[ 5001    25    10     9  8864]
 [  713   227    22     4  4813]
 [  377    15   299    26  6670]
 [  137     2    33   698 10525]
 [  337    24    61   234 59638]]
