In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Replace with your actual file path
file_path = '/content/drive/MyDrive/Data Mining/Main_Assignment_Shared_resources/Health_and_Personal_Care.jsonl'

# Load the .jsonl file into a pandas DataFrame
df = pd.read_json(file_path, lines=True)

# Display the first 5 rows of the DataFrame
print("DataFrame Loaded Successfully!")
df.head()


DataFrame Loaded Successfully!


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True


In [3]:
# number of reviews
print(len(df))

print(list(df.columns))

494121
['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


In [4]:
# count of all ratings to check if there's imbalance
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,301713
1,69564
4,57000
3,36949
2,28895


In [5]:
df.isnull().sum()

Unnamed: 0,0
rating,0
title,0
text,0
images,0
asin,0
parent_asin,0
user_id,0
timestamp,0
helpful_vote,0
verified_purchase,0


In [6]:
from google.colab import drive
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC                           # <-- Import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import time

# Preprocessing

In [7]:
df.dropna(subset=['text', 'rating'], inplace=True)
print(f"\nDataFrame shape after dropping rows with missing text/rating: {df.shape}")

# Combine 'title' and 'text'
# Fill missing titles with an empty string BEFORE concatenation
df['title'] = df['title'].fillna('')
df['review_full'] = df['title'] + ' ' + df['text']

# Text Cleaning Setup
nltk.download('punkt_tab', quiet=True) # Download the specific resource needed
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # 1. Lowercasing
    text = text.lower()
    # 2. Remove HTML tags (if any)
    text = re.sub(r'<.*?>', '', text)
    # 3. Remove punctuation and special characters (keeping only letters and whitespace)
    text = re.sub(r'[^a-z\s]', '', text)
    # 4. Tokenization
    tokens = word_tokenize(text)
    # 5. Remove Stop Words and Lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1] # Keep words with length > 1
    # 6. Join back into string
    return ' '.join(cleaned_tokens)

print("\nStarting text cleaning (this may take a while)...")
# Apply cleaning function - Monitor progress if needed for large datasets
# Consider df['review_full'].parallel_apply(clean_text) using libraries like pandarallel if speed is critical
df['review_cleaned'] = df['review_full'].apply(clean_text)
print("Text cleaning completed.")

# Drop rows where cleaning might have resulted in empty strings
df.dropna(subset=['review_cleaned'], inplace=True)
df = df[df['review_cleaned'].str.strip() != ''] # Ensure no empty strings after cleaning
print(f"DataFrame shape after cleaning and removing empty reviews: {df.shape}")


DataFrame shape after dropping rows with missing text/rating: (494121, 10)

Starting text cleaning (this may take a while)...
Text cleaning completed.
DataFrame shape after cleaning and removing empty reviews: (493820, 12)


In [8]:
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,review_full,review_cleaned
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True,12 mg is 12 on the periodic table people! Mg f...,mg periodic table people mg magnesium review c...
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True,Save the lanet using less plastic. Love these ...,save lanet using less plastic love easy multit...
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True,Fantastic I have been suffering a couple month...,fantastic suffering couple month heel pain pla...
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True,It holds the water and makes bubbles. That's ...,hold water make bubble thats bought cheap want...
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True,Not for me Didn't do a thing for me. Not sayin...,didnt thing saying dont


In [9]:
!pip install numpy pandas scikit-learn nltk

import numpy as np
import pandas as pd
import nltk
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK data (if you haven't already)
nltk.download('punkt')  # For tokenization




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Download GloVe embeddings if not already present

In [10]:
# Download and unzip GloVe if not already present
if not os.path.exists('glove.6B.zip'):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

--2025-04-12 12:41:58--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-04-12 12:41:58--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-04-12 12:41:58--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

# Load the GloVe embeddings into a dictionary

In [11]:
def load_glove_embeddings(glove_file_path):
    """
    Loads GloVe embeddings from a file into a dictionary.
    Returns a dict mapping 'word' -> embedding (as a NumPy array).
    """
    embeddings_dict = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_dict[word] = vector
    return embeddings_dict

# Load GloVe 100d
glove_file = 'glove.6B.300d.txt'  # adjust if you want a different dimension
embeddings_index = load_glove_embeddings(glove_file)
print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")


Loaded 400000 word vectors from GloVe.


# Define a function to get the average embedding of each review

In [12]:
from nltk.tokenize import word_tokenize

def get_average_embedding(text, embeddings_dict, embedding_dim=300):
    """
    Compute the average word embedding for the given text.
    """
    tokens = word_tokenize(text.lower())
    valid_vectors = []

    for token in tokens:
        if token in embeddings_dict:  # If the token has a GloVe embedding
            valid_vectors.append(embeddings_dict[token])

    if len(valid_vectors) > 0:
        # Average of all word embeddings in the review
        return np.mean(valid_vectors, axis=0)
    else:
        # If none of the tokens have embeddings, return a zero vector
        return np.zeros(embedding_dim, dtype='float32')

def convert_texts_to_embeddings(texts, embeddings_dict, embedding_dim=300):
    """
    Converts a list/array of texts into a 2D NumPy array of shape (num_texts, embedding_dim).
    """
    embeddings = []
    for text in texts:
        emb = get_average_embedding(text, embeddings_dict, embedding_dim)
        embeddings.append(emb)
    return np.array(embeddings, dtype='float32')


# Create features (X) and labels (y), then split into train/test sets

In [13]:
# Extract features and labels
X_texts = df['review_cleaned'].values  # the preprocessed reviews
y = df['rating'].values       # the corresponding labels (sentiment or classes)

# Split the data
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X_texts,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Number of training samples:", len(X_train_texts))
print("Number of testing samples:", len(X_test_texts))


Number of training samples: 395056
Number of testing samples: 98764


# Convert train/test reviews to embeddings

In [14]:
# Convert text data to embedding vectors
X_train_embeddings = convert_texts_to_embeddings(X_train_texts, embeddings_index, 300)
X_test_embeddings  = convert_texts_to_embeddings(X_test_texts,  embeddings_index, 300)

print("X_train_embeddings shape:", X_train_embeddings.shape)
print("X_test_embeddings shape:", X_test_embeddings.shape)


X_train_embeddings shape: (395056, 300)
X_test_embeddings shape: (98764, 300)


# Train an SVM model

In [15]:
# Necessary Imports (ensure these are imported earlier in your script)
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler # IMPORTANT: Scaling is highly recommended!
import time
import numpy as np # Assuming X_train_embeddings is a numpy array

# --- Pre-computation Check (Highly Recommended) ---
# LinearSVC (like SVC) is sensitive to feature scaling.
# Ensure your X_train_embeddings ARE SCALED before fitting.
# If not, uncomment and run this block FIRST:

# --- Training with LinearSVC ---

print("Initializing and training LinearSVC classifier...")
start_time = time.time()

# Create and train the LinearSVC classifier
# Use scaled data if you performed scaling above
# Replace X_train_embeddings with X_train_embeddings_scaled if you scaled
linear_svm_model = LinearSVC(
    class_weight='balanced',  # Keep this parameter
    random_state=42,          # Keep this parameter
    C=1.0,                    # Default C, same as SVC default. Tune if needed.
    max_iter=3000,            # Increase if convergence issues arise (default=1000)
    dual='auto'               # Recommended setting ('auto')
                              # Consider dual=False if n_samples > n_features and speed is critical
)

# FIT THE MODEL using the appropriate data (scaled or original)
# Make sure y_train is also defined and has the correct shape
try:
    # Use scaled data if available and scaling was successful
    if 'X_train_embeddings_scaled' in locals() and X_train_embeddings is not None:
        print("Fitting LinearSVC on SCALED data...")
        linear_svm_model.fit(X_train_embeddings, y_train)
    else:
        print("Fitting LinearSVC on ORIGINAL (unscaled) data...")
        linear_svm_model.fit(X_train_embeddings, y_train)

    end_time = time.time()
    print(f"LinearSVC training complete in {end_time - start_time:.2f} seconds.")

    # Optional: Assign back to the original variable name if needed downstream
    svm_model = linear_svm_model

except Exception as e:
    print(f"An error occurred during LinearSVC fitting: {e}")
    print("Check data shapes, types (ensure numeric features, appropriate labels), and scaling.")

Initializing and training LinearSVC classifier...
Fitting LinearSVC on ORIGINAL (unscaled) data...
LinearSVC training complete in 675.61 seconds.


# Evaluate the model

In [16]:
# Predict on test embeddings
y_pred = svm_model.predict(X_test_embeddings)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.6828095257381233

Classification Report:
              precision    recall  f1-score   support

           1       0.54      0.72      0.62     13909
           2       0.26      0.25      0.26      5779
           3       0.30      0.28      0.29      7387
           4       0.42      0.20      0.27     11395
           5       0.83      0.85      0.84     60294

    accuracy                           0.68     98764
   macro avg       0.47      0.46      0.46     98764
weighted avg       0.67      0.68      0.67     98764

Confusion Matrix:
[[10056  1128   732   301  1692]
 [ 2299  1473   739   274   994]
 [ 1807   914  2063   607  1996]
 [ 1185   697  1457  2303  5753]
 [ 3301  1411  1988  2052 51542]]
