In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Replace with your actual file path
file_path = '/content/drive/MyDrive/DADM/Main_Assignment_Shared_resources/Mervin/Main_Assignment/Health_and_Personal_Care.jsonl'

# Load the .jsonl file into a pandas DataFrame
df = pd.read_json(file_path, lines=True)

# Display the first 5 rows of the DataFrame
print("DataFrame Loaded Successfully!")
df.head()


DataFrame Loaded Successfully!


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True


In [None]:
# number of reviews
print(len(df))

print(list(df.columns))

494121
['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


In [None]:
# count of all ratings to check if there's imbalance
df['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,301713
1,69564
4,57000
3,36949
2,28895


There seems to be a class imbalance. 5 star reviews seem to dominate a lot.

In [None]:
df.isnull().sum()

Unnamed: 0,0
rating,0
title,0
text,0
images,0
asin,0
parent_asin,0
user_id,0
timestamp,0
helpful_vote,0
verified_purchase,0


There doesn't seem to be any null values.

In [3]:
!pip install lightgbm -q # Install lightgbm quietly
import lightgbm
print(f"LightGBM version: {lightgbm.__version__}")

LightGBM version: 4.5.0


In [5]:
from google.colab import drive
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Preprocessing

In [None]:
df.dropna(subset=['text', 'rating'], inplace=True)
print(f"\nDataFrame shape after dropping rows with missing text/rating: {df.shape}")

# Combine 'title' and 'text'
# Fill missing titles with an empty string BEFORE concatenation
df['title'] = df['title'].fillna('')
df['review_full'] = df['title'] + ' ' + df['text']

# Text Cleaning Setup
nltk.download('punkt_tab', quiet=True) # Download the specific resource needed
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # 1. Lowercasing
    text = text.lower()
    # 2. Remove HTML tags (if any)
    text = re.sub(r'<.*?>', '', text)
    # 3. Remove punctuation and special characters (keeping only letters and whitespace)
    text = re.sub(r'[^a-z\s]', '', text)
    # 4. Tokenization
    tokens = word_tokenize(text)
    # 5. Remove Stop Words and Lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1] # Keep words with length > 1
    # 6. Join back into string
    return ' '.join(cleaned_tokens)

print("\nStarting text cleaning (this may take a while)...")
# Apply cleaning function - Monitor progress if needed for large datasets
# Consider df['review_full'].parallel_apply(clean_text) using libraries like pandarallel if speed is critical
df['review_cleaned'] = df['review_full'].apply(clean_text)
print("Text cleaning completed.")

# Drop rows where cleaning might have resulted in empty strings
df.dropna(subset=['review_cleaned'], inplace=True)
df = df[df['review_cleaned'].str.strip() != ''] # Ensure no empty strings after cleaning
print(f"DataFrame shape after cleaning and removing empty reviews: {df.shape}")


DataFrame shape after dropping rows with missing text/rating: (494121, 11)

Starting text cleaning (this may take a while)...
Text cleaning completed.
DataFrame shape after cleaning and removing empty reviews: (493820, 12)


In [None]:
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,review_full,review_cleaned
0,4,12 mg is 12 on the periodic table people! Mg f...,This review is more to clarify someone else’s ...,[],B07TDSJZMR,B07TDSJZMR,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-02-06 00:49:35.902,3,True,12 mg is 12 on the periodic table people! Mg f...,mg periodic table people mg magnesium review c...
1,5,Save the lanet using less plastic.,Love these easy multitasking bleach tablets. B...,[],B08637FWWF,B08637FWWF,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:03:06.880,3,True,Save the lanet using less plastic. Love these ...,save lanet using less plastic love easy multit...
2,5,Fantastic,I have been suffering a couple months with hee...,[],B07KJVGNN5,B07KJVGNN5,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2019-07-24 11:13:58.905,0,True,Fantastic I have been suffering a couple month...,fantastic suffering couple month heel pain pla...
3,4,It holds the water and makes bubbles. That's ...,"It's cheap and it does what I wanted. The ""ma...",[],B007HY7GC2,B092RP73CX,AEZGPLOYTSAPR3DHZKKXEFPAXUAA,2022-09-04 02:29:02.725,7,True,It holds the water and makes bubbles. That's ...,hold water make bubble thats bought cheap want...
4,1,Not for me,Didn't do a thing for me. Not saying they don'...,[],B08KYJLF5T,B08KYJLF5T,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,2022-01-20 23:53:07.262,0,True,Not for me Didn't do a thing for me. Not sayin...,didnt thing saying dont


Saving the processed data frame in a file (pickle, parquet and csv) as a checkpoints.

In [None]:
# saving checkpoint
df.to_pickle('/content/drive/MyDrive/DADM/Main_Assignment_Shared_resources/Mervin/Main_Assignment/reviews_with_full_column.pkl')

NameError: name 'df' is not defined

In [None]:
df.to_parquet('/content/drive/MyDrive/DADM/Main_Assignment_Shared_resources/Mervin/Main_Assignment/reviews_with_full_column.parquet')

In [None]:
df.to_csv('/content/drive/MyDrive/DADM/Main_Assignment_Shared_resources/Mervin/Main_Assignment/reviews_with_full_column.csv', index=False)

#Load from this checkpoint if saved already

In [6]:
import pandas as pd

# loading checkpoint
df = pd.read_pickle('/content/drive/MyDrive/DADM/Main_Assignment_Shared_resources/Mervin/Main_Assignment/reviews_with_full_column.pkl')

# --- 3. Feature Engineering (TF-IDF) ---

In [7]:
# Define Features (X) and Target (y)
X = df['review_cleaned']
y = df['rating']

# Initialize TF-IDF Vectorizer
# You can experiment with parameters like max_features, ngram_range etc. later
# max_features=5000 limits the vocabulary size, can speed up training and reduce memory
# tfidf_vectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.7) # Example parameters
tfidf_vectorizer = TfidfVectorizer() # Example parameters
print("\nInitializing TF-IDF Vectorizer...")




Initializing TF-IDF Vectorizer...


# --- 4. Data Splitting ---

In [8]:
print("Splitting data into Training and Testing sets (80/20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,    # 20% for testing
    random_state=42,  # for reproducibility
    stratify=y        # IMPORTANT: keep class distribution same in train/test due to imbalance
)
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")


Splitting data into Training and Testing sets (80/20)...
Training set size: 395056 samples
Testing set size: 98764 samples


# --- 5. Apply TF-IDF & Model Training ---

In [9]:
# Fit TF-IDF on Training data and Transform Training data
print("\nApplying TF-IDF to training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
print(f"TF-IDF applied to training data. Shape: {X_train_tfidf.shape}")

# Transform Test data using the *same* fitted vectorizer
print("Applying TF-IDF to testing data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print(f"TF-IDF applied to testing data. Shape: {X_test_tfidf.shape}")

# Train LightGBM Classifier with Class Weighting
print("\nTraining LightGBM model with class_weight='balanced'...")
# Initialize the LGBMClassifier
lgbm_model = LGBMClassifier(
    class_weight='balanced', # Handles imbalance
    random_state=42,         # For reproducibility
    n_jobs=-1                # Use all available CPU cores
    # You can add other parameters like n_estimators, learning_rate, etc. for tuning if needed
    # e.g., n_estimators=200, learning_rate=0.1
)

# Fit the model on the TF-IDF transformed training data
lgbm_model.fit(X_train_tfidf, y_train)
print("Model training completed.")


Applying TF-IDF to training data...
TF-IDF applied to training data. Shape: (395056, 149434)
Applying TF-IDF to testing data...
TF-IDF applied to testing data. Shape: (98764, 149434)

Training LightGBM model with class_weight='balanced'...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 87.670187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 522837
[LightGBM] [Info] Number of data points in the train set: 395056, number of used features: 11302
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
Model training completed.


# --- 6. Model Evaluation ---

In [10]:
print("\nEvaluating model performance on the Test set...")

# Predict on the test set using the trained LightGBM model
y_pred = lgbm_model.predict(X_test_tfidf) # Use the lgbm_model here

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Accuracy: {accuracy:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=[str(i) for i in sorted(y.unique())], zero_division=0))

# Confusion Matrix
print("\nConfusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, y_pred),
                   index=[f'Actual_{i}' for i in sorted(y.unique())],
                   columns=[f'Predicted_{i}' for i in sorted(y.unique())]))

print("\n--- Script Finished ---")


Evaluating model performance on the Test set...





Overall Accuracy: 0.6591

Classification Report:
              precision    recall  f1-score   support

           1       0.64      0.70      0.67     13909
           2       0.26      0.42      0.32      5779
           3       0.37      0.40      0.39      7387
           4       0.34      0.54      0.41     11395
           5       0.92      0.73      0.81     60294

    accuracy                           0.66     98764
   macro avg       0.50      0.56      0.52     98764
weighted avg       0.73      0.66      0.68     98764


Confusion Matrix:
          Predicted_1  Predicted_2  Predicted_3  Predicted_4  Predicted_5
Actual_1         9717         2494          783          502          413
Actual_2         1535         2417         1019          518          290
Actual_3          849         1415         2979         1503          641
Actual_4          549          764         1381         6131         2570
Actual_5         2539         2381         1916         9605        4385