In [None]:
# Install required libraries
!pip install transformers datasets torch pandas numpy scikit-learn spacy matplotlib plotly ipywidgets tqdm

In [None]:
# Download spaCy's small English pipeline (`en_core_web_sm`); the NER step later
# in this notebook loads it with spacy.load("en_core_web_sm").
# The installer output recommends restarting the kernel/runtime afterwards so the
# freshly installed package can be loaded.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Mount Google Drive at /content/drive so the dataset paths used in the cells
# below (/content/drive/MyDrive/AmazonReviews/...) resolve.
# The `google.colab` import is Colab-specific; adjust the paths and drop this
# cell when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import spacy
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset
import torch
import ipywidgets as widgets
from IPython.display import display
import sys

In [None]:
import json

# Define file paths (adjust to your environment)
metadata_path = '/content/drive/MyDrive/AmazonReviews/meta_Musical_Instruments.jsonl'
reviews_path = '/content/drive/MyDrive/AmazonReviews/Musical_Instruments.jsonl'
# NOTE: despite the .jsonl suffix, this file is written as ONE pretty-printed JSON
# array (see json.dump at the end of this cell) and must be read back with
# json.load, not line by line. The name is kept because the next cell reads it.
output_path = '/content/drive/MyDrive/AmazonReviews/unified.jsonl'

# Target number of unique parent_asins
target_count = 250000

# Data containers
unique_parent_asins = set()
unified_data = {}  # Dictionary keyed by parent_asin
skipped_lines = {"metadata": 0, "reviews": 0}  # non-blank lines that could not be used


def _parse_jsonl_line(line):
    """Parse one line of a JSONL file.

    Returns the decoded object when the line holds a JSON object (dict).
    Returns None for blank lines, malformed JSON, and valid JSON that is not an
    object (e.g. a bare number or list), so callers can safely call ``.get``.
    """
    stripped = line.strip()
    if not stripped:
        return None
    try:
        parsed = json.loads(stripped)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


print("Processing metadata to collect up to", target_count, "unique parent_asins...")

# Stream the metadata file line by line. The encoding is explicit so that
# non-ASCII product text is decoded the same way on every platform.
with open(metadata_path, 'r', encoding='utf-8') as meta_file:
    for line in meta_file:
        record = _parse_jsonl_line(line)
        if record is None:
            if line.strip():
                skipped_lines["metadata"] += 1  # malformed, not merely blank
            continue
        parent_asin = record.get('parent_asin')
        if parent_asin and parent_asin not in unique_parent_asins:
            unique_parent_asins.add(parent_asin)
            # Initialize the record with an empty list for reviews
            record['reviews'] = []
            unified_data[parent_asin] = record

            # Print status every 1,000 new unique parent_asins
            if len(unique_parent_asins) % 1000 == 0:
                print(f"Collected {len(unique_parent_asins)} unique parent_asins...")

            # Once we reach the target count, stop processing metadata
            if len(unique_parent_asins) >= target_count:
                break

print("Metadata processing complete. Total unique parent_asins collected:", len(unique_parent_asins))
if skipped_lines["metadata"]:
    print(f"Skipped {skipped_lines['metadata']} malformed metadata lines.")

print("\nProcessing reviews and merging into unified data...")

# Process the reviews file to add reviews only for products in our unified dataset.
reviews_processed = 0
with open(reviews_path, 'r', encoding='utf-8') as reviews_file:
    for line in reviews_file:
        review = _parse_jsonl_line(line)
        if review is None:
            if line.strip():
                skipped_lines["reviews"] += 1  # malformed, not merely blank
            continue

        parent_asin = review.get('parent_asin')
        if parent_asin and parent_asin in unified_data:
            unified_data[parent_asin]['reviews'].append(review)

        # Counts every successfully parsed review, whether or not it was merged.
        reviews_processed += 1
        # Print status update every 10,000 reviews processed.
        if reviews_processed % 10000 == 0:
            print(f"Processed {reviews_processed} reviews...")

print("Finished processing reviews.")
if skipped_lines["reviews"]:
    print(f"Skipped {skipped_lines['reviews']} malformed review lines.")

# Save the unified dataset (list of products) as a single JSON array
with open(output_path, 'w') as out_file:
    json.dump(list(unified_data.values()), out_file, indent=4)

print(f"\nUnified dataset with {len(unified_data)} products saved to: {output_path}")


In [None]:
import json

# Input: the merged products+reviews file produced by the merge step.
# Output: the same records trimmed down to the whitelisted fields below.
input_path = '/content/drive/MyDrive/AmazonReviews/unified.jsonl'
output_path = '/content/drive/MyDrive/AmazonReviews/filtered_unified_dataset.json'

# Product-level keys that survive filtering ("reviews" carries the nested list).
meta_allowed = [
    "main_category",
    "title",
    "average_rating",
    "rating_number",
    "features",
    "description",
    "price",
    "store",
    "categories",
    "details",
    "parent_asin",
    "reviews",
]

# Keys that survive filtering inside each entry of a product's "reviews" list.
review_allowed = [
    "rating",
    "text",
    "user_id",
    "helpful_vote",
    "verified_purchase",
]

def filter_review(review):
    """Project a review dict onto the whitelisted review fields.

    Fields named in ``review_allowed`` that are missing from ``review`` are
    left out of the result. A new dict is returned; ``review`` is not modified.
    """
    kept = {}
    for field in review_allowed:
        if field in review:
            kept[field] = review[field]
    return kept

def filter_meta(record):
    """Build a trimmed copy of a product record.

    Every key in ``meta_allowed`` appears in the result; keys missing from
    ``record`` are present with the value ``None``. When the "reviews" value is
    a list, each of its entries is passed through ``filter_review``.
    """
    trimmed = {}
    for field in meta_allowed:
        trimmed[field] = record.get(field)

    # Only a real list of reviews is rewritten; None or other types pass through.
    raw_reviews = trimmed.get("reviews")
    if isinstance(raw_reviews, list):
        trimmed["reviews"] = [filter_review(item) for item in raw_reviews]
    return trimmed

# Read the merged dataset back into memory.
with open(input_path, 'r') as src:
    unified_data = json.load(src)

# Trim every product (and its nested reviews) to the whitelisted fields.
filtered_data = list(map(filter_meta, unified_data))

# Persist the trimmed records as pretty-printed JSON.
with open(output_path, 'w') as dst:
    json.dump(filtered_data, dst, indent=4)

print(f"Filtered dataset saved to: {output_path}")

In [None]:
import json
import re

# Minimal cleaning function: remove HTML tags and URLs, collapse whitespace, and lowercase the text.
def minimal_clean_text(text):
    """Return a lightly normalised, lowercased version of ``text``.

    Args:
        text: The raw text. Non-string values are converted with ``str``;
            ``None`` is treated as missing text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is ``None`` (previously
        ``None`` became the literal word "none").
    """
    if text is None:
        return ""
    text = str(text)
    # Tags go first and become a space: this keeps words on either side of a
    # tag apart ("good<br />bad" -> "good bad") and stops the URL pattern below
    # from swallowing markup that is glued to a link.
    text = re.sub(r"<.*?>", " ", text)     # Remove HTML tags
    text = re.sub(r"http\S+", "", text)    # Remove URLs
    text = re.sub(r"\s+", " ", text)       # Normalize whitespace
    return text.strip().lower()

# Simple sentiment labeling based on rating.
def label_sentiment(rating):
    """Map a numeric rating to a coarse sentiment label.

    Args:
        rating: A star rating (whole review ratings as well as fractional
            product averages), or ``None`` when no rating is available.

    Returns:
        ``None`` if ``rating`` is ``None``; otherwise "negative" for ratings
        up to and including 2.0, "positive" for ratings of 4.0 and above, and
        "neutral" for everything in between.
    """
    if rating is None:
        return None
    if rating <= 2.0:
        return "negative"
    # Everything strictly between 2 and 4 is neutral. Fractional averages such
    # as 2.5 or 3.7 previously fell through to "positive".
    if rating < 4.0:
        return "neutral"
    return "positive"  # rating >= 4.0

# File paths (adjust these as needed)
input_path = '/content/drive/MyDrive/AmazonReviews/filtered_unified_dataset.json'
output_path = '/content/drive/MyDrive/AmazonReviews/cfu_dataset.json' #clean_filtered_unified dataset

# Print a progress line every this many products instead of one line per
# product, which would flood the cell output on a large dataset.
status_every = 1000

# Load the unified dataset (each record represents a product)
with open(input_path, 'r') as f:
    unified_data = json.load(f)

filtered_data = []

# Process each product record; product_record_counter is a 1-based sequence number.
for product_record_counter, product in enumerate(unified_data, start=1):
    average_rating = product.get("average_rating")

    # Process the reviews list: for each review, keep only the desired fields.
    # `or []` also covers a "reviews" key that is present but holds None.
    processed_reviews = []
    for review in product.get("reviews") or []:
        original_text = review.get("text", "")
        review_rating = review.get("rating")
        processed_reviews.append({
            "rating": review_rating,
            "sentiment": label_sentiment(review_rating),
            "text": original_text,
            # A missing (None) text is cleaned as empty, not as the word "none".
            "cleaned_text": minimal_clean_text("" if original_text is None else original_text),
            "user_id": review.get("user_id"),
            "helpful_vote": review.get("helpful_vote"),
            "verified_purchase": review.get("verified_purchase")
        })

    # Create a new product record with fields in the specified order and include product_record.
    filtered_data.append({
        "product_record": product_record_counter,  # new sequential field
        "main_category": product.get("main_category"),
        "title": product.get("title"),
        "parent_asin": product.get("parent_asin"),
        "average_rating": average_rating,
        "overall_sentiment": label_sentiment(average_rating),  # derived from average_rating
        "rating_number": product.get("rating_number"),
        "features": product.get("features"),
        "description": product.get("description"),
        "details": product.get("details"),
        "price": product.get("price"),
        "store": product.get("store"),
        "categories": product.get("categories"),
        "reviews": processed_reviews
    })

    # Periodic status update.
    if product_record_counter % status_every == 0:
        print(f"Sample-{product_record_counter} product_record - {product_record_counter} is completed")

print(f"Processed {len(filtered_data)} product records.")

# Save the filtered dataset to a new JSON file with pretty-printing
with open(output_path, 'w') as out_file:
    json.dump(filtered_data, out_file, indent=4)

print(f"\nFiltered unified dataset saved to: {output_path}")

In [None]:
import json
import spacy
import re

# Load the spaCy English model once for the whole cell; `extract_entities`
# below reads this module-level `nlp` object on every call.
# Requires the `en_core_web_sm` package downloaded near the top of the notebook.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """
    Run the module-level spaCy pipeline ``nlp`` over ``text`` and collect its
    named entities.
    Returns a list of (entity_text, entity_label) tuples, in the order spaCy
    reports them in ``doc.ents``.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

def minimal_clean_text(text):
    """
    Perform minimal cleaning: remove HTML tags and URLs, normalize spaces, and lowercase the text.

    Non-string values are converted with ``str``; ``None`` is treated as missing
    text and yields ``""`` (previously it became the literal word "none").
    """
    if text is None:
        return ""
    text = str(text)
    # Tags go first and become a space: this keeps words on either side of a
    # tag apart ("good<br />bad" -> "good bad") and stops the URL pattern below
    # from swallowing markup that is glued to a link.
    text = re.sub(r"<.*?>", " ", text)     # remove HTML tags
    text = re.sub(r"http\S+", "", text)    # remove URLs
    text = re.sub(r"\s+", " ", text)       # normalize whitespace
    return text.strip().lower()

# File paths (adjust these paths as needed)
input_path =   '/content/drive/MyDrive/AmazonReviews/cfu_dataset.json' # Unified dataset from previous merging steps
output_path = '/content/drive/MyDrive/AmazonReviews/pcfu_dataset.json' # after preprocessed with NER

# Print a progress line every this many products instead of one line per
# product, which would flood the cell output on a large dataset.
status_every = 1000

# Load the unified dataset (assumed to be a list of product records)
with open(input_path, 'r') as f:
    unified_data = json.load(f)

# Process each product record to extract named entities
for index, product in enumerate(unified_data):
    # Collect the product-level text fields: title, description, and details.
    # Every piece goes through str() so a non-string value (e.g. a number in
    # "details" or in a description list) cannot break the join.
    text_parts = []

    title = product.get("title")
    if title:
        text_parts.append(str(title))

    description = product.get("description")
    if description:
        if isinstance(description, list):
            text_parts.extend(str(item) for item in description if item is not None)
        else:
            text_parts.append(str(description))

    details = product.get("details")
    if details and isinstance(details, dict):
        text_parts.extend(str(val) for val in details.values())

    combined_text = " ".join(text_parts)

    # Optionally, you can also include aggregated review text if desired.
    # For now, we'll only process the product-level fields.

    # Apply minimal cleaning before extracting entities.
    # NOTE(review): minimal_clean_text lowercases the text; entity recognition
    # may work better on the original casing -- confirm whether lowercasing is
    # wanted here before relying on `named_entities`.
    combined_text_cleaned = minimal_clean_text(combined_text)

    # Extract entities using spaCy's NER
    product["named_entities"] = extract_entities(combined_text_cleaned)

    # Periodic status update.
    if (index + 1) % status_every == 0:
        print(f"Sample-{index + 1} product_record - {index + 1} is completed")

print(f"Processed {len(unified_data)} product records.")

# Save the updated dataset with named_entities back to JSON
with open(output_path, 'w') as out_file:
    json.dump(unified_data, out_file, indent=4)

print(f"\nAll samples processed. Updated dataset saved to: {output_path}")

In [None]:
from transformers import pipeline

# Replace with the actual path where your fine-tuned model is saved.
model_path = "/content/drive/MyDrive/AmazonReviews/distilbert_finetuned"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

# Test the model on a sample review text
sample_text = "The product was received incomplete. I was supposed to received the full drum set as shown in the picture without the cymbals. But only one box was received with the bass drum and one tom"
result = sentiment_pipeline(sample_text)[0]

# Depending on how the fine-tuned model was saved, the positive class is reported
# as 'LABEL_1' or 'POSITIVE'. Both are accepted, matching the mapping used by the
# batch-scoring cell further down.
predicted_label = "positive" if result["label"] in ("LABEL_1", "POSITIVE") else "negative"
print(f"Sentiment: {predicted_label} (score: {result['score']:.2f})")

Device set to use cuda:0


Sentiment: negative (score: 0.99)


In [None]:
import json
from transformers import pipeline

# Paths to input and output JSON files (adjust as needed)
input_json_path = '/content/drive/MyDrive/AmazonReviews/pcfu_dataset.json'  # Unified dataset (filtered, cleaned, preprocessed)
output_json_path = '/content/drive/MyDrive/AmazonReviews/final_dataset.json'  # Final updated dataset

# Load the unified dataset
with open(input_json_path, 'r') as f:
    data = json.load(f)

# Load the fine-tuned DistilBERT sentiment pipeline.
# Ensure that model_path points to your fine-tuned model directory.
model_path = "/content/drive/MyDrive/AmazonReviews/distilbert_finetuned"
bert_pipeline = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

# Depending on your fine-tuned model, the positive label may be 'LABEL_1' or 'POSITIVE'
positive_labels = ("LABEL_1", "POSITIVE")

# Print a progress line every this many products instead of one line per
# product, which would flood the cell output on a large dataset.
status_every = 1000

# Initialize counters for evaluation
total_reviews = 0        # every review seen, scored or not
predicted_reviews = 0    # reviews that had text and received a prediction
evaluated_reviews = 0    # predicted reviews whose rating label is positive/negative
correct_predictions = 0
positive_count = 0
negative_count = 0

# Process each product in the dataset, scoring all of its reviews in one pipeline call
for product_record_counter, product in enumerate(data, start=1):
    # `or []` covers both a missing "reviews" key and one that holds None.
    reviews = product.get("reviews") or []
    texts = []         # List of cleaned text for valid reviews
    valid_indices = [] # Indices corresponding to reviews with valid text

    for idx, review in enumerate(reviews):
        total_reviews += 1

        # Remove the 'text' field to reduce file size. This is done for every
        # review, not only the ones that get a prediction.
        review.pop("text", None)

        # Use the "cleaned_text" for prediction (if available; otherwise skip this review)
        review_text = review.get("cleaned_text", "")
        if review_text:
            texts.append(review_text)
            valid_indices.append(idx)

    if texts:
        # One call with the whole list; predictions come back one per text, in order.
        predictions = bert_pipeline(texts, truncation=True, max_length=512)

        # Assign predictions to the corresponding reviews based on their stored index
        for idx, result in zip(valid_indices, predictions):
            predicted_label = "positive" if result["label"] in positive_labels else "negative"
            review = reviews[idx]
            review["bert_sentiment"] = {"label": predicted_label, "score": result["score"]}
            predicted_reviews += 1

            # Compare with the rating-derived sentiment. The model output is
            # binary, so only reviews labelled positive/negative are scored;
            # "neutral" or unlabelled reviews could never count as correct.
            original_sentiment = review.get("sentiment")
            if original_sentiment in ("positive", "negative"):
                evaluated_reviews += 1
                if original_sentiment == predicted_label:
                    correct_predictions += 1

            # Count the sentiment predictions
            if predicted_label == "positive":
                positive_count += 1
            else:
                negative_count += 1

    # Periodic status update.
    if product_record_counter % status_every == 0:
        print(f"Sample product_record - {product_record_counter} is completed")

# Accuracy over the reviews that were actually evaluated. Dividing by
# total_reviews would also count reviews that were never scored.
accuracy = correct_predictions / evaluated_reviews if evaluated_reviews > 0 else 0.0

# Save the updated dataset to a new JSON file with pretty printing
with open(output_json_path, 'w') as out_f:
    json.dump(data, out_f, indent=4)

print(f"\nTotal reviews processed: {total_reviews}")
print(f"Reviews scored by the model: {predicted_reviews}")
print(f"Reviews with a positive/negative rating label used for accuracy: {evaluated_reviews}")
print(f"Accuracy of bert_sentiment compared to original sentiment: {accuracy:.2f}")
print(f"Count of positive reviews (predicted by BERT): {positive_count}")
print(f"Count of negative reviews (predicted by BERT): {negative_count}")
print(f"Updated dataset saved to: {output_json_path}")