In [1]:
import polars as pl
import boto3
import sys
import os
import torch
import time

# Make the project-local modules under ../src importable from this notebook.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
from sentiment_analysis import (
    load_sentiment_model,
    get_sentiment,
    save_sentiment_checkpoint,
    merge_sentiment_checkpoints,
    merge_sentiment_with_summary
)

In [3]:
# S3 locations used throughout the notebook.
# Every URI is derived from `bucket` so that changing the bucket name in one
# place keeps all paths (and the key-stripping done when uploading) consistent.
bucket = "amazon-electronics-dataset"
s3_root = f"s3://{bucket}"

# Input: summarized, per-product dataset produced by the previous stage.
summarized_input = f"{s3_root}/summarized_dataset/final_summarized_grouped_30700.csv"
# Prefix under which the per-batch sentiment checkpoints are written.
sentiment_checkpoint_prefix = f"{s3_root}/sentiment_checkpoints/"
# Outputs: sentiment scores alone, and the summaries joined with sentiment.
sentiment_scores_output = f"{s3_root}/sentiment_analysed_dataset/sentiment_scores_only.csv"
final_merged_output = f"{s3_root}/sentiment_analysed_dataset/final_summarized_plus_sentiment.csv"

In [4]:
# Read the summarized dataset from S3, report its (rows, columns) shape and
# preview the first rows.
df_full = pl.read_csv(summarized_input)
print((df_full.height, df_full.width))
df_full.head(5)

(31100, 11)


product_id,all_reviews,all_user_summaries,avg_rating,review_count,total_helpful_votes,dominant_style,oldest_review_timestamp,newest_review_timestamp,abstracted_summary,review_count_right
str,str,str,str,i64,str,str,i64,i64,str,i64
"""B00OHDONOA""","""i bought this mp3 player to us…","""Great MP3 with amazing battery…",,42,,,1429747200,1466294400,"""The agptek 2015 is a fantastic…",42
"""B01GHLYPWE""","""doesn t support 120hz 1080p no…","""Good customer service but prod…",,48,,,1468800000,1536883200,"""Great deal worked as expected …",48
"""B00F19Q3T2""","""the price is right but i wish …","""love this lens for my gopro.  …",,58,,,1383350400,1496275200,"""The price is right but i wish …",58
"""B001584QQA""","""one day i turned them to put t…","""They worked great until they b…",,42,,,1262304000,1524441600,"""One day i turned them to put t…",42
"""B007PJ4PKK""","""i use the logitech wireless ke…","""Fun pattern, great keystroke f…",,82,,,1340668800,1479600000,"""I use the logitech wireless ke…",82


In [8]:
# Shape of the subset of rows whose abstracted summary is null
# (a row count of 0 means every product has a summary to score).
summary_is_missing = pl.col("abstracted_summary").is_null()
df_full.filter(summary_is_missing).shape

(0, 11)

In [9]:
# Number of distinct product ids; compare with the row count to check that
# each product appears once.
distinct_product_count = pl.col("product_id").n_unique()
df_full.select(distinct_product_count)

product_id
u32
31100


In [10]:
# Load the sentiment tokenizer and model via the project helper, which also
# returns the device the model should be run on.
tokenizer, model, device = load_sentiment_model()

print("Model loaded!")
print("Using device:", device)

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

2025-11-27 19:33:12.273885: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764271992.289681    2822 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764271992.294643    2822 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-27 19:33:12.309994: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Model loaded!
Using device: cuda


In [12]:
# Smoke test: score the first product's summary before launching the full run

first_row = df_full.row(0, named=True)
pid, text = first_row["product_id"], first_row["abstracted_summary"]

print("Product ID:", pid)
print("\nABSTRACTED SUMMARY\n")
print(text)

# Score this single summary with the loaded model
label, score = get_sentiment(tokenizer, model, device, text)

print("\nSENTIMENT OUTPUT")
print("Sentiment Label:", label)
print("Sentiment Score:", score)

Product ID: B00OHDONOA

ABSTRACTED SUMMARY

The agptek 2015 is a fantastic tool for capturing the early phases of a songwriting work in progress .<n>The agptek 2015 makes excellent recordings and stores them in a very logical folder system . The agptek 2015 is an excellent mp3 player it enables setting up play lists .<n>i prefer to simply create a folder and drop a group of mp3 song files into that folder .<n>The agptek is feather light but seems well made and will likely stand up to normal use . mp3 player takes practice learning to work it but otherwise wonderful just needed something to play mp3 s while i work out perfect for that and added bonus is batteries stay charged a long time .<n>i found this one does what i want w o a hitch long battery life with a micro sd card slot negotiating the menu takes practice as it is not as intuitive as other players easy .

SENTIMENT OUTPUT
Sentiment Label: positive
Sentiment Score: 0.9380781054496765


In [13]:
# Full sentiment loop with checkpoints
#
# Scores every product summary one at a time. Accumulated results are flushed
# to an S3 checkpoint every CHECKPOINT_EVERY products so a crash loses at most
# one batch, and an ETA is printed every PROGRESS_EVERY products.
#
# NOTE(review): checkpoint numbering always restarts at 1, so a re-run
# overwrites earlier files one by one. Presumably any higher-numbered files
# left over from a previous, longer run would still be under the prefix when
# the checkpoints are merged - confirm against merge_sentiment_checkpoints.

CHECKPOINT_EVERY = 100  # products per checkpoint file
PROGRESS_EVERY = 500    # products between ETA reports

# Iterate only the two columns the loop reads. Iterating df_full directly
# would convert every column of every row (including the large review-text
# columns) into Python objects that are never used.
rows_to_score = df_full.select(["product_id", "abstracted_summary"])
total_products = rows_to_score.height

results = []
checkpoint_num = 1
start_time = time.time()

print("Starting SENTIMENT ANALYSIS")

for i, (pid, text) in enumerate(rows_to_score.iter_rows(), start=1):

    label, score = get_sentiment(tokenizer, model, device, text)

    results.append({
        "product_id": pid,
        "sentiment_label": label,
        "sentiment_score": score
    })

    # Flush the current batch and start a fresh one.
    if i % CHECKPOINT_EVERY == 0:
        batch_df = pl.DataFrame(results)
        save_sentiment_checkpoint(batch_df, checkpoint_num, bucket, sentiment_checkpoint_prefix)
        results = []
        checkpoint_num += 1
        print(f"Checkpoint saved at {i} products")

    # ETA = average seconds per product so far * products remaining.
    if i % PROGRESS_EVERY == 0:
        elapsed = time.time() - start_time
        eta_hours = (elapsed / i) * (total_products - i) / 3600
        print(f"[Progress] {i}/{total_products} | ETA ~ {eta_hours:.2f} hours")

# Save the final partial batch (fewer than CHECKPOINT_EVERY products), if any.
if results:
    batch_df = pl.DataFrame(results)
    save_sentiment_checkpoint(batch_df, checkpoint_num, bucket, sentiment_checkpoint_prefix)

print("Sentiment analysis completed.")

Starting SENTIMENT ANALYSIS
[Checkpoint] Uploaded → s3://amazon-electronics-dataset/sentiment_checkpoints/sentiment_checkpoint_1.csv
Checkpoint saved at 100 products
[Checkpoint] Uploaded → s3://amazon-electronics-dataset/sentiment_checkpoints/sentiment_checkpoint_2.csv
Checkpoint saved at 200 products
[Checkpoint] Uploaded → s3://amazon-electronics-dataset/sentiment_checkpoints/sentiment_checkpoint_3.csv
Checkpoint saved at 300 products
[Checkpoint] Uploaded → s3://amazon-electronics-dataset/sentiment_checkpoints/sentiment_checkpoint_4.csv
Checkpoint saved at 400 products
[Checkpoint] Uploaded → s3://amazon-electronics-dataset/sentiment_checkpoints/sentiment_checkpoint_5.csv
Checkpoint saved at 500 products
[Progress] 500/31100 | ETA ~ 0.05 hours
[Checkpoint] Uploaded → s3://amazon-electronics-dataset/sentiment_checkpoints/sentiment_checkpoint_6.csv
Checkpoint saved at 600 products
[Checkpoint] Uploaded → s3://amazon-electronics-dataset/sentiment_checkpoints/sentiment_checkpoint_7.csv

In [14]:
# Merge the sentiment checkpoint files into a single table of scores.
# The helper is given the checkpoint prefix to read from and the output path
# for the merged scores file.

merge_args = {
    "bucket": bucket,
    "prefix": sentiment_checkpoint_prefix,
    "output_path": sentiment_scores_output,
}
sentiment_scores = merge_sentiment_checkpoints(**merge_args)

sentiment_scores.head()

[Final Merge] Created sentiment_scores_only.csv


product_id,sentiment_label,sentiment_score
str,str,f64
"""B00OHDONOA""","""positive""",0.938078
"""B01GHLYPWE""","""neutral""",0.365432
"""B00F19Q3T2""","""positive""",0.829172
"""B001584QQA""","""neutral""",0.482159
"""B007PJ4PKK""","""positive""",0.765428


In [15]:
# Merge sentiment back into full dataset

df_final = merge_sentiment_with_summary(df_full, sentiment_scores)

# Each product should still have exactly one row after the merge. A mismatch
# means the merge dropped products (missing scores) or duplicated them
# (repeated product ids in the scores table).
assert df_final.height == df_full.height, (
    f"Row count changed during merge: {df_full.height} -> {df_final.height}"
)

print(df_final.shape)

# Keep the preview as the cell's last expression so it is actually displayed;
# a bare .head() in the middle of a cell is evaluated and then discarded.
df_final.head()

(31100, 13)


In [16]:
# Save final merged dataset to S3

# upload_file needs the object key, i.e. the URI without its "s3://<bucket>/"
# prefix. Validate and strip that prefix explicitly, before any I/O, so a
# URI pointing at a different bucket fails fast instead of being uploaded
# under a key that still contains "s3://...".
s3_prefix = f"s3://{bucket}/"
if not final_merged_output.startswith(s3_prefix):
    raise ValueError(
        f"final_merged_output must start with {s3_prefix!r}, got {final_merged_output!r}"
    )
final_key = final_merged_output[len(s3_prefix):]

# Write the CSV to local disk first, then upload that file.
local_final = "/tmp/final_summarized_plus_sentiment.csv"
df_final.write_csv(local_final)

s3 = boto3.client("s3")
s3.upload_file(local_final, bucket, final_key)

print("FINAL DATASET SAVED", final_merged_output)

FINAL DATASET SAVED s3://amazon-electronics-dataset/sentiment_analysed_dataset/final_summarized_plus_sentiment.csv
