In [15]:
pip install polars

Note: you may need to restart the kernel to use updated packages.


In [16]:
import polars as pl
import boto3
import sys

In [17]:
metadata_path = "s3://amazon-electronics-dataset/raw_dataset_metadata/meta_Electronics.json.gz"
sentiment_path = "s3://amazon-electronics-dataset/sentiment_analysed_dataset/final_summarized_plus_sentiment.csv"
bucket = "amazon-electronics-dataset"
output_path = "sentiment_analysed_dataset/final_merged_metadata_summary_sentiment.csv"

In [18]:
# Load 31,100 summarized + sentiment dataset
print("Loading summarized + sentiment dataset...")
df_sent = pl.read_csv(sentiment_path)
print("Sentiment+summary dataset shape:", df_sent.shape)

# Extract product_id list (31100 unique)
product_ids = df_sent["product_id"].unique()
print("Unique product_ids found:", product_ids.len())

Loading summarized + sentiment dataset...
Sentiment+summary dataset shape: (31100, 13)
Unique product_ids found: 31100


In [20]:
import polars as pl

# Real schema for Electronics meta:
# ["asin","title","feature","description","brand","category","rank",
#  "main_cat","also_buy","also_view","similar_item","date","price",
#  "tech1","tech2","fit","imageURL","imageURLHighRes"]

print("Loading metadata (streaming from .json.gz)")

df_meta = (
    pl.scan_ndjson(metadata_path)
      .select([
          pl.col("asin").alias("product_id"),
          pl.col("title"),
          pl.col("feature"),
          pl.col("description"),
          pl.col("brand"),
          pl.col("category").alias("categories"),   # renamed
          pl.col("rank").alias("salesRank"),        # renamed
          pl.col("tech1"),
          pl.col("tech2"),
          pl.col("also_buy").alias("related")       # closest equivalent
      ])
      .filter(pl.col("product_id").is_in(product_ids))
      .collect()
)

print("Filtered metadata shape:", df_meta.shape)

Loading metadata (streaming from .json.gz)


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  .collect()


Filtered metadata shape: (32461, 10)


In [23]:
df_meta = (
    df_meta
      .group_by("product_id")
      .agg(pl.all().first())
)

print("Deduplicated metadata shape:", df_meta.shape)

Deduplicated metadata shape: (31066, 10)


In [33]:
# MERGE

print("Merging datasets")
df_final = df_sent.join(df_meta, on="product_id", how="left")

print("Merged final dataset shape:", df_final.shape)

# Sanity check — should be 31,100 rows
print("Row count after merge:", df_final.height)

Merging datasets
Merged final dataset shape: (31100, 22)
Row count after merge: 31100


In [34]:
print(df_final.dtypes)

[String, String, String, String, Int64, String, String, Int64, Int64, String, Int64, String, Float64, String, List(String), List(String), String, List(String), String, String, String, List(String)]


In [35]:
print(df_final.columns)

['product_id', 'all_reviews', 'all_user_summaries', 'avg_rating', 'review_count', 'total_helpful_votes', 'dominant_style', 'oldest_review_timestamp', 'newest_review_timestamp', 'abstracted_summary', 'review_count_right', 'sentiment_label', 'sentiment_score', 'title', 'feature', 'description', 'brand', 'categories', 'salesRank', 'tech1', 'tech2', 'related']


In [40]:
list_cols = ["feature", "description", "categories", "related"]

df_final = df_final.with_columns(
    [pl.col(c).list.join(" | ").alias(c) for c in list_cols]
)

In [41]:
# SAVE TO S3

local_file = "/tmp/final_merged_metadata_summary_sentiment.csv"
df_final.write_csv(local_file)

s3 = boto3.client("s3")
s3.upload_file(local_file, bucket, output_path)

print(f"Final dataset uploaded → s3://{bucket}/{output_path}")

Final dataset uploaded → s3://amazon-electronics-dataset/sentiment_analysed_dataset/final_merged_metadata_summary_sentiment.csv
