In [4]:
!pip install polars



In [5]:
import polars as pl
import pandas as pd
import sys

sys.path.append("../src")
from preprocess import clean_text, flatten_style

In [6]:
# Lazily scan the raw newline-delimited JSON reviews stored in S3.
BUCKET = "amazon-electronics-dataset"
KEY = "Raw_Dataset/Electronics_5.json"

raw_reviews_uri = "s3://" + BUCKET + "/" + KEY
df_lazy = pl.scan_ndjson(raw_reviews_uri)
df_lazy  # last expression of the cell: show the LazyFrame repr

In [7]:
# Narrow the lazy frame to the columns used in the rest of the notebook.
required_columns = [
    "reviewerID", "asin", "reviewText", "summary",
    "overall", "vote", "verified",
    "reviewTime", "unixReviewTime",
    "style", "image", "reviewerName",
]
df_lazy = df_lazy.select(required_columns)

In [8]:
# Execute the lazy query and materialise the result as an in-memory DataFrame.
df = df_lazy.collect()
# Preview the first five rows.
df.head(5)

reviewerID,asin,reviewText,summary,overall,vote,verified,reviewTime,unixReviewTime,style,image,reviewerName
str,str,str,str,f64,str,bool,str,i64,struct[1],list[str],str
"""AAP7PPBU72QFM""","""0151004714""","""This is the best novel I have …","""A star is born""",5.0,"""67""",True,"""09 18, 1999""",937612800,"{"" Hardcover""}",,"""D. C. Carrad"""
"""A2E168DTVGE6SV""","""0151004714""","""Pages and pages of introspecti…","""A stream of consciousness nove…",3.0,"""5""",True,"""10 23, 2013""",1382486400,"{"" Kindle Edition""}",,"""Evy"""
"""A1ER5AYS3FQ9O3""","""0151004714""","""This is the kind of novel to r…","""I'm a huge fan of the author a…",5.0,"""4""",False,"""09 2, 2008""",1220313600,"{"" Paperback""}",,"""Kcorn"""
"""A1T17LMQABMBN5""","""0151004714""","""What gorgeous language! What a…","""The most beautiful book I have…",5.0,"""13""",False,"""09 4, 2000""",968025600,"{"" Hardcover""}",,"""Caf Girl Writes"""
"""A3QHJ0FXK33OBE""","""0151004714""","""I was taken in by reviews that…","""A dissenting view--In part.""",3.0,"""8""",True,"""02 4, 2000""",949622400,"{"" Hardcover""}",,"""W. Shane Schmidt"""


In [9]:
# Keep only rows that have both a product id ("asin") and a review text.
has_product_id = pl.col("asin").is_not_null()
has_review_text = pl.col("reviewText").is_not_null()
df = df.filter(has_product_id & has_review_text)

In [10]:
# Map the raw field names to the snake_case names used from here on.
column_renames = dict(
    reviewerID="reviewer_id",
    asin="product_id",
    reviewText="original_review",
    summary="user_summary",
    overall="rating",
    vote="helpful_votes",
    verified="is_verified",
    reviewTime="review_date",
    unixReviewTime="review_timestamp",
)
df = df.rename(column_renames)

In [11]:
# Helpful votes: the raw value is a string, so convert it to an integer count.
df = df.with_columns([
    pl.col("helpful_votes")
      # Strip EVERY comma: str.replace only removes the first match, so a value
      # with two separators (e.g. "1,234,567") would fail the cast below and be
      # silently turned into 0.
      .str.replace_all(",", "", literal=True)
      # Non-strict cast: values that still are not valid integers become null.
      .cast(pl.Int64, strict=False)
      # Missing or unparsable vote counts are treated as zero votes.
      .fill_null(0)
])

In [12]:
# Replace missing summaries with a fixed placeholder string.
summary_filled = pl.col("user_summary").fill_null("No summary provided")
df = df.with_columns([summary_filled])

In [14]:
# Apply flatten_style to each value of the "style" column and store the
# result in a new "product_style" column.
product_style = (
    pl.col("style")
    .map_elements(flatten_style)
    .alias("product_style")
)
df = df.with_columns([product_style])

In [16]:
# Apply clean_text to each original review and store the result in a new
# "cleaned_review" column (the original text is kept alongside it).
cleaned_review = (
    pl.col("original_review")
    .map_elements(clean_text)
    .alias("cleaned_review")
)
df = df.with_columns([cleaned_review])

In [19]:
# Parse the "MM DD, YYYY" review-date strings into Date values; with
# strict=False, strings that do not match the format become null.
parsed_review_date = pl.col("review_date").str.to_date(
    format="%m %d, %Y", strict=False
)
# Make sure the unix timestamp column is stored as Int64.
review_timestamp = pl.col("review_timestamp").cast(pl.Int64)

df = df.with_columns([parsed_review_date, review_timestamp])


In [20]:
# Remove the columns that are not carried into the cleaned dataset.
unused_columns = ["image", "reviewerName", "style"]
df = df.drop(unused_columns)

In [21]:
# Keep only reviews whose cleaned text is longer than 20 characters.
review_length = pl.col("cleaned_review").str.len_chars()
df = df.filter(review_length > 20)

In [23]:
# Keep only products that have at least 5 reviews.
# Count reviews per product with pl.len(); GroupBy.count() is deprecated in
# favour of len and emits a DeprecationWarning.
counts = (
    df
    .group_by("product_id")
    .agg(pl.len().alias("review_count"))
)

# Attach each product's review count to its rows, then filter on it.
# NOTE: re-running this cell joins a second time and adds a suffixed
# duplicate of review_count, so run it once per fresh `df`.
df = df.join(counts, on="product_id", how="inner")
df = df.filter(pl.col("review_count") >= 5)

  .count()


In [24]:
#Save as CSV 
output_csv = "electronics_clean.csv"

df.to_pandas().to_csv(output_csv, index=False)

# Upload to S3
!aws s3 cp electronics_clean.csv s3://amazon-electronics-dataset/Preprocessed_dataset/

print("Cleaned dataset uploaded to: s3://amazon-electronics-dataset/Preprocessed_dataset/")

upload: ./electronics_clean.csv to s3://amazon-electronics-dataset/Preprocessed_dataset/electronics_clean.csv
Cleaned dataset uploaded to: s3://amazon-electronics-dataset/Preprocessed_dataset/
