### Imports

In [1]:
# %pip install pydirectory

In [2]:
from bigdata_a3_utils import *
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil
import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np
from datasets import load_dataset, load_from_disk
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json
import dask.dataframe as dd

### Files/folders

In [3]:
# Directory holding the raw archives, and the scratch directory they are extracted into.
raw_files_path = r"D:\AS3\Raw"
extraction_path = r"D:\AS3\temp"

# Directory of the review pkl files.
review_pkls_path = r"C:\Big Data\A3\Data\review_pkl"
# Directory of the meta pkl files.
meta_pkls_path = r"C:\Big Data\A3\Data\meta_pkl"

### Categories to process

In [4]:
# categories = ['Unknown', 'Magazine_Subscriptions', 'Movies_and_TV', "Cell_Phones_and_Accessories", "Clothing_Shoes_and_Jewellery", "Digital_Music", "Hanmade_Products", "Baby_Products", "Beauty_and_Personal_Care", "Electronics"] # These are the ones that we have left to run

In [5]:
# This function handles the extraction of the brands
def extract_brand(details, store):
    """Return the brand of a product, falling back to its store name.

    Args:
        details: Product details. Either a dict, or a JSON string that decodes
            to a dict. Any other value (None, NaN, malformed JSON, a JSON list,
            ...) is treated as "no details available".
        store: Store name, used as the fallback brand.

    Returns:
        ``details["brand"]`` when that key exists and is truthy; otherwise
        ``store`` when it is a non-blank string; otherwise ``"Unknown"``.
    """
    try:
        if isinstance(details, str):
            # Details may arrive serialised as JSON text instead of as a dict;
            # decode them so the brand is not silently skipped.
            details = json.loads(details)
        if isinstance(details, dict) and "brand" in details and details["brand"]:
            return details["brand"]
    except Exception:
        # Best effort by design: undecodable text or a brand value whose
        # truthiness cannot be evaluated simply falls through to the store.
        pass
    if isinstance(store, str) and store.strip():
        return store
    return "Unknown"

### Clean categories

In [6]:
def clean_data_dask(category, review_df, meta_df,
                    output_dir=r"D:\AS3\Cleaned", rows_per_partition=250_000):
    """Merge one category's reviews with its metadata, clean the result and save it as Parquet.

    Pipeline, in order:
      1. left-join reviews and metadata on ``parent_asin``;
      2. keep rows whose ``rating`` is between 1 and 5 (inclusive);
      3. drop rows whose ``text`` is null or blank;
      4. derive ``brand`` with ``extract_brand`` ("Unknown" when it cannot be derived);
      5. drop duplicate ``(user_id, asin, text)`` rows;
      6. add ``review_length`` (number of whitespace-separated tokens) and ``year``
         (from the millisecond ``timestamp``), then truncate ``text`` to 10000 characters;
      7. keep only the columns listed in ``necessary_columns`` and write them to
         ``<output_dir>/<category>_cleaned.parquet`` (snappy, no index).

    Args:
        category: Category name; used to build the output file name.
        review_df: pandas DataFrame of reviews; must contain ``parent_asin``.
        meta_df: pandas DataFrame of product metadata; must contain ``parent_asin``.
        output_dir: Directory the Parquet dataset is written into (created if missing).
        rows_per_partition: Target number of rows per Dask partition. The partition
            count scales with the input size instead of being fixed.

    Raises:
        ValueError: If ``rows_per_partition`` is smaller than 1.
    """
    if rows_per_partition < 1:
        raise ValueError("rows_per_partition must be at least 1")

    os.makedirs(output_dir, exist_ok=True)

    necessary_columns = [
        "user_id", "asin", "parent_asin", "rating", "text", "verified_purchase",
        "helpful_vote", "review_length", "year", "brand", "main_category",
        "title", "average_rating", "rating_number", "price"
    ]
    # Every column that is read anywhere below. Anything else can never reach the
    # output, so it is dropped up front instead of being dragged through the
    # merge and the shuffle.
    used_columns = set(necessary_columns) | {"timestamp", "details", "store"}

    def to_dask(frame):
        """Keep only the used columns and split the frame into size-proportional partitions."""
        frame = frame[[col for col in frame.columns if col in used_columns]]
        n_parts = max(1, -(-len(frame) // rows_per_partition))  # ceiling division
        return dd.from_pandas(frame, npartitions=n_parts)

    # Rationale for the partitioning: a run with a fixed npartitions=4 failed with
    # "ArrowInvalid: Take operation overflowed binary array capacity". Smaller
    # partitions, fewer columns and a partitioned de-duplication (below) are meant
    # to keep every Arrow string array small - TODO confirm on the largest category.
    print("Merging review and meta on 'parent_asin'")
    # NOTE(review): a non-key column present in BOTH inputs (possibly 'title') is
    # suffixed _x/_y by the merge and is then silently left out by the final
    # column filter - confirm against the two schemas.
    merged = dd.merge(to_dask(review_df), to_dask(meta_df), on="parent_asin", how="left")

    if "rating" in merged.columns:
        merged = merged[merged["rating"].between(1, 5)]
    if "text" in merged.columns:
        merged = merged[merged["text"].notnull() & (merged["text"].str.strip() != "")]

    def brand_for_partition(part):
        """Compute the brand of every row of one pandas partition."""
        details = part["details"] if "details" in part.columns else [None] * len(part)
        stores = part["store"] if "store" in part.columns else [None] * len(part)
        brands = []
        for detail, store in zip(details, stores):
            try:
                brands.append(extract_brand(detail, store))
            except Exception:
                # Best effort: one odd row must not abort the whole category.
                brands.append("Unknown")
        return pd.Series(brands, index=part.index, name="brand", dtype="object")

    print("Extracting brand...")
    merged["brand"] = merged.map_partitions(brand_for_partition, meta=("brand", "object"))
    merged["brand"] = merged["brand"].fillna("Unknown")
    # The brand sources are no longer needed and are not part of the output.
    merged = merged[[col for col in merged.columns if col not in ("details", "store")]]

    print("Dropping duplicates...")
    # split_out keeps the de-duplicated result spread over many partitions rather
    # than letting it collapse into a single one.
    merged = merged.drop_duplicates(
        subset=["user_id", "asin", "text"], split_out=merged.npartitions
    )

    if "text" in merged.columns:
        # Token count is taken from the full text, before truncation.
        merged["review_length"] = merged["text"].str.split().map(lambda x: len(x) if x else 0, meta=("review_length", "int"))
        merged["text"] = merged["text"].str.slice(0, 10000)
    if "timestamp" in merged.columns:
        merged["year"] = dd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    merged = merged[[col for col in necessary_columns if col in merged.columns]]

    output_file = os.path.join(output_dir, f"{category}_cleaned.parquet")
    # overwrite=True clears part files left by an earlier (possibly failed) run so
    # they cannot be mixed into this dataset.
    merged.to_parquet(output_file, compression="snappy", write_index=False, overwrite=True)
    print(f"Cleaned data saved to {output_file}")

    # `merged.shape` is lazy on a Dask frame (the row count is not a number) and
    # evaluating it would re-run the whole pipeline; report the written data instead.
    written = dd.read_parquet(output_file)
    print("Frame shape: ", (len(written), len(written.columns)))


In [7]:
# def clean_data(category, review_df, meta_df):

#     output_dir = r"D:\AS3\pkl_dir"
#     os.makedirs(output_dir, exist_ok=True)

#     print("=== Merging review and meta on 'parent_asin' (LEFT JOIN) ===")
#     merged = pd.merge(review_df, meta_df, on="parent_asin", how="left")
#     print(f"After merge: {len(merged)} rows")

#     # invalid ratings
#     if "rating" in merged.columns:
#         before = len(merged)
#         merged = merged[merged["rating"].between(1.0, 5.0, inclusive="both")]
#         print(f"After filtering invalid ratings: {len(merged)} rows (dropped {before - len(merged)})")
    
#     # empty review texts
#     if "text" in merged.columns:
#         before = len(merged)
#         merged = merged[merged["text"].notna() & (merged["text"].str.strip() != "")]
#         print(f"After dropping empty text: {len(merged)} rows (dropped {before - len(merged)})")

#     #  brand
#     print("Extracting brand from metadata...")
#     merged["brand"] = merged.apply(
#         lambda row: extract_brand(row.get("details"), row.get("store")), axis=1
#     )
#     merged["brand"].fillna("Unknown", inplace=True)

#     # duplicates
#     before = len(merged)
#     merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)
#     print(f"After removing duplicates: {len(merged)} rows (dropped {before - len(merged)})")

#     # derived columns
#     if "text" in merged.columns:
#         merged["review_length"] = merged["text"].str.split().apply(len)

#     if "timestamp" in merged.columns:
#         merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

#     # cleaned data
#     output_file = os.path.join(output_dir, f"{category}_cleaned_merged.pkl.bz2")
#     merged.to_pickle(output_file, compression="bz2")
#     print(f"Saved cleaned file to {output_file}")

#     return merged

### Process Categories

In [8]:
def load_raw_rev_and_meta(extraction_path, category):
    """Load the raw review and metadata datasets of one category.

    For the review archive and then the metadata archive, the ``.tar.bz2`` file
    is located under the notebook-level ``raw_files_path``, loaded through
    ``load_compressed_dataset``, its ``"full"`` split is converted with
    ``to_pandas()``, and a Parquet copy is written into ``extraction_path``.

    Args:
        extraction_path: Directory handed to ``load_compressed_dataset`` and used
            as the destination of the Parquet copies.
        category: Category name used in the archive and Parquet file names.

    Returns:
        Tuple ``(review_df, meta_df)``.
    """
    # Reviews
    review_archive = os.path.join(raw_files_path, f"raw_review_{category}.tar.bz2")
    print(f"Loading review: {review_archive}")
    review_df = load_compressed_dataset(review_archive, extraction_path)["full"].to_pandas()
    review_parquet = os.path.join(extraction_path, f"{category}_review.parquet")
    review_df.to_parquet(review_parquet)

    # Metadata
    meta_archive = os.path.join(raw_files_path, f"raw_meta_{category}.tar.bz2")
    print(f"Loading meta: {meta_archive}")
    meta_df = load_compressed_dataset(meta_archive, extraction_path)["full"].to_pandas()
    meta_parquet = os.path.join(extraction_path, f"{category}_meta.parquet")
    meta_df.to_parquet(meta_parquet)

    return review_df, meta_df


### Categories to clean in this run

In [9]:
# Categories handled by this run. Each name is interpolated into the raw archive
# file names (raw_review_<name>.tar.bz2 / raw_meta_<name>.tar.bz2).
categories = [
    "Office_Products", "Patio_Lawn_and_Garden", "Pet_Supplies",
    "Sports_and_Outdoors", "Subscription_Boxes", "Tools_and_Home_Improvement",
    "Toys_and_Games", "Video_Games", "Unknown",
]


In [10]:
# Process the categories one after another: load the raw review/meta frames of a
# category, then merge, clean and write them with clean_data_dask.
# There is no error handling here, so an exception raised for one category
# stops the loop and the remaining categories are not processed.
for category in categories:
    rev_df, meta_df = load_raw_rev_and_meta(extraction_path, category)
    clean_data_dask(category, rev_df, meta_df)


Loading review: D:\AS3\Raw\raw_review_Office_Products.tar.bz2
Extracting D:\AS3\Raw\raw_review_Office_Products.tar.bz2 to D:\AS3\temp...
Loading dataset from D:\AS3\temp\raw_review_Office_Products...
Loading meta: D:\AS3\Raw\raw_meta_Office_Products.tar.bz2
Extracting D:\AS3\Raw\raw_meta_Office_Products.tar.bz2 to D:\AS3\temp...
Loading dataset from D:\AS3\temp\raw_meta_Office_Products...
Merging review and meta on 'parent_asin'
Extracting brand...
Dropping duplicates...


ArrowInvalid: Take operation overflowed binary array capacity