# COMP 3610 – A3

- Zidane Timothy, Maia Neptune, Christophe Gittens

In [1]:
%pip install fastparquet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import findspark
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil

import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np

from datasets import load_dataset
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json
import dask.dataframe as dd
import gc


### Files/folders

In [3]:
# raw_files_path = "D:\\AS3\\Raw"
# extraction_path = "D:\\AS3\\temp"

# # review_pkls_path = "C:\\Big Data\\A3\\Data\\review_pkl" #path of the review pkl files
# # meta_pkls_path = "C:\\Big Data\\A3\\Data\\meta_pkl"     #path of the meta pkl files

## Function to extract the raw `.tar.bz2` archives (the extracted content contains the `.arrow` files)

In [4]:
def _safe_tar_members(tar, extract_dir):
    """Yield only the archive members whose destination stays inside ``extract_dir``.

    Members are read lazily from ``tar`` so the (bz2-compressed) archive is
    still processed in a single pass. Only member *names* are checked here;
    link targets are not inspected.
    """
    root = os.path.realpath(extract_dir)
    for member in tar:
        target = os.path.realpath(os.path.join(root, member.name))
        try:
            inside = os.path.commonpath([root, target]) == root
        except ValueError:
            # commonpath refuses to compare paths on different drives.
            inside = False
        if not inside:
            print(f"Skipping unsafe archive member: {member.name}")
            continue
        yield member


def extract_tar_bz2(tar_path, extract_dir):
    """Extract a ``.tar.bz2`` archive into ``extract_dir``.

    Problems are reported with ``print`` rather than raised, so the caller can
    continue with whatever was extracted.

    Parameters
    ----------
    tar_path : str or os.PathLike
        Path of the archive. It must exist and end with ``.tar.bz2``.
    extract_dir : str or os.PathLike
        Folder that receives the extracted files.

    Returns
    -------
    None
    """
    tar_path = os.fspath(tar_path)  # allow pathlib.Path as well as str
    if not os.path.exists(tar_path):
        print(f"Error: File {tar_path} does not exist.")
        return
    if not tar_path.endswith(".tar.bz2"):
        print(f"Error: File {tar_path} is not a .tar.bz2 file.")
        return

    try:
        with tarfile.open(tar_path, "r:bz2") as tar:
            print(f"Extracting {tar_path} to {extract_dir}")
            # Entries such as "../x" or absolute paths would otherwise be
            # written outside extract_dir, so they are filtered out.
            tar.extractall(path=extract_dir, members=_safe_tar_members(tar, extract_dir))
    except Exception as e:
        print(f"Error during extraction: {e}")


## Preprocess category: extracts the archives into a temporary folder (removed afterwards to limit disk usage) and writes the rows out as Parquet batches



In [5]:
import os
import shutil
from datasets import load_dataset
from pathlib import Path
import pandas as pd

def _write_parquet_batch(rows, out_path, category, batch_num):
    """Write a list of row dicts to ``<out_path>/<category>_batch_<batch_num>.parquet``."""
    df = pd.DataFrame(rows)
    df.to_parquet(os.path.join(out_path, f"{category}_batch_{batch_num}.parquet"), index=False)


def preprocess_category(review_tar_path, meta_tar_path, output_folder, category, batch_size=1000):
    """Extract one category's archives and convert the Arrow files to Parquet batches.

    Both archives are extracted into ``<output_folder>/temp_extract/<category>``.
    Every ``*.arrow`` file found there is streamed row by row and written in
    batches to ``<output_folder>/meta_parquet`` or
    ``<output_folder>/reviews_parquet``. The temporary extraction folder is
    removed at the end.

    Parameters
    ----------
    review_tar_path, meta_tar_path : str
        Paths of the review and meta ``.tar.bz2`` archives.
    output_folder : str
        Folder that receives the ``*_parquet`` sub-folders and the temporary
        extraction folder.
    category : str
        Category name; used in the temp folder name and the batch file names.
    batch_size : int, default 1000
        Number of rows buffered before a Parquet file is written.

    Returns
    -------
    None

    Notes
    -----
    An error while processing one Arrow file is printed and the remaining
    files are still processed.
    """
    temp_path = os.path.join(output_folder, "temp_extract", category)
    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)

    print(f"Extracting tar files for {category}...")
    extract_tar_bz2(review_tar_path, temp_path)
    extract_tar_bz2(meta_tar_path, temp_path)

    arrow_files = list(Path(temp_path).rglob("*.arrow"))
    print(f"Found {len(arrow_files)} Arrow files")

    # batch_num is shared by all Arrow files so that file names never repeat.
    batch_num = 0
    total_rows = 0

    for arrow_file in arrow_files:
        try:
            # Classify on the path *inside* the extraction folder only, so a
            # "meta" substring in output_folder cannot misroute review files.
            relative_name = str(arrow_file.relative_to(temp_path)).lower()
            is_meta = "meta" in relative_name
            folder_name = "meta" if is_meta else "reviews"
            out_path = os.path.join(output_folder, f"{folder_name}_parquet")
            os.makedirs(out_path, exist_ok=True)

            dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

            batch = []
            # Review de-duplication is per Arrow file: the set is reset for
            # every file and grows with the number of distinct reviews in it.
            seen_keys = set()

            for row in dataset:
                if not row:
                    continue
                if not is_meta:
                    key = (row.get("user_id"), row.get("asin"), row.get("text"))
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)
                batch.append(row)

                if len(batch) >= batch_size:
                    _write_parquet_batch(batch, out_path, category, batch_num)
                    print(f"Saved batch {batch_num} ({len(batch)} rows)")
                    total_rows += len(batch)
                    batch = []
                    batch_num += 1

            if batch:
                _write_parquet_batch(batch, out_path, category, batch_num)
                print(f"Saved final batch {batch_num} ({len(batch)} rows)")
                total_rows += len(batch)
                # Advance the counter here too; otherwise the next Arrow file
                # of the same kind would overwrite this partial batch.
                batch_num += 1

        except Exception as e:
            print(f"Error processing {arrow_file.name}: {e}")

    shutil.rmtree(temp_path)
    print(f"Temp folder removed: {temp_path}")
    print(f"Wrote {total_rows} rows in {batch_num} batches for {category}")


Meta and Review parsing

In [6]:

def convert_to_dd(folder, category):
    """Load a category's Parquet batches from ``folder`` as one Dask DataFrame.

    Parameters
    ----------
    folder : str
        Folder containing the ``.parquet`` batch files.
    category : str
        Only files whose name contains this text (case-insensitive) are loaded.

    Returns
    -------
    dask.dataframe.DataFrame or None
        ``None`` when the folder does not exist or holds no matching file.
    """
    # A missing folder (e.g. no batch was ever written) is treated the same
    # as an empty one instead of raising from os.listdir.
    if not os.path.isdir(folder):
        print(f"No parquet files found (missing folder: {folder})")
        return None

    wanted = category.lower()
    # Sorted so the file order does not depend on the directory listing order.
    files = sorted(
        os.path.join(folder, f) for f in os.listdir(folder)
        if f.endswith(".parquet") and wanted in f.lower()
    )
    if not files:
        print("No parquet files found")
        return None
    df = dd.read_parquet(files)
    print(f"Loaded {len(files)} files into Dask DataFrame")
    return df


Dealing with the brand

In [7]:
def extract_brand(details, store):
    """Pick a brand name for a product.

    Order of preference:
    1. a truthy ``details["brand"]`` when ``details`` is a dict,
    2. ``store`` when it is a string that is not blank,
    3. the literal ``"Unknown"``.
    """
    try:
        brand = details.get("brand") if isinstance(details, dict) else None
        if brand:
            return brand
    except Exception:
        # Any failure while inspecting details falls through to the store.
        pass

    store_is_usable = isinstance(store, str) and bool(store.strip())
    return store if store_is_usable else "Unknown"

Clean data

In [8]:
def clean_data_dask(category, review_df, meta_df, output_dir=r"D:\AS3\Cleaned"):
    """Merge, filter and enrich one category's reviews, then save them as Parquet.

    Steps: left-merge reviews with meta on ``parent_asin``; keep ratings in
    [1, 5] and non-blank text; derive ``brand``; drop duplicate
    (user_id, asin, text) rows; add ``review_length`` and ``year``; keep only
    the columns of interest; write ``<output_dir>/<category>_cleaned.parquet``.

    Parameters
    ----------
    category : str
        Category name, used for the output file name.
    review_df, meta_df : dask.dataframe.DataFrame
        Review and meta frames; both must contain ``parent_asin``.
    output_dir : str, default r"D:\\AS3\\Cleaned"
        Folder that receives the cleaned Parquet output.

    Returns
    -------
    dask.dataframe.DataFrame
        The (lazy) cleaned frame that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Columns this function reads from the review side.
    review_side = {
        "user_id", "asin", "rating", "text",
        "verified_purchase", "helpful_vote", "timestamp",
    }
    # A name present in both frames would otherwise come out of the merge as
    # "<name>_x"/"<name>_y" and silently drop out of the final selection.
    # NOTE(review): assumes a shared name that is not review-side (e.g.
    # "title") should resolve to the meta value - confirm.
    shared = (set(review_df.columns) & set(meta_df.columns)) - {"parent_asin"}
    if shared:
        review_df = review_df.rename(
            columns={c: f"{c}_review" for c in shared if c not in review_side}
        )
        meta_df = meta_df.rename(
            columns={c: f"{c}_meta" for c in shared if c in review_side}
        )

    print("Merging review and meta on 'parent_asin'")
    merged = dd.merge(review_df, meta_df, on="parent_asin", how="left")

    if "rating" in merged.columns:
        merged = merged[merged["rating"].between(1, 5)]
    if "text" in merged.columns:
        merged = merged[merged["text"].notnull() & (merged["text"].str.strip() != "")]

    def safe_extract_brand(details, store):
        # Best effort: a row that cannot be inspected gets "Unknown".
        try:
            return extract_brand(details, store)
        except Exception:
            return "Unknown"

    def brands_for_partition(df):
        # Built explicitly as a Series: DataFrame.apply(axis=1) returns a
        # DataFrame for an empty partition, which breaks the assignment.
        details = df["details"] if "details" in df.columns else [None] * len(df)
        stores = df["store"] if "store" in df.columns else [None] * len(df)
        brands = [safe_extract_brand(d, s) for d, s in zip(details, stores)]
        return pd.Series(brands, index=df.index, dtype="object", name="brand")

    print("Extracting brand...")
    merged["brand"] = merged.map_partitions(brands_for_partition, meta=("brand", "object"))
    merged["brand"] = merged["brand"].fillna("Unknown")
    print("Dropping duplicates...")
    merged = merged.drop_duplicates(subset=["user_id", "asin", "text"])

    if "text" in merged.columns:
        # Number of whitespace-separated tokens in the review text.
        merged["review_length"] = merged["text"].str.split().map(lambda x: len(x) if x else 0, meta=("review_length", "int"))
    if "timestamp" in merged.columns:
        # The timestamp is parsed as milliseconds; unparsable values become NaT.
        merged["year"] = dd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    necessary_columns = [
        "user_id", "asin", "parent_asin", "rating", "text", "verified_purchase",
        "helpful_vote", "review_length", "year", "brand", "main_category",
        "title", "average_rating", "rating_number", "price"
    ]
    # Columns that are absent from the merged frame are skipped.
    merged = merged[[col for col in necessary_columns if col in merged.columns]]

    output_file = os.path.join(output_dir, f"{category}_cleaned.parquet")
    merged.to_parquet(output_file, compression="snappy", write_index=False)
    print(f"Cleaned data saved to {output_file}")
    return merged


## Define Categories that will be cleaned

In [9]:
# Categories to preprocess and clean, one per line, in processing order.
categories = """
Office_Products
Patio_Lawn_and_Garden
Pet_Supplies
Sports_and_Outdoors
Subscription_Boxes
Tools_and_Home_Improvement
Toys_and_Games
Video_Games
Unknown
""".split()


## Running Preprocess then Clean_Data for the Categories defined above

In [None]:
import os
import shutil
import gc
from pathlib import Path

# base_dir holds the raw review/meta archives. Intermediate Parquet batches go
# to <base_dir>/output_folder and are deleted again after every category.
base_dir = r"D:\AS3\Raw"
raw_dir = os.path.join(base_dir, "raw_files")  # not used by this cell
output_dir = os.path.join(base_dir, "output_folder")

for category in categories:
    print(f"\n=== Processing category: {category} ===")

    review_tar = os.path.join(base_dir, f"raw_review_{category}.tar.bz2")
    meta_tar = os.path.join(base_dir, f"raw_meta_{category}.tar.bz2")

    # Check both archives up front: otherwise a missing meta archive is only
    # noticed after every review batch has already been written.
    missing_archives = [p for p in (review_tar, meta_tar) if not os.path.exists(p)]
    if missing_archives:
        print(f"Skipping {category}: missing archive(s): {missing_archives}")
        continue

    try:
        preprocess_category(review_tar, meta_tar, output_dir, category)

        review_df = convert_to_dd(os.path.join(output_dir, "reviews_parquet"), category)
        meta_df = convert_to_dd(os.path.join(output_dir, "meta_parquet"), category)

        if review_df is not None and meta_df is not None:
            cleaned = clean_data_dask(category, review_df, meta_df)
            print(f"Cleaned {category}")
        else:
            print(f"Nothing cleaned for {category}: review or meta batches are missing")
    except Exception as e:
        print(f"Error while processing {category}: {e}")

    finally:
        # Free memory and remove the intermediate folders before the next
        # category, whether or not this one succeeded.
        gc.collect()
        for sub in ["reviews_parquet", "meta_parquet", "temp_extract"]:
            path = os.path.join(output_dir, sub)
            if os.path.exists(path):
                try:
                    shutil.rmtree(path)
                    print(f"Deleted: {path}")
                except Exception as e:
                    print(f"Couldn't delete {path}: {e}")



=== Processing category: Office_Products ===
Extracting tar files for Office_Products...
Extracting D:\AS3\Raw\raw_review_Office_Products.tar.bz2 to D:\AS3\Raw\output_folder\temp_extract\Office_Products
Extracting D:\AS3\Raw\raw_meta_Office_Products.tar.bz2 to D:\AS3\Raw\output_folder\temp_extract\Office_Products
Found 13 Arrow files
Saved batch 0 (1000 rows)
Saved batch 1 (1000 rows)
Saved batch 2 (1000 rows)
Saved batch 3 (1000 rows)
Saved batch 4 (1000 rows)
Saved batch 5 (1000 rows)
Saved batch 6 (1000 rows)
Saved batch 7 (1000 rows)
Saved batch 8 (1000 rows)
Saved batch 9 (1000 rows)
Saved batch 10 (1000 rows)
Saved batch 11 (1000 rows)
Saved batch 12 (1000 rows)
Saved batch 13 (1000 rows)
Saved batch 14 (1000 rows)
Saved batch 15 (1000 rows)
Saved batch 16 (1000 rows)
Saved batch 17 (1000 rows)
Saved batch 18 (1000 rows)
Saved batch 19 (1000 rows)
Saved batch 20 (1000 rows)
Saved batch 21 (1000 rows)
Saved batch 22 (1000 rows)
Saved batch 23 (1000 rows)
Saved batch 24 (1000 rows