# COMP 3610 – A3

- Zidane Timothy, Maia Neptune, Christophe Gittens

In [1]:
# import findspark
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil

import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np


  from pandas.core import (


In [2]:
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json
import dask.dataframe as dd
import tarfile
import os

In [3]:
from dask.distributed import Client
client = Client()
print(client)


<Client: 'tcp://127.0.0.1:45621' processes=4 threads=4, memory=15.62 GiB>


## Function for extraction of RAW .tar files, creates .arrow files

In [4]:
def extract_tar_bz2(tar_path, extract_dir):
    if not os.path.exists(tar_path):
        print(f"Error: File {tar_path} does not exist.")
        return
    if not tar_path.endswith(".tar.bz2"):
        print(f"Error: File {tar_path} is not a .tar.bz2 file.")
        return

    try:
        with tarfile.open(tar_path, "r:bz2") as tar:
            print(f"Extracting {tar_path} to {extract_dir}")
            tar.extractall(path=extract_dir)
    except Exception as e:
        print(f"Error during extraction: {e}")


## Preprocess Category: Creates a temp path for management of Disk Storage <br>



In [5]:
import os
import shutil
from datasets import load_dataset
from pathlib import Path
import pandas as pd

def preprocess_category(review_tar_path, meta_tar_path, output_folder, category, batch_size=1000):
    temp_path = os.path.join(output_folder, "temp_extract", category)
    os.makedirs(temp_path, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)

    print(f"Extracting tar files for {category}...")
    extract_tar_bz2(review_tar_path, temp_path)
    extract_tar_bz2(meta_tar_path, temp_path)

    arrow_files = list(Path(temp_path).rglob("*.arrow"))
    print(f"Found {len(arrow_files)} Arrow files")

    batch_num = 0
    total_rows = 0

    for arrow_file in arrow_files:
        try:
            is_meta = "meta" in str(arrow_file).lower()
            folder_name = "meta" if is_meta else "reviews"
            out_path = os.path.join(output_folder, f"{folder_name}_parquet")
            os.makedirs(out_path, exist_ok=True)

            dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

            batch = []
            seen_keys = set()

            for row in dataset:
                if not row:
                    continue
                if not is_meta:
                    key = (row.get("user_id"), row.get("asin"), row.get("text"))
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)
                batch.append(row)

                if len(batch) >= batch_size:
                    df = pd.DataFrame(batch)
                    df.to_parquet(os.path.join(out_path, f"{category}_batch_{batch_num}.parquet"), index=False)
                    print(f"Saved batch {batch_num} ({len(batch)} rows)")
                    batch = []
                    batch_num += 1
                    total_rows += 1

            if batch:
                df = pd.DataFrame(batch)
                df.to_parquet(os.path.join(out_path, f"{category}_batch_{batch_num}.parquet"), index=False)
                print(f"Saved final batch {batch_num} ({len(batch)} rows)")

        except Exception as e:
            print(f"Error processing {arrow_file.name}: {e}")

    shutil.rmtree(temp_path)
    print(f"Temp folder removed: {temp_path}")


Meta and Review parsing

In [6]:

def convert_to_dd(folder, category):
    files = [os.path.join(folder, f) for f in os.listdir(folder)
             if f.endswith(".parquet") and category.lower() in f.lower()]
    if not files:
        print("No parquet files found")
        return None
    df = dd.read_parquet(files)
    print(f"Loaded {len(files)} files into Dask DataFrame")
    return df


Dealing with the brand

In [7]:
def extract_brand(details, store):
    try:
        if isinstance(details, dict) and "brand" in details and details["brand"]:
            return details["brand"]
    except Exception:
        pass
    if isinstance(store, str) and store.strip():
        return store
    return "Unknown"

Clean data

In [8]:
import dask.dataframe as dd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import subprocess
from dask.diagnostics import ProgressBar
from io import BytesIO


def clean_data_dask(category, review_df, meta_df):
    print("Merging review and meta on 'parent_asin'")
    merged = dd.merge(review_df, meta_df, on="parent_asin", how="left")

    print("Filtering bad data")
    if "rating" in merged.columns:
        merged = merged[merged["rating"].between(1, 5)]
    if "text" in merged.columns:
        merged = merged[merged["text"].notnull() & (merged["text"].str.strip() != "")]

    print("Extracting brand")
    def fast_extract_brand(details, store):
        if isinstance(details, dict) and details.get("brand"):
            return details["brand"]
        elif isinstance(store, str) and store.strip():
            return store
        return "Unknown"

    merged["brand"] = merged.map_partitions(
        lambda df: df.apply(lambda row: fast_extract_brand(row.get("details"), row.get("store")), axis=1),
        meta=("brand", "object")
    )

    print("Computing derived columns")
    if "text" in merged.columns:
        merged["review_length"] = merged["text"].str.split().map(
            lambda x: len(x) if x else 0, meta=("review_length", "int")
        )
    if "timestamp" in merged.columns:
        merged["year"] = dd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    print("Selecting necessary columns")
    necessary_columns = [
        "user_id", "asin", "parent_asin", "rating", "text", "verified_purchase",
        "helpful_vote", "review_length", "year", "brand", "main_category",
        "title", "average_rating", "rating_number", "price"
    ]
    merged = merged[[col for col in necessary_columns if col in merged.columns]]

    print("Casting columns to match the expected schema")
    dtype_mapping = {
        "verified_purchase": "string",
        "helpful_vote": "float64",
        "rating_number": "float64",
        "price": "string",
    }
    merged = merged.astype(dtype_mapping)

    print("Repartitioning to a single partition for saving as one file")
    merged = merged.repartition(npartitions=1)

    print("Writing Parquet data to memory")
    buffer = BytesIO()
    with ProgressBar():
        merged.compute().to_parquet(buffer, engine="pyarrow", compression="snappy", index=False)

    print("Uploading Parquet data to Google Drive")
    output_file = f"{category}_cleaned.parquet"
    try:
        # Write the buffer to a temporary file for rclone
        temp_file = f"/tmp/{output_file}"
        with open(temp_file, "wb") as f:
            f.write(buffer.getvalue())

        # Use rclone to upload the file
        subprocess.run(
            [
                "rclone", "copy", temp_file,
                "googdrive:", "--drive-shared-with-me", "--drive-root-folder-id", "1wgVfpqS9BJE2IvTN-BJSbT3kEbhMhaOC"
            ],
            check=True
        )
        print(f"Uploaded {output_file} to Google Drive folder: cleaned")
    except subprocess.CalledProcessError as e:
        print(f"Error uploading to Google Drive: {e}")
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return merged

## Define Categories that will be cleaned

In [9]:
# categories = [
#     "Grocery_and_Gourmet_Food",
#     "Handmade_Products",
#     "Health_and_Household",
#     "Home_and_Kitchen",
#     "Industrial_and_Scientific",
#     "Kindle_Store",
#     "Magazine_Subscriptions",
#     "Movies_and_TV",
#     "Musical_Instruments"
# ]

categories = [
    "Subscription_Boxes"
]


## Running Preprocess then Clean_Data for the Categories defined above

In [10]:
import os
import shutil
import gc
from pathlib import Path

base_dir = r"/root"
raw_dir = os.path.join(base_dir, "Data")
output_dir = os.path.join(base_dir, "output_folder")

# Store cleaned Dask DataFrames for each category
cleaned_all_categories = {}

for category in categories:
    print(f"\n=== Processing category: {category} ===")

    review_tar = os.path.join(raw_dir, f"raw_review_{category}.tar.bz2")
    meta_tar = os.path.join(raw_dir, f"raw_meta_{category}.tar.bz2")

    try:
        preprocess_category(review_tar, meta_tar, output_dir, category)

        review_df = convert_to_dd(os.path.join(output_dir, "reviews_parquet"), category)
        meta_df = convert_to_dd(os.path.join(output_dir, "meta_parquet"), category)

        if review_df is not None and meta_df is not None:
            cleaned = clean_data_dask(category, review_df, meta_df)
            cleaned_all_categories[category] = cleaned
            print(f"Cleaned and stored: {category}")
    except Exception as e:
        print(f"Error while processing {category}: {e}")

    finally:
        gc.collect()
        for sub in ["reviews_parquet", "meta_parquet", "temp_extract"]:
            path = os.path.join(output_dir, sub)
            if os.path.exists(path):
                try:
                    shutil.rmtree(path)
                    print(f"Deleted: {path}")
                except Exception as e:
                    print(f"Couldn't delete {path}: {e}")



=== Processing category: Subscription_Boxes ===
Extracting tar files for Subscription_Boxes...
Extracting /root/Data/raw_review_Subscription_Boxes.tar.bz2 to /root/output_folder/temp_extract/Subscription_Boxes


Extracting /root/Data/raw_meta_Subscription_Boxes.tar.bz2 to /root/output_folder/temp_extract/Subscription_Boxes
Found 2 Arrow files
Saved batch 0 (1000 rows)
Saved batch 1 (1000 rows)
Saved batch 2 (1000 rows)
Saved batch 3 (1000 rows)
Saved batch 4 (1000 rows)
Saved batch 5 (1000 rows)
Saved batch 6 (1000 rows)
Saved batch 7 (1000 rows)
Saved batch 8 (1000 rows)
Saved batch 9 (1000 rows)
Saved batch 10 (1000 rows)
Saved batch 11 (1000 rows)
Saved batch 12 (1000 rows)
Saved batch 13 (1000 rows)
Saved batch 14 (1000 rows)
Saved batch 15 (1000 rows)
Saved final batch 16 (16 rows)
Saved final batch 16 (641 rows)
Temp folder removed: /root/output_folder/temp_extract/Subscription_Boxes
Loaded 17 files into Dask DataFrame
Loaded 1 files into Dask DataFrame
Merging review and meta on 'parent_asin'
Filtering bad data
Extracting brand
Computing derived columns
Selecting necessary columns
Casting columns to match the expected schema
Repartitioning to a single partition for saving as one file
Wr

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


Uploading Parquet data to Google Drive
Uploaded Subscription_Boxes_cleaned.parquet to Google Drive folder: cleaned
Cleaned and stored: Subscription_Boxes
Deleted: /root/output_folder/reviews_parquet
Deleted: /root/output_folder/meta_parquet
Deleted: /root/output_folder/temp_extract


In [11]:
combined_cleaned_df = dd.concat(cleaned_all_categories.values())

TypeError: dfs must be a list of DataFrames/Series objects

## EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import numpy as np

In [None]:
print("Running Dask-compatible EDA...")

  # Ensure ddf is defined before using it
if combined_cleaned_df is not None:
    # Star Rating Histogram (1–5)
    rating_counts =  combined_cleaned_df["rating"].value_counts().compute().sort_index()
    rating_counts.plot(kind="bar")
    plt.xlabel("Star Rating")
    plt.ylabel("Number of Reviews")
    plt.title("Distribution of Star Ratings")
    plt.show()
else:
    print("Error: 'ddf' is not defined. Please define 'ddf' before running this cell.")

In [None]:
if combined_cleaned_df is not None:
    top_categories = combined_cleaned_df["main_category"].value_counts().compute().head(10)
    top_categories.plot(kind="bar")
    plt.xlabel("Main Category")
    plt.ylabel("Review Count")
    plt.title("Top 10 Categories by Review Count")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
if "brand" in cleaned.columns:
    top_brands = combined_cleaned_df[combined_cleaned_df["brand"] != "Unknown"]["brand"].value_counts().compute().head(10)
    top_brands.plot(kind="bar")
    plt.xlabel("Brand")
    plt.ylabel("Review Count")
    plt.title("Top 10 Brands by Review Count")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
if "year" in combined_cleaned_df.columns:
    yearly_avg = combined_cleaned_df.groupby("year")["rating"].mean().compute()
    yearly_avg.plot(kind="line", marker='o')
    plt.xlabel("Year")
    plt.ylabel("Average Rating")
    plt.title("Average Rating Over Time")
    plt.grid(True)
    plt.show()

In [None]:
if "review_length" in combined_cleaned_df.columns:
    corr = combined_cleaned_df[["review_length", "rating"]].corr().compute()
    print(f"Pearson correlation between review length and rating: {corr.loc['review_length', 'rating']:.4f}")