# COMP 3610 – A3

- Zidane Timothy, Maia Neptune, Christophe Gittens

In [1]:
# %pip install pyspark
# %pip install findspark
# %pip install -q gdown
# %pip install pandas
# %pip install matplotlib
# %pip install seaborn
# %pip install pyarrow
# %pip install setuptools

In [None]:
# import pyspark
# from pyspark.sql import SparkSession
# from pyspark.sql import Row
# from pyspark.sql.types import *
# from pyspark.sql.functions import *
# from pyspark.sql import functions as F
# from pyspark.sql.window import Window
# # import `DenseVector`
# from pyspark.ml.linalg import DenseVector

# # import `StandardScaler`
# from pyspark.ml.feature import StandardScaler


# # sudo apt install python3-distutils 

In [21]:
from bigdata_a3_utils.py import *
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil
import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np

import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json

ModuleNotFoundError: No module named 'bigdata_a3_utils.py'; 'bigdata_a3_utils' is not a package

In [20]:
tar_folder = 'root/Data'
output_folder = 'root/output_folder'
os.makedirs(output_folder, exist_ok=True)

In [8]:
def extract_tar_bz2(tar_path, extract_dir):
    if not os.path.exists(tar_path):
        print(f"Error: File {tar_path} does not exist.")
        return
    if not tar_path.endswith(".tar.bz2"):
        print(f"Error: File {tar_path} is not a .tar.bz2 file.")
        return

    try:
        with tarfile.open(tar_path, "r:bz2") as tar:
            print(f"Extracting {tar_path} to {extract_dir}")
            tar.extractall(path=extract_dir)
    except Exception as e:
        print(f"Error during extraction: {e}")

In [None]:
def preprocess_category(review_tar_path, meta_tar_path, output_folder, category, batch_size=1000):
    temp_path = "Data/temp_extract"
    os.makedirs(output_folder, exist_ok=True)

    try:
        print("Extracting tar files...")
        extract_tar_bz2(review_tar_path, temp_path)
        extract_tar_bz2(meta_tar_path, temp_path)

        arrow_files = list(Path(temp_path).rglob("*.arrow"))
        print(f"Found {len(arrow_files)} Arrow files")

        if not arrow_files:
            print("No Arrow files found, skipping processing.")
            return

        for arrow_file in arrow_files:
            print(f"Processing: {arrow_file.name}")
            try:
                is_meta = "meta" in str(arrow_file).lower()
                folder_name = "meta" if is_meta else "reviews"

                pkl_output_path = os.path.join(output_folder, f"{folder_name}_pkl")
                os.makedirs(pkl_output_path, exist_ok=True)

                dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)
                batch = []
                seen_keys = set()
                batch_num = 0
                row_count = 0

                for i, row in enumerate(dataset):
                    if not row:
                        continue

                    if not is_meta:
                        key = (row.get("user_id"), row.get("asin"), row.get("text"))
                        if None in key:
                            print(f"Missing key fields in row {i}: {key}")
                            continue
                        if key in seen_keys:
                            continue
                        seen_keys.add(key)

                    batch.append(row)
                    row_count += 1

                    if len(batch) >= batch_size:
                        df = pd.DataFrame(batch)
                        output_file = os.path.join(pkl_output_path, f"{category}_batch_{batch_num}.pkl")
                        df.to_pickle(output_file)
                        print(f"Saved batch {batch_num} with {len(batch)} rows to {output_file}")
                        batch = []
                        batch_num += 1

                if batch:
                    df = pd.DataFrame(batch)
                    output_file = os.path.join(pkl_output_path, f"{category}_batch_{batch_num}.pkl")
                    df.to_pickle(output_file)
                    print(f"Saved final batch {batch_num} with {len(batch)} rows to {output_file}")

                print(f"Finished processing {arrow_file.name} with {row_count} total rows (after dedup)")

            except Exception as e:
                print(f"Error processing {arrow_file.name}: {e}")

    except Exception as e:
        print(f"Top-level error during preprocessing: {e}")

    finally:
        if os.path.exists(temp_path):
            shutil.rmtree(temp_path)
            print("🧹 Temp folder removed.")

    print("All done!")

Calling fn to preprocess for a category

Run for one

In [10]:
# preprocess_category(
#     r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_meta_All_Beauty.tar.bz2",
#     r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_review_All_Beauty.tar.bz2",
#     "output_folder", category="All_Beauty"
# )

Meta and Review parsing

In [11]:
def convert_to_meta_df(folder):
    df_m = []

    for fname in sorted(os.listdir(folder)):
        if fname.endswith(".pkl"):
            try:
                file_path = os.path.join(folder, fname)
                meta_df = pd.read_pickle(file_path)
                print(f"{fname} loaded: shape = {meta_df.shape}")
                df_m.append(meta_df)
            except Exception as e:
                print(f"Error in {fname}:", e)

    if df_m:
        meta_df = pd.concat(df_m, ignore_index=True)
        print("All .pkl files loaded. Final shape:", meta_df.shape)
        
    print("Removed meta pkl folder")
    return meta_df
 

In [12]:
def convert_to_rev_df(folder):
    df_r = []
    for fname in sorted(os.listdir(folder)):
        if fname.endswith(".pkl"):
            try:
                file_path = os.path.join(folder, fname)
                review_df = pd.read_pickle(file_path)
                print(f"{fname} loaded: shape = {review_df.shape}")
                df_r.append(review_df)
            except Exception as e:
                print(f"Error in {fname}:", e)

    if df_r:
        review_df = pd.concat(df_r, ignore_index=True)
        print("All .pkl files loaded. Final shape:", review_df.shape)
        
    print("Removed reviews pkl folder")
    return review_df


Merging

In [13]:
# merged_df = pd.merge(
#     review_df,
#     meta_df,
#     on="parent_asin",
#     how="inner"
# )
# merged_df

Clean data

Dealing with the brand

In [14]:
def extract_brand(details, store):
    try:
        if isinstance(details, dict) and "brand" in details and details["brand"]:
            return details["brand"]
    except Exception:
        pass
    if isinstance(store, str) and store.strip():
        return store
    return "Unknown"

In [15]:
def clean_data(category, review_df, meta_df):
    output_dir = r"D:/UWI/Year 3/Sem 2/COMP3610-Big-Data/Assignments/Assignment#3/A3/datasets/output_folder/cleaned"
    os.makedirs(output_dir, exist_ok=True)
    
    print("Merging review and meta...")
    merged_df = pd.merge(review_df, meta_df, on="parent_asin", how="inner")
    print("Merged")

    print("Filtering invalid ratings...")
    merged_df = merged_df[merged_df["rating"].between(1.0, 5.0, inclusive="both")]

    print("Dropping empty review text...")
    merged = merged_df[merged_df["text"].notna() & (merged_df["text"].str.strip() != "")]

    print("Extracting brand from metadata...")
    merged["brand"] = merged.apply(lambda row: extract_brand(row.get("details"), row.get("store")), axis=1)

    print("Removing duplicate reviews...")
    merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)

    print("Computing review length...")
    merged["review_length"] = merged["text"].str.split().apply(len)

    print("Extracting year from timestamp...")
    merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    output_file = os.path.join(output_dir, f"{category}_cleaned_merged.pkl.bz2")
    merged.to_pickle(output_file, compression="bz2")

    print(" All cleaning steps completed.")
    
    test = merged
    return test

In [16]:
# cleaned = clean_data()

Run for all

In [17]:
# categories = ["Clothing_Shoes_and_Jewelry"]
categories = ["All_Beauty"]

In [18]:
from pathlib import Path

for category in categories:
    base_path = r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets"
    meta_path = os.path.join(base_path, f"raw_meta_{category}.tar.bz2")
    review_path = os.path.join(base_path, f"raw_review_{category}.tar.bz2")

    # review pkled folder
    rev_pkl  = r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\Code\output_folder\reviews_pkl" # Make sure this is the folder with review .pkl batches
    meta_pkl = r"D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\Code\output_folder\meta_pkl"  # Make sure this is the folder with meta .pkl batches

    preprocess_category(meta_path, review_path, "output_folder", category)
    review_df = convert_to_rev_df(rev_pkl)
    meta_df = convert_to_meta_df(meta_pkl)
    cleaned = clean_data(category, review_df, meta_df)
    print(cleaned)
    del cleaned
    del meta_df
    del review_df

    # remove the review and meta pkl files that aren't compressed
    if os.path.exists(rev_pkl):
        shutil.rmtree(rev_pkl)
    else:
        print(f"{rev_pkl} path does not exist")
    
    if os.path.exists(meta_pkl):
        shutil.rmtree(meta_pkl)
    else:
        print(f"{meta_pkl} path does not exist")


Extracting tar files...
Error: File D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_meta_All_Beauty.tar.bz2 does not exist.
Error: File D:\UWI\Year 3\Sem 2\COMP3610-Big-Data\Assignments\Assignment#3\A3\datasets\raw_review_All_Beauty.tar.bz2 does not exist.
Found 0 Arrow files
No Arrow files found, skipping processing.


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\UWI\\Year 3\\Sem 2\\COMP3610-Big-Data\\Assignments\\Assignment#3\\A3\\Code\\output_folder\\reviews_pkl'