# COMP 3610 – A3

- Zidane Timothy, Maia Neptune, Christophe Gittens

In [None]:
import findspark
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil

import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np


In [None]:
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json

## Function that extracts the raw `.tar.bz2` archives (which contain the `.arrow` files)

In [None]:
def extract_tar_bz2(tar_path, extract_dir):
    """Extract a ``.tar.bz2`` archive into ``extract_dir``.

    Problems (missing file, wrong extension, extraction failure) are reported
    with ``print`` rather than raised, so a caller looping over several
    archives can carry on with the next one.

    Args:
        tar_path: Path to the archive, as ``str`` or ``os.PathLike``.
        extract_dir: Directory the archive members are extracted into.

    Returns:
        None.
    """
    # Normalise to ``str`` so that ``pathlib.Path`` arguments work with
    # ``str.endswith`` below instead of raising ``AttributeError``.
    tar_path = os.fspath(tar_path)

    if not os.path.exists(tar_path):
        print(f"Error: File {tar_path} does not exist.")
        return
    if not tar_path.endswith(".tar.bz2"):
        print(f"Error: File {tar_path} is not a .tar.bz2 file.")
        return

    try:
        with tarfile.open(tar_path, "r:bz2") as tar:
            print(f"Extracting {tar_path} to {extract_dir}")
            if hasattr(tarfile, "data_filter"):
                # The "data" filter rejects members that would be written
                # outside ``extract_dir`` (absolute paths, ``..`` components,
                # escaping links). Only available on interpreters that ship
                # extraction filters, hence the feature check.
                tar.extractall(path=extract_dir, filter="data")
            else:
                tar.extractall(path=extract_dir)
    except Exception as e:
        # Best-effort by design: report and let the caller continue.
        print(f"Error during extraction: {e}")

## Preprocess Category: extracts into a temporary folder (removed afterwards) to limit disk usage



In [None]:
import os
import shutil
from datasets import load_dataset
from pathlib import Path
import pyarrow as pa
import pandas as pd

def _write_batch(rows, pkl_dir, category, batch_num):
    """Pickle ``rows`` as one DataFrame batch file and return the file path."""
    batch_path = os.path.join(pkl_dir, f"{category}_batch_{batch_num}.pkl")
    pd.DataFrame(rows).to_pickle(batch_path)
    return batch_path


def preprocess_category(review_tar_path, meta_tar_path, output_folder, category, batch_size=1000):
    """Extract one category's archives and convert its Arrow files to pickled batches.

    Both archives are extracted into ``<output_folder>/temp_extract/<category>``.
    Every ``*.arrow`` file found there is streamed row by row and written as
    DataFrame pickles of at most ``batch_size`` rows into
    ``<output_folder>/meta_pkl`` or ``<output_folder>/reviews_pkl``. The
    temporary extraction folder is removed before returning.

    Args:
        review_tar_path: Path to the review ``.tar.bz2`` archive.
        meta_tar_path: Path to the metadata ``.tar.bz2`` archive.
        output_folder: Folder that receives the temp folder and batch folders.
        category: Category name; used in the temp path and batch file names.
        batch_size: Maximum number of rows per pickled batch.

    Returns:
        None. Progress and per-file errors are reported with ``print``.
    """
    # Temp path inside the output folder, unique per category. Creating it
    # also creates ``output_folder`` itself.
    temp_path = os.path.join(output_folder, "temp_extract", category)
    os.makedirs(temp_path, exist_ok=True)

    try:
        print(f"Extracting tar files for {category}...")

        extract_tar_bz2(review_tar_path, temp_path)
        extract_tar_bz2(meta_tar_path, temp_path)

        arrow_files = list(Path(temp_path).rglob("*.arrow"))
        print(f"Found {len(arrow_files)} Arrow files in {category}")

        # One counter for the whole category so that batch file names never
        # collide when several Arrow files land in the same output folder.
        batch_num = 0

        for arrow_file in arrow_files:
            try:
                # Classify using only the path inside the temp folder, so a
                # parent directory that happens to contain "meta" cannot
                # mislabel every file.
                relative_name = str(arrow_file.relative_to(temp_path)).lower()
                is_meta = "meta" in relative_name
                folder_name = "meta" if is_meta else "reviews"

                pkl_output_path = os.path.join(output_folder, f"{folder_name}_pkl")
                os.makedirs(pkl_output_path, exist_ok=True)

                dataset = load_dataset("arrow", data_files=str(arrow_file), split="train", streaming=True)

                batch = []
                seen_keys = set()
                file_rows = 0
                file_batches = 0

                for row in dataset:
                    if not row:
                        continue

                    if not is_meta:
                        # Drop exact repeat reviews within this Arrow file.
                        key = (row.get("user_id"), row.get("asin"), row.get("text"))
                        if key in seen_keys:
                            continue
                        seen_keys.add(key)

                    batch.append(row)

                    if len(batch) >= batch_size:
                        _write_batch(batch, pkl_output_path, category, batch_num)
                        print(f"Saved batch {batch_num} ({len(batch)} rows)")
                        file_rows += len(batch)
                        file_batches += 1
                        batch_num += 1
                        batch = []

                # Final (partial) batch
                if batch:
                    _write_batch(batch, pkl_output_path, category, batch_num)
                    print(f"Saved final batch {batch_num} ({len(batch)} rows)")
                    file_rows += len(batch)
                    file_batches += 1
                    # Advance the counter so the next Arrow file does not
                    # overwrite this batch.
                    batch_num += 1

                print(f"Finished {arrow_file.name}: saved {file_rows} total rows in {file_batches} batches")

            except Exception as e:
                # Best-effort per file: report and continue with the next one.
                print(f"Error processing {arrow_file.name}: {e}")
    finally:
        # Cleanup runs even if something unexpected escaped above, so the
        # extracted files do not keep occupying disk space.
        if os.path.exists(temp_path):
            shutil.rmtree(temp_path)
            print(f"Temp folder removed: {temp_path}")


Meta and Review parsing

In [None]:
def convert_to_df(folder, category):
    """Load every pickled batch for ``category`` in ``folder`` into one DataFrame.

    A file is selected when its name ends with ``.pkl`` and contains
    ``category`` (case-insensitive). Files are read in sorted name order.
    A file that fails to load is reported with ``print`` and skipped.

    Args:
        folder: Directory holding the ``.pkl`` batch files.
        category: Category name to look for in the file names.

    Returns:
        The concatenation of all loaded batches with a fresh index, or an
        empty ``DataFrame`` when no matching batch could be loaded.

    Raises:
        OSError: If ``folder`` cannot be listed (e.g. it does not exist).
    """
    wanted = category.lower()
    frames = []
    for fname in sorted(os.listdir(folder)):
        if not (fname.endswith(".pkl") and wanted in fname.lower()):
            continue
        try:
            file_path = os.path.join(folder, fname)
            # NOTE: unpickling can execute arbitrary code; only point this at
            # batch files produced by this pipeline.
            batch_df = pd.read_pickle(file_path)
            print(f"{fname} loaded: shape = {batch_df.shape}")
            frames.append(batch_df)
        except Exception as e:
            print(f"Error in {fname}:", e)

    if not frames:
        # Previously this path hit an unbound local variable; return an
        # empty frame so the caller gets a DataFrame in every case.
        print(f"No .pkl files loaded for {category} in {folder}")
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    print("All .pkl files loaded. Final shape:", combined.shape)
    return combined


Dealing with the brand

In [None]:
def extract_brand(details, store):
    """Pick a brand label from a product's ``details`` mapping or its ``store``.

    Preference order:
      1. ``details["brand"]`` when ``details`` is a dict and the value is truthy.
      2. ``store`` (returned as given) when it is a string that is not blank.
      3. The literal ``"Unknown"``.
    """
    try:
        candidate = details.get("brand") if isinstance(details, dict) else None
        if candidate:
            return candidate
    except Exception:
        # Any failure while inspecting ``details`` falls through to ``store``.
        pass

    store_is_usable = isinstance(store, str) and bool(store.strip())
    return store if store_is_usable else "Unknown"

Clean data

In [None]:
def clean_data(category, review_df, meta_df, output_dir=r"C:\Users\maian\Downloads\cleaned files"):
    """Merge reviews with metadata, clean the result and save it as a pickle.

    Steps, in order:
      1. Left-join ``review_df`` with ``meta_df`` on ``parent_asin``.
      2. Keep rows whose ``rating`` lies in [1.0, 5.0] (if the column exists).
      3. Drop rows whose ``text`` is missing or blank (if the column exists).
      4. Add a ``brand`` column via ``extract_brand`` (``"Unknown"`` fallback).
      5. Drop duplicate reviews, keyed on user, product and text.
      6. Add ``review_length`` (whitespace-separated token count of ``text``)
         and ``year`` (from ``timestamp``, read as epoch milliseconds).
      7. Save to ``<output_dir>/<category>_cleaned_merged.pkl.bz2``.

    Args:
        category: Category name; used in the output file name.
        review_df: Review rows; must contain ``parent_asin``.
        meta_df: Product metadata rows; must contain ``parent_asin``.
        output_dir: Folder for the cleaned file. Defaults to the location
            the notebook originally hard-coded.

    Returns:
        The cleaned, merged DataFrame.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("=== Merging review and meta on 'parent_asin' (LEFT JOIN) ===")
    merged = pd.merge(review_df, meta_df, on="parent_asin", how="left")
    print(f"After merge: {len(merged)} rows")

    # invalid ratings
    if "rating" in merged.columns:
        before = len(merged)
        merged = merged[merged["rating"].between(1.0, 5.0, inclusive="both")]
        print(f"After filtering invalid ratings: {len(merged)} rows (dropped {before - len(merged)})")

    # empty review texts
    if "text" in merged.columns:
        before = len(merged)
        has_text = merged["text"].notna() & (merged["text"].str.strip() != "")
        merged = merged[has_text]
        print(f"After dropping empty text: {len(merged)} rows (dropped {before - len(merged)})")

    # Work on an independent frame so the column assignments below do not
    # write into a filtered slice of the merge result.
    merged = merged.copy()

    # brand
    print("Extracting brand from metadata...")
    row_count = len(merged)
    details_values = merged["details"] if "details" in merged.columns else [None] * row_count
    store_values = merged["store"] if "store" in merged.columns else [None] * row_count
    # A plain list keeps this well-defined for an empty frame, where a
    # row-wise ``apply`` does not return a Series.
    merged["brand"] = [
        extract_brand(details, store)
        for details, store in zip(details_values, store_values)
    ]
    # Reassign instead of chained ``inplace=True``, which is deprecated and
    # does not modify the frame under copy-on-write.
    merged["brand"] = merged["brand"].fillna("Unknown")

    # duplicates
    # The batching step keys reviews on "asin"; "product_id" is still
    # honoured when a caller supplies such a column.
    product_col = "product_id" if "product_id" in merged.columns else "asin"
    dedup_cols = ["user_id", product_col, "text"]
    missing_cols = [col for col in dedup_cols if col not in merged.columns]
    before = len(merged)
    if missing_cols:
        # A partial key could delete distinct reviews, so skip instead.
        print(f"Skipping duplicate removal; missing columns: {missing_cols}")
    else:
        merged = merged.drop_duplicates(subset=dedup_cols, keep="first")
        print(f"After removing duplicates: {len(merged)} rows (dropped {before - len(merged)})")

    # derived columns
    if "text" in merged.columns:
        merged["review_length"] = merged["text"].str.split().str.len()

    if "timestamp" in merged.columns:
        merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    # cleaned data
    output_file = os.path.join(output_dir, f"{category}_cleaned_merged.pkl.bz2")
    merged.to_pickle(output_file, compression="bz2")
    print(f"Saved cleaned file to {output_file}")

    return merged


## Define Categories that will be cleaned

In [None]:
# Category names handled by the processing loop. Each name is interpolated
# into the raw archive file names (raw_review_<name>.tar.bz2 and
# raw_meta_<name>.tar.bz2) and into the batch and cleaned output file names.
categories = [
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments"
]


## Run `preprocess_category`, then `clean_data`, for each category defined above

In [None]:
import os
import shutil
import gc
from pathlib import Path

# Folder layout: <base_dir>/raw_files holds the downloaded archives and
# <base_dir>/output_folder receives the intermediate batch folders.
base_dir = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3"
raw_dir = os.path.join(base_dir, "raw_files")
output_dir = os.path.join(base_dir, "output_folder")

# Handle one category at a time; intermediate frames and folders are
# discarded after each category, whether or not it succeeded.
for category in categories:
    print(f"\n=== Processing category: {category} ===")

    # Archives for this category inside raw_files/
    review_tar = os.path.join(raw_dir, f"raw_review_{category}.tar.bz2")
    meta_tar = os.path.join(raw_dir, f"raw_meta_{category}.tar.bz2")

    try:
        # 1) archives -> arrow -> pickled review/meta batches
        preprocess_category(review_tar, meta_tar, output_dir, category)

        # 2) pickled batches -> one DataFrame each
        review_df = convert_to_df(os.path.join(output_dir, "reviews_pkl"), category)
        meta_df = convert_to_df(os.path.join(output_dir, "meta_pkl"), category)

        # 3) merge, clean, save
        cleaned = clean_data(category, review_df, meta_df)
        print(f"Cleaned shape: {cleaned.shape}")

    except Exception as exc:
        # A failing category is reported and the loop moves on to the next.
        print(f"Error while processing {category}: {exc}")

    finally:
        # 4) drop the large frames (if they were created) and reclaim memory
        for frame_name in ("cleaned", "review_df", "meta_df"):
            globals().pop(frame_name, None)
        gc.collect()

        # 5) remove the intermediate folders left behind for this category
        for leftover in ("reviews_pkl", "meta_pkl", "temp_extract"):
            leftover_path = os.path.join(output_dir, leftover)
            if not os.path.exists(leftover_path):
                print(f"Path does not exist: {leftover_path}")
                continue
            try:
                shutil.rmtree(leftover_path)
                print(f"Deleted: {leftover_path}")
            except Exception as exc:
                print(f"Couldn't delete {leftover_path}: {exc}")

