### Imports

In [None]:

import json
import os
import shutil
import tarfile
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import Optional, Union

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

### Files/folders

In [None]:
# Raw inputs: folder holding all the raw review files, and folder holding all
# the raw meta files.
reviews_path = "C:\\Big Data\\A3\\Data\\reviews"
meta_path = "C:\\Big Data\\A3\\Data\\meta"

# Pickled intermediates: folder of review pkl files, and folder of meta pkl files.
review_pkls_path = "C:\\Big Data\\A3\\Data\\review_pkl"
meta_pkls_path = "C:\\Big Data\\A3\\Data\\meta_pkl"

### Categories to process

In [None]:
# Categories that are still left to run.
# NOTE(review): "Hanmade_Products" and "Clothing_Shoes_and_Jewellery" were
# corrected to "Handmade_Products" and "Clothing_Shoes_and_Jewelry", which are
# presumably the spellings used by the Amazon Reviews dataset — confirm that
# the files on disk use the same names.
categories = [
    "Unknown",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Handmade_Products",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Electronics",
]

### Load Files to disk

In [None]:
def load_compressed_dataset(compressed_path: Union[str, Path], extract_dir: Optional[Union[str, Path]] = None, cleanup_after_load: bool = True) -> "Union[Dataset, DatasetDict]":
    """
    Load a dataset from a compressed archive (tar.gz, tar.bz2, or tar.xz).

    The archive is extracted, the dataset directory inside it is located
    (preferably the one named after the archive without its extension), and
    that directory is handed to ``load_from_disk``.

    Args:
        compressed_path: Path to the compressed dataset file
        extract_dir: Directory to extract files to. When omitted, a uniquely
                     named ``temp_<hex>`` directory is created next to the archive.
        cleanup_after_load: Whether to delete the extracted files after loading
                           (only applies to auto-generated temp directories)

    Returns:
        The loaded dataset (Dataset or DatasetDict)

    Raises:
        ValueError: If the file doesn't exist, isn't a supported compressed
                    file, or the extracted archive contains no directory
    """
    compressed_path = Path(compressed_path)

    if not compressed_path.exists():
        raise ValueError(f"File not found: {compressed_path}")

    # A single pass both validates the extension and tells us what to strip
    # to obtain the expected top-level directory name.
    valid_extensions = (".tar.gz", ".tar.bz2", ".tar.xz")
    matched_ext = next(
        (ext for ext in valid_extensions if compressed_path.name.endswith(ext)),
        None,
    )
    if matched_ext is None:
        raise ValueError(f"Expected a compressed tar file (.tar.gz, .tar.bz2, or .tar.xz), got: {compressed_path}")
    dir_name = compressed_path.name[:-len(matched_ext)]

    # create extraction directory
    is_temp_dir = extract_dir is None
    if is_temp_dir:
        extract_dir = compressed_path.parent / f"temp_{uuid.uuid4().hex}"
    else:
        extract_dir = Path(extract_dir)

    extract_dir.mkdir(parents=True, exist_ok=True)

    try:
        # extract archive
        print(f"Extracting {compressed_path} to {extract_dir}...")
        with tarfile.open(compressed_path, "r:*") as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter refuses members that would escape the
                # destination (absolute paths, "..", outside links); only
                # interpreters that ship extraction filters accept the keyword.
                tar.extractall(path=extract_dir, filter="data")
            else:
                tar.extractall(path=extract_dir)

        # dataset should be in a subdirectory matching the original directory name
        dataset_dir = extract_dir / dir_name

        if not dataset_dir.exists():
            # Fall back to any extracted directory; sorted so the choice does
            # not depend on the filesystem's listing order.
            extracted_folders = sorted(f for f in extract_dir.iterdir() if f.is_dir())
            if not extracted_folders:
                raise ValueError(f"No folders found in extracted archive: {compressed_path}")
            dataset_dir = extracted_folders[0]
            print(f"Using extracted directory: {dataset_dir}")

        # load dataset
        print(f"Loading dataset from {dataset_dir}...")
        return load_from_disk(str(dataset_dir))

    finally:
        # clean up only if it's a temporary directory we created AND cleanup is requested
        # NOTE(review): load_from_disk may memory-map the extracted files; if so,
        # removing them here could invalidate the returned dataset (or fail on
        # Windows) — confirm, or call with cleanup_after_load=False.
        if cleanup_after_load and is_temp_dir and extract_dir.exists():
            print(f"Cleaning up temporary directory: {extract_dir}")
            shutil.rmtree(extract_dir)

In [None]:
def load_raw_rev_and_meta(review_path, meta_path):
    """Print a pair of "<path> loading" lines once per element of ``review_path``.

    Nothing is actually loaded: the function only iterates ``review_path`` and,
    for each element, prints the whole ``review_path`` and ``meta_path`` values.
    It returns ``None``.
    """
    for _ in review_path:
        print(f"{review_path} loading", f"{meta_path} loading", sep="\n")

In [None]:
def extract_brand(details, store):
    """Pick a brand label for one product row.

    Args:
        details: Product details; only consulted when it is a ``dict``, in
            which case a truthy ``"brand"`` entry is returned as-is.
        store: Fallback value; returned unchanged when it is a string that is
            not empty or whitespace-only.

    Returns:
        ``details["brand"]`` when present and truthy, otherwise ``store`` when
        it is a non-blank string, otherwise the literal ``"Unknown"``.
    """
    if isinstance(details, dict):
        brand = details.get("brand")
        try:
            has_brand = bool(brand)
        except (TypeError, ValueError):
            # Values with ambiguous truthiness (array-likes, NA markers) are
            # treated as a missing brand so the store fallback still applies.
            has_brand = False
        if has_brand:
            return brand

    if isinstance(store, str) and store.strip():
        return store
    return "Unknown"

### Clean categories

In [None]:
def clean_data(category, review_df, meta_df, output_dir=None):
    """Merge one category's review and meta DataFrames, clean them and save the result.

    Steps, in order: inner-join on ``parent_asin``; keep rows whose ``rating``
    lies in [1.0, 5.0]; drop rows whose ``text`` is missing or blank; add a
    ``brand`` column via ``extract_brand``; drop duplicates on
    (``user_id``, ``asin``, ``text``) keeping the first; add ``review_length``
    (number of whitespace-separated tokens in ``text``) and ``year`` (from
    ``timestamp`` read as milliseconds since the epoch; unparseable values
    become missing). The result is pickled with bz2 compression to
    ``<output_dir>/<category>_cleaned_merged.pkl.bz2``.

    Args:
        category: Category name; used only to build the output file name.
        review_df: Reviews DataFrame. Must provide ``parent_asin``, ``rating``,
            ``text``, ``user_id``, ``asin`` and ``timestamp``.
        meta_df: Metadata DataFrame. Must provide ``parent_asin``; ``details``
            and ``store`` are used for the brand when present.
        output_dir: Directory for the pickle. ``None`` keeps the historical
            hard-coded location.

    Returns:
        The cleaned, merged DataFrame, so the caller can confirm that it was
        put together and contains data.
    """
    if output_dir is None:
        # Historical default, kept so existing callers keep writing to the same place.
        output_dir = r"D:/UWI/Year 3/Sem 2/COMP3610-Big-Data/Assignments/Assignment#3/A3/datasets/output_folder/cleaned"
    os.makedirs(output_dir, exist_ok=True)

    print("Merging review and meta...")
    merged = pd.merge(review_df, meta_df, on="parent_asin", how="inner")
    print("Merged")

    print("Filtering invalid ratings...")
    merged = merged[merged["rating"].between(1.0, 5.0, inclusive="both")]

    print("Dropping empty review text...")
    has_text = merged["text"].notna() & (merged["text"].str.strip() != "")
    # Explicit copy: the frame is mutated below (new columns, in-place
    # de-duplication), which on a filtered slice triggers SettingWithCopyWarning.
    merged = merged[has_text].copy()

    print("Extracting brand from metadata...")
    # Walk the two columns directly instead of DataFrame.apply(axis=1), which
    # builds a Series object for every row. A missing column behaves like a
    # column of None, matching the previous row.get(...) lookups.
    row_count = len(merged)
    details_values = merged["details"] if "details" in merged.columns else [None] * row_count
    store_values = merged["store"] if "store" in merged.columns else [None] * row_count
    merged["brand"] = [
        extract_brand(details, store)
        for details, store in zip(details_values, store_values)
    ]

    print("Removing duplicate reviews...")
    merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)

    print("Computing review length...")
    merged["review_length"] = merged["text"].str.split().apply(len)

    print("Extracting year from timestamp...")
    merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    output_file = os.path.join(output_dir, f"{category}_cleaned_merged.pkl.bz2")
    merged.to_pickle(output_file, compression="bz2")

    print(" All cleaning steps completed.")

    return merged

### Process Categories

In [None]:
# Driver loop: for every category, build the review and meta DataFrames, merge
# and clean them with clean_data (which also writes the compressed pickle), then
# delete the uncompressed review/meta pkl folders from disk.
# NOTE(review): as written this cell cannot run to completion — see the notes
# on the individual lines below.
for category in categories:
    # Placeholders that must be filled in before running. None of them depends
    # on `category`, so they are re-assigned to the same values on every pass.
    base_path = r"" # replace with path to tar files
    # NOTE(review): this rebinds the module-level `meta_path` assigned in the
    # "Files/folders" cell.
    meta_path = r"" # replace with path to meta pkl files
    review_path = r"" # replace with path to review pkl files

    # review pkled folder
    rev_pkl  = r"/root/Data/output_folder musical-video_games/reviews_pkl" # Make sure this is the folder with review .pkl batches
    meta_pkl = r"/root/Data/output_folder musical-video_games/meta_pkl"  # Make sure this is the folder with meta .pkl batches

    
    # NOTE(review): load_compressed_dataset requires a `compressed_path`
    # argument, so this call raises TypeError; its return value is also
    # discarded. Presumably it should receive this category's archive under
    # `base_path` — TODO confirm the intended path and what to do with the result.
    load_compressed_dataset()

    # NOTE(review): convert_to_df is not defined in the visible part of this
    # notebook — confirm it is defined or imported before this cell runs.
    review_df = convert_to_df(review_path, category)
    meta_df = convert_to_df(meta_path, category)
    cleaned = clean_data(category, review_df, meta_df)
    print(cleaned)
    # Drop the references to the large frames before the next category is processed.
    del cleaned
    del meta_df
    del review_df

    # remove the review and meta pkl files that aren't compressed
    # NOTE(review): rev_pkl and meta_pkl are the same for every category, so
    # once the first pass removes them, later passes take the `else` branches
    # unless something recreates the folders in between.
    if os.path.exists(rev_pkl):
        shutil.rmtree(rev_pkl)
    else:
        print(f"{rev_pkl} path does not exist")

    if os.path.exists(meta_pkl):
        shutil.rmtree(meta_pkl)
    else:
        print(f"{meta_pkl} path does not exist")