### Imports

In [None]:
# %pip install pydirectory

In [None]:
from bigdata_a3_utils import *
from pathlib import Path
import os
import tarfile
import pandas as pd
import shutil
import time, matplotlib.pyplot as plt, seaborn as sns, matplotlib.ticker as ticker
import numpy as np
from datasets import load_dataset, load_from_disk
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import json
import dask.dataframe as dd

### Files/folders

In [None]:
# Folder containing the raw archives; load_raw_rev_and_meta looks here for
# raw_review_<category>.tar.bz2 and raw_meta_<category>.tar.bz2.
# NOTE: machine-specific absolute path — adjust before running elsewhere.
raw_files_path = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3\raw_files"
# Folder passed to load_compressed_dataset as its second argument; the
# per-category <category>_review.parquet / <category>_meta.parquet copies are
# also written here.
extraction_path = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3\output_folder\temp_extract"

# review_pkls_path = "C:\\Big Data\\A3\\Data\\review_pkl" #path of the review pkl files
# meta_pkls_path = "C:\\Big Data\\A3\\Data\\meta_pkl"     #path of the meta pkl files

### Categories to process

In [None]:
# categories = ['Unknown', 'Magazine_Subscriptions', 'Movies_and_TV', "Cell_Phones_and_Accessories", "Clothing_Shoes_and_Jewellery", "Digital_Music", "Handmade_Products", "Baby_Products", "Beauty_and_Personal_Care", "Electronics"] # These are the ones that we have left to run

In [None]:
# This function handles the extraction of the brands
def extract_brand(details, store):
    """Pick a brand label for a product.

    Preference order:
      1. ``details["brand"]`` when ``details`` is a dict and that value is truthy;
      2. ``store`` (returned unmodified) when it is a string containing
         non-whitespace characters;
      3. the literal ``"Unknown"``.
    """
    try:
        candidate = details.get("brand") if isinstance(details, dict) else None
        if candidate:
            return candidate
    except Exception:
        # Any failure while inspecting ``details`` falls through to ``store``.
        pass

    has_store_name = isinstance(store, str) and bool(store.strip())
    return store if has_store_name else "Unknown"

### Clean categories

In [None]:
def clean_data_dask(
    category,
    review_df,
    meta_df,
    output_dir=r"C:\Users\maian\Downloads\cleaned files",
    npartitions=4,
):
    """Merge, clean and persist one category's reviews using Dask.

    Steps, in order:
      1. Wrap both pandas frames as Dask DataFrames.
      2. Left-join reviews to metadata on ``parent_asin``.
      3. Keep rows whose ``rating`` lies in [1, 5] and whose ``text`` is
         non-null and not blank (each filter only if the column exists).
      4. Derive ``brand`` per row via ``extract_brand``.
      5. Drop duplicate (``user_id``, ``asin``, ``text``) rows.
      6. Derive ``review_length`` (whitespace-separated token count) and
         ``year`` (``timestamp`` interpreted as epoch milliseconds).
      7. Keep only the analysis columns that are present and write the result
         to ``<output_dir>/<category>_cleaned.parquet`` with snappy compression.

    Args:
        category: Category name; used only to build the output file name.
        review_df: pandas DataFrame of reviews; must contain ``parent_asin``.
        meta_df: pandas DataFrame of product metadata; must contain
            ``parent_asin``.
        output_dir: Destination folder, created if missing. Defaults to the
            location the notebook originally hard-coded.
        npartitions: Number of Dask partitions for each input frame.

    Returns:
        The cleaned (lazy) Dask DataFrame that was written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)

    review_ddf = dd.from_pandas(review_df, npartitions=npartitions)
    meta_ddf = dd.from_pandas(meta_df, npartitions=npartitions)

    print("Merging review and meta on 'parent_asin'")
    # NOTE(review): if both inputs carry a column with the same name (e.g.
    # "title"), the merge suffixes them (_x/_y) and the plain name is silently
    # absent from the column selection below — confirm against the real schema.
    merged = dd.merge(review_ddf, meta_ddf, on="parent_asin", how="left")

    if "rating" in merged.columns:
        merged = merged[merged["rating"].between(1, 5)]
    if "text" in merged.columns:
        merged = merged[merged["text"].notnull() & (merged["text"].str.strip() != "")]

    def safe_extract_brand(row):
        # Best effort: a malformed row yields "Unknown" instead of aborting the
        # whole partition. ``Exception`` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            return extract_brand(row.get("details"), row.get("store"))
        except Exception:
            return "Unknown"

    print("Extracting brand...")
    merged["brand"] = merged.map_partitions(
        lambda df: df.apply(safe_extract_brand, axis=1),
        meta=("brand", "object")
    )
    merged["brand"] = merged["brand"].fillna("Unknown")
    print("Dropping duplicates...")
    merged = merged.drop_duplicates(subset=["user_id", "asin", "text"])

    if "text" in merged.columns:
        merged["review_length"] = merged["text"].str.split().map(
            lambda x: len(x) if x else 0, meta=("review_length", "int")
        )
    if "timestamp" in merged.columns:
        # Unparseable timestamps become NaT (errors="coerce").
        merged["year"] = dd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

    necessary_columns = [
        "user_id", "asin", "parent_asin", "rating", "text", "verified_purchase",
        "helpful_vote", "review_length", "year", "brand", "main_category",
        "title", "average_rating", "rating_number", "price"
    ]
    # Columns missing from the merged frame are skipped rather than raising.
    merged = merged[[col for col in necessary_columns if col in merged.columns]]

    output_file = os.path.join(output_dir, f"{category}_cleaned.parquet")
    merged.to_parquet(output_file, compression="snappy", write_index=False)
    print(f"Cleaned data saved to {output_file}")
    return merged


## Define Categories that will be cleaned

In [None]:
# Category names queued for cleaning.
# NOTE: a later cell reassigns `categories` to a shorter list before the
# processing loop runs.
categories = [
    "Grocery_and_Gourmet_Food", "Handmade_Products", "Health_and_Household",
    "Home_and_Kitchen", "Industrial_and_Scientific", "Kindle_Store",
    "Magazine_Subscriptions", "Movies_and_TV", "Musical_Instruments",
]



In [None]:
# def clean_data(category, review_df, meta_df):

#     output_dir = r"D:\AS3\pkl_dir"
#     os.makedirs(output_dir, exist_ok=True)

#     print("=== Merging review and meta on 'parent_asin' (LEFT JOIN) ===")
#     merged = pd.merge(review_df, meta_df, on="parent_asin", how="left")
#     print(f"After merge: {len(merged)} rows")

#     # invalid ratings
#     if "rating" in merged.columns:
#         before = len(merged)
#         merged = merged[merged["rating"].between(1.0, 5.0, inclusive="both")]
#         print(f"After filtering invalid ratings: {len(merged)} rows (dropped {before - len(merged)})")
    
#     # empty review texts
#     if "text" in merged.columns:
#         before = len(merged)
#         merged = merged[merged["text"].notna() & (merged["text"].str.strip() != "")]
#         print(f"After dropping empty text: {len(merged)} rows (dropped {before - len(merged)})")

#     #  brand
#     print("Extracting brand from metadata...")
#     merged["brand"] = merged.apply(
#         lambda row: extract_brand(row.get("details"), row.get("store")), axis=1
#     )
#     merged["brand"].fillna("Unknown", inplace=True)

#     # duplicates
#     before = len(merged)
#     merged.drop_duplicates(subset=["user_id", "asin", "text"], keep="first", inplace=True)
#     print(f"After removing duplicates: {len(merged)} rows (dropped {before - len(merged)})")

#     # derived columns
#     if "text" in merged.columns:
#         merged["review_length"] = merged["text"].str.split().apply(len)

#     if "timestamp" in merged.columns:
#         merged["year"] = pd.to_datetime(merged["timestamp"], unit="ms", errors="coerce").dt.year

#     # cleaned data
#     output_file = os.path.join(output_dir, f"{category}_cleaned_merged.pkl.bz2")
#     merged.to_pickle(output_file, compression="bz2")
#     print(f"Saved cleaned file to {output_file}")

#     return merged

### Process Categories

In [None]:
def _load_raw_split(raw_dir, extraction_path, category, kind):
    """Load one raw archive ("review" or "meta"), cache it as parquet, return the frame."""
    file_path = os.path.join(raw_dir, f"raw_{kind}_{category}.tar.bz2")
    print(f"Loading {kind}: {file_path}")
    dataset = load_compressed_dataset(file_path, extraction_path)
    frame = dataset["full"].to_pandas()
    frame.to_parquet(os.path.join(extraction_path, f"{category}_{kind}.parquet"))
    return frame


def load_raw_rev_and_meta(extraction_path, category, raw_dir=None):
    """Load the raw review and metadata archives of one category as pandas frames.

    For each of ``raw_review_<category>.tar.bz2`` and
    ``raw_meta_<category>.tar.bz2`` the archive is opened with
    ``load_compressed_dataset``, its ``"full"`` split is converted to pandas,
    and a parquet copy (``<category>_review.parquet`` /
    ``<category>_meta.parquet``) is written into ``extraction_path``.

    Args:
        extraction_path: Folder handed to ``load_compressed_dataset`` and used
            for the parquet copies; created if it does not exist.
        category: Category name embedded in the archive file names.
        raw_dir: Folder containing the archives. Defaults to the module-level
            ``raw_files_path`` so existing two-argument calls keep working.

    Returns:
        Tuple ``(review_df, meta_df)`` of pandas DataFrames.
    """
    if raw_dir is None:
        raw_dir = raw_files_path
    # Make sure the parquet writes cannot fail on a missing folder.
    os.makedirs(extraction_path, exist_ok=True)

    review_df = _load_raw_split(raw_dir, extraction_path, category, "review")
    meta_df = _load_raw_split(raw_dir, extraction_path, category, "meta")
    return review_df, meta_df


In [None]:
# Categories handled by this run; this reassignment replaces any `categories`
# list defined in an earlier cell.
categories = [ "Movies_and_TV",
    "Musical_Instruments"]

# For each category: load the raw review/meta frames (also cached as parquet),
# then merge, clean and write them out via clean_data_dask.
for category in categories:
    rev_df, meta_df = load_raw_rev_and_meta(extraction_path, category)
    clean_data_dask(category, rev_df, meta_df)


In [None]:
# # For loop to iterate the categories, clean them and convert to compressed pkl zips
# # also removes the uncompressed files from the system once they've been done
# for category in categories:
#     base_path = r"" # replace with path to tar files
#     meta_path = r"" # replace with path to meta pkl files
#     review_path = r"" # replace with path to review pkl files

#     # review pkled folder
#     rev_pkl  = r"/root/Data/output_folder musical-video_games/reviews_pkl" # Make sure this is the folder with review .pkl batches
#     meta_pkl = r"/root/Data/output_folder musical-video_games/meta_pkl"  # Make sure this is the folder with meta .pkl batches

    
#     load_compressed_dataset()

#     review_df = convert_to_df(review_path, category)
#     meta_df = convert_to_df(meta_path, category)
#     cleaned = clean_data(category, review_df, meta_df)
#     print(cleaned)
#     del cleaned
#     del meta_df
#     del review_df

#     # remove the review and meta pkl files that aren't compressed
#     if os.path.exists(rev_pkl):
#         shutil.rmtree(rev_pkl)
#     else:
#         print(f"{rev_pkl} path does not exist")

#     if os.path.exists(meta_pkl):
#         shutil.rmtree(meta_pkl)
#     else:
#         print(f"{meta_pkl} path does not exist")

In [None]:
# import os
# import tarfile
# import json

# def extract_tar_bz2(tar_path, extract_to):
#     with tarfile.open(tar_path, 'r:bz2') as tar:
#         tar.extractall(path=extract_to)

# def process_jsonl_file(file_path, output_path, clean_fn=None):
#     with open(file_path, 'r') as infile, open(output_path, 'w') as outfile:
#         for line in infile:
#             try:
#                 obj = json.loads(line)
#                 if clean_fn:
#                     obj = clean_fn(obj)
#                     if obj is None:
#                         continue
#                 outfile.write(json.dumps(obj) + '\n')
#             except json.JSONDecodeError:
#                 continue

# def basic_clean_review(obj):
#     # Example: skip entries without reviewText
#     if not obj.get("reviewText"):
#         return None
#     return obj

# def basic_clean_meta(obj):
#     # Example: skip entries with no title
#     if not obj.get("title"):
#         return None
#     return obj

# def process_category(extraction_path, category):
#     rev_tar = os.path.join(raw_files_path, f"raw_review_{category}.tar.bz2")
#     meta_tar = os.path.join(raw_files_path, f"raw_meta_{category}.tar.bz2")

#     rev_dir = os.path.join(extraction_path, f"{category}_reviews")
#     meta_dir = os.path.join(extraction_path, f"{category}_meta")
#     os.makedirs(rev_dir, exist_ok=True)
#     os.makedirs(meta_dir, exist_ok=True)

#     extract_tar_bz2(rev_tar, rev_dir)
#     extract_tar_bz2(meta_tar, meta_dir)

#     # Assume file inside tar is named {category}.json.gz or json
#     review_path = [f for f in os.listdir(rev_dir) if f.endswith('.json') or f.endswith('.jsonl')][0]
#     meta_path = [f for f in os.listdir(meta_dir) if f.endswith('.json') or f.endswith('.jsonl')][0]

#     cleaned_review_out = os.path.join(extraction_path, f"{category}_cleaned_reviews.jsonl")
#     cleaned_meta_out = os.path.join(extraction_path, f"{category}_cleaned_meta.jsonl")

#     process_jsonl_file(os.path.join(rev_dir, review_path), cleaned_review_out, clean_fn=basic_clean_review)
#     process_jsonl_file(os.path.join(meta_dir, meta_path), cleaned_meta_out, clean_fn=basic_clean_meta)

#     print(f"✅ Cleaned files saved for {category}")

# # Example usage
# for category in ["All_beauty", "Appliances"]:
#     process_category(extraction_path, category)
