In [None]:
import os
import gc
import shutil
import time
import pandas as pd
import dask.dataframe as dd
from pathlib import Path
from datetime import datetime
from dask.diagnostics import ProgressBar
import uuid

# Your custom functions
from process_and_stream import preprocess_category, convert_to_dd, clean_data_dask

# Filesystem layout: everything lives under one project root.
BASE_DIR = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3"
RAW_DIR = os.path.join(BASE_DIR, "raw_files")          # input .tar.bz2 archives
OUTPUT_DIR = os.path.join(BASE_DIR, "output_folder")   # intermediate/temporary output
CLEANED_DIR = os.path.join(BASE_DIR, "cleaned_data")   # per-category cleaned CSV partitions
SAMPLES_DIR = os.path.join(BASE_DIR, "samples")
STATS_DIR = os.path.join(BASE_DIR, "stats")

# Create the result directories up front; RAW_DIR and OUTPUT_DIR are not
# created here.
for d in (CLEANED_DIR, SAMPLES_DIR, STATS_DIR):
    os.makedirs(d, exist_ok=True)

# Categories the driver loop will process, in order.
categories = ["Musical_Instruments"]

def log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}")

def process_and_stream(category):
    """Run the clean-and-export pipeline for a single category.

    The raw review/meta archives for ``category`` are handed to
    ``preprocess_category``; the resulting review and meta data are loaded
    with ``convert_to_dd``, combined by ``clean_data_dask``, repartitioned to
    80 partitions, and each partition is written as its own CSV file under
    ``CLEANED_DIR/<category>/``.

    Any exception is logged rather than raised, and the temporary
    sub-directories of ``OUTPUT_DIR`` are removed whether or not the run
    succeeded.

    Args:
        category: Category name; selects ``raw_review_<category>.tar.bz2``
            and ``raw_meta_<category>.tar.bz2`` inside ``RAW_DIR``.

    Returns:
        None.
    """
    review_tar = os.path.join(RAW_DIR, f"raw_review_{category}.tar.bz2")
    meta_tar = os.path.join(RAW_DIR, f"raw_meta_{category}.tar.bz2")

    try:
        log(f"🔄 Processing category: {category}")
        preprocess_category(review_tar, meta_tar, OUTPUT_DIR, category)

        review_df = convert_to_dd(os.path.join(OUTPUT_DIR, "reviews_parquet"), category)
        meta_df = convert_to_dd(os.path.join(OUTPUT_DIR, "meta_parquet"), category)

        if review_df is None or meta_df is None:
            log(f"⚠️ Skipped {category} due to missing review/meta")
            return

        cleaned = clean_data_dask(category, review_df, meta_df)

        # NOTE(review): Dask operations are normally lazy, so this timing
        # presumably covers graph construction only, not the data movement.
        start = time.time()
        cleaned = cleaned.repartition(npartitions=80)
        log(f"✅ Repartitioned in {time.time() - start:.2f} seconds")

        partition_dir = os.path.join(CLEANED_DIR, category)
        os.makedirs(partition_dir, exist_ok=True)

        # Clean out old files in the category folder so stale partitions from
        # a previous run cannot mix with the new output.
        for f in os.listdir(partition_dir):
            try:
                os.remove(os.path.join(partition_dir, f))
            except OSError as e:
                log(f"⚠️ Failed to delete old file {f}: {e}")

        # Write each partition as its own CSV. With single_file=True the path
        # is used literally, so it must not contain a '*' placeholder (which
        # is also an illegal filename character on Windows).
        failed = 0
        with ProgressBar():
            for i in range(cleaned.npartitions):
                part = cleaned.get_partition(i)
                try:
                    part.to_csv(
                        os.path.join(partition_dir, f"part_{i}.csv"),
                        index=False, single_file=True
                    )
                    log(f"💾 Saved partition {i}")
                except Exception as e:
                    # Best effort: keep going so one bad partition does not
                    # discard the rest, but remember that it happened.
                    failed += 1
                    log(f"⚠️ Failed to write partition {i} for {category}: {e}")

        if failed:
            log(f"⚠️ {failed} of {cleaned.npartitions} partitions failed for {category}")
        log(f"✅ Done: {category}")

    except Exception as e:
        # Top-level boundary: report and move on to the next category.
        log(f"❌ ERROR in {category}: {e}")

    finally:
        # Clean up temp dirs and memory (runs on success, early return, and error).
        for sub in ["reviews_parquet", "meta_parquet", "temp_extract"]:
            path = os.path.join(OUTPUT_DIR, sub)
            if os.path.exists(path):
                try:
                    shutil.rmtree(path)
                except OSError as e:
                    log(f"⚠️ Failed to delete {path}: {e}")
        gc.collect()

if __name__ == "__main__":
    # Run the pipeline once per configured category, in list order.
    for category_name in categories:
        process_and_stream(category_name)


[23:29:46] 🔄 Processing category: Musical_Instruments
Extracting tar files for Musical_Instruments...
Extracting C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3\raw_files\raw_review_Musical_Instruments.tar.bz2 to C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3\output_folder\temp_extract\Musical_Instruments
Extracting C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3\raw_files\raw_meta_Musical_Instruments.tar.bz2 to C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\big_data_a3\output_folder\temp_extract\Musical_Instruments
Found 5 Arrow files
Saved batch 0 (1000 rows)
Saved batch 1 (1000 rows)
Saved batch 2 (1000 rows)
Saved batch 3 (1000 rows)
Saved batch 4 (1000 rows)
Saved batch 5 (1000 rows)
Saved batch 6 (1000 rows)
Saved batch 7 (1000 rows)
Saved batch 8 (1000 rows)
Saved batch 9 (1000 rows)
Saved batch 10 (1000

In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt

# Load the cleaned dataset lazily.
# NOTE(review): the processing cell writes per-partition CSV files under
# cleaned_data/<category>/ — confirm that Parquet files really exist at this
# path (it is relative to the current working directory).
full_df = dd.read_parquet("cleaned_data/*.parquet")

# Each chart gets its own figure and its own plt.show(): pandas Series.plot
# draws onto the current axes when a figure is open, so without this the
# charts would be painted on top of each other.

# Rating distribution
rating_counts = full_df["rating"].value_counts().compute().sort_index()
plt.figure()
rating_counts.plot(kind="bar", title="Star Rating Distribution")
plt.xlabel("Star Rating")
plt.ylabel("Count")
plt.show()

# Top 10 categories
if "main_category" in full_df.columns:
    top_categories = full_df["main_category"].value_counts().nlargest(10).compute()
    plt.figure()
    top_categories.plot(kind="bar", title="Top 10 Categories by Review Count")
    plt.xlabel("Main Category")
    plt.ylabel("Review Count")
    plt.show()

# Top 10 brands (excluding Unknown); guarded like the other optional columns
if "brand" in full_df.columns:
    top_brands = full_df[full_df["brand"] != "Unknown"]["brand"].value_counts().nlargest(10).compute()
    plt.figure()
    top_brands.plot(kind="bar", title="Top 10 Brands")
    plt.xlabel("Brand")
    plt.ylabel("Review Count")
    plt.show()

# Rating trend over time
if "year" in full_df.columns:
    # Sort by year so the line is drawn in chronological order.
    avg_by_year = full_df.groupby("year")["rating"].mean().compute().sort_index()
    plt.figure()
    avg_by_year.plot(marker="o", title="Average Star Rating by Year")
    plt.xlabel("Year")
    plt.ylabel("Average Star Rating")
    plt.show()

# Correlation: review length vs rating
if "review_length" in full_df.columns:
    correlation = full_df[["review_length", "rating"]].corr().compute()
    print("Correlation matrix:\n", correlation)
