In [None]:
import os
import pandas as pd
from datasets import load_dataset
from pathlib import Path
import gc

# Root folder for all notebook data. Set the REVIEWS_BASE_DIR environment
# variable to run on another machine without editing the notebook; the
# original location is kept as the default so existing runs are unaffected.
BASE_DIR = os.environ.get(
    "REVIEWS_BASE_DIR",
    r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\New folder",
)
RAW_DIR = os.path.join(BASE_DIR, "raw_files")  # archives and raw datasets
CLEANED_DIR = os.path.join(BASE_DIR, "cleaned_parquet")  # Parquet outputs
# Create both folders up front so later cells can write without checking.
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(CLEANED_DIR, exist_ok=True)


In [None]:
import tarfile
import os

def extract_tar_bz2_files(directory):
    """Extract every ``*.tar.bz2`` archive located directly inside ``directory``.

    Each archive is unpacked into a folder next to it, named after the archive
    with the trailing ``.tar.bz2`` removed (``foo.tar.bz2`` -> ``foo/``).
    The scan is not recursive, and directories are skipped even if their name
    ends with ``.tar.bz2``.

    Args:
        directory: Path of the folder to scan for archives.

    Raises:
        tarfile.TarError: If an archive cannot be read, or (on Python versions
            with extraction filters) contains a member the ``"data"`` filter
            rejects, such as a path escaping the extraction folder.
        OSError: If the directory or an archive cannot be accessed.
    """
    suffix = ".tar.bz2"
    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        # Skip non-archives, and folders that merely carry the archive suffix.
        if not file.endswith(suffix) or not os.path.isfile(filepath):
            continue

        # Remove only the trailing suffix; str.replace would also strip the
        # text if it happened to appear earlier in the file name.
        extract_dir = os.path.join(directory, file[: -len(suffix)])
        os.makedirs(extract_dir, exist_ok=True)

        print(f"Extracting: {file}")
        with tarfile.open(filepath, "r:bz2") as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter refuses members that would be written
                # outside extract_dir (absolute paths, "..", unsafe links).
                tar.extractall(path=extract_dir, filter="data")
            else:
                # Older Python without extraction filters: original behavior.
                tar.extractall(path=extract_dir)


In [None]:
# Unpack every archive in the raw-data folder. RAW_DIR is defined once in the
# configuration cell at the top of the notebook; the absolute path is not
# repeated here so the two copies cannot drift apart.
extract_tar_bz2_files(RAW_DIR)


In [None]:
import pandas as pd
import re
import os

def clean_and_merge_streaming(category):
    """Clean one category's reviews in chunks, join metadata, and write Parquet.

    Reads ``RAW_DIR/raw_meta_<category>/data.jsonl`` fully into memory and
    ``RAW_DIR/raw_review_<category>/data.jsonl`` in chunks of 100,000 lines,
    then writes ``CLEANED_DIR/<category>.parquet``.

    For each review chunk:
      * rows with a missing ``rating`` or ``text`` are dropped;
      * only ratings between 1 and 5 (inclusive) are kept;
      * duplicates on (``user_id``, ``asin``, ``text``) are removed within
        the chunk only (duplicates spanning two chunks are not detected);
      * ``review_length`` (number of regex word tokens in ``text``) and
        ``year`` are added;
      * the chunk is left-joined to the metadata on ``parent_asin``.

    Args:
        category: Category name used to build the input folder names and the
            output file name.

    Returns:
        None. If the metadata file cannot be loaded, a message is printed and
        nothing is written.

    Notes:
        ``append=True`` is an option of the fastparquet writer only, so the
        fastparquet engine is requested explicitly; it must be installed.
        Any output left by a previous run is deleted first so that rerunning
        the function does not append the same rows twice.
    """
    review_path = os.path.join(RAW_DIR, f"raw_review_{category}", "data.jsonl")
    meta_path = os.path.join(RAW_DIR, f"raw_meta_{category}", "data.jsonl")

    try:
        meta_df = pd.read_json(meta_path, lines=True)
    except Exception as e:
        print(f"Meta load failed for {category}: {e}")
        return

    # Preprocess metadata: pull the brand out of the `details` dict.
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # a column selection, which is not guaranteed to modify meta_df.
    meta_df["brand"] = (
        meta_df["details"]
        .apply(lambda d: d.get("brand") if isinstance(d, dict) else "Unknown")
        .fillna("Unknown")
    )

    save_path = os.path.join(CLEANED_DIR, f"{category}.parquet")
    # Start from a clean slate; otherwise a rerun would append duplicates.
    if os.path.exists(save_path):
        os.remove(save_path)

    # The context manager closes the underlying file even if a chunk fails.
    with pd.read_json(review_path, lines=True, chunksize=100_000) as chunk_iter:
        for i, chunk in enumerate(chunk_iter):
            print(f"[{category}] Processing chunk {i+1}")

            # Filter and clean
            chunk = chunk.dropna(subset=["rating", "text"])
            chunk = chunk[chunk["rating"].between(1, 5)]
            chunk = chunk.drop_duplicates(subset=["user_id", "asin", "text"])
            chunk["review_length"] = chunk["text"].apply(lambda x: len(re.findall(r'\w+', str(x))))
            # NOTE(review): treats numeric timestamps as epoch seconds unless
            # read_json already parsed the column as datetimes - confirm
            # against the raw data.
            chunk["year"] = pd.to_datetime(chunk["timestamp"], unit='s', errors='coerce').dt.year

            # Merge
            merged = pd.merge(chunk, meta_df, on="parent_asin", how="left")

            # First chunk creates the file; later chunks append to it.
            merged.to_parquet(
                save_path,
                index=False,
                engine="fastparquet",
                append=os.path.exists(save_path),
            )

    print(f"Saved {category} to {save_path}")


In [None]:
from datasets import load_from_disk
import os


def arrow_to_parquet(category):
    """Export the 'full' split of a category's saved datasets to Parquet.

    Loads ``RAW_DIR/raw_review_<category>`` and ``RAW_DIR/raw_meta_<category>``
    with ``load_from_disk`` and writes ``<category>_review.parquet`` and
    ``<category>_meta.parquet`` into ``CLEANED_DIR``.

    Args:
        category: Category name used in the input folder and output file names.

    Returns:
        None. If loading either dataset or looking up its 'full' split raises,
        a message is printed and nothing is exported.
    """
    try:
        loaded_reviews = load_from_disk(os.path.join(RAW_DIR, f"raw_review_{category}"))
        loaded_meta = load_from_disk(os.path.join(RAW_DIR, f"raw_meta_{category}"))

        # Only the 'full' split is exported.
        review_ds, meta_ds = loaded_reviews["full"], loaded_meta["full"]
    except Exception as e:
        print(f"Skipping {category} — failed to load arrow or 'full' split: {e}")
        return

    # Destination files, one per dataset.
    review_out = os.path.join(CLEANED_DIR, f"{category}_review.parquet")
    meta_out = os.path.join(CLEANED_DIR, f"{category}_meta.parquet")
    print(f"Exporting {category}...")

    for dataset, destination in ((review_ds, review_out), (meta_ds, meta_out)):
        dataset.to_parquet(destination)

# Auto-detection of categories that have both a review and a meta folder
# (currently disabled; the explicit list below is used instead):
# categories = [
#     name.replace("raw_review_", "")
#     for name in os.listdir(RAW_DIR)
#     if name.startswith("raw_review_")
#     and os.path.isdir(os.path.join(RAW_DIR, name))
#     and os.path.isdir(os.path.join(RAW_DIR, f"raw_meta_{name.replace('raw_review_', '')}"))
# ]

# Categories to export; each one is converted from its saved dataset folders
# to a pair of Parquet files.
categories = [
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
]

for cat in categories:
    arrow_to_parquet(cat)


In [None]:
import duckdb

def _sql_string_literal(value):
    """Return ``value`` as a single-quoted SQL literal with quotes doubled."""
    return "'" + str(value).replace("'", "''") + "'"


def merge_parquet_to_duckdb(category, cleaned_dir):
    """Join a category's review and metadata Parquet files with DuckDB.

    Reads ``<category>_review.parquet`` and ``<category>_meta.parquet`` from
    ``cleaned_dir``, keeps reviews whose rating is between 1 and 5 and whose
    text is not NULL, left-joins the metadata on ``parent_asin``, and writes
    the selected columns to ``<category>_merged.parquet`` in the same folder.
    No deduplication is performed.

    Derived columns:
      * ``brand``: the ``brand`` entry of the metadata ``details`` JSON, or
        ``'Unknown'`` when it is absent or ``details`` is not valid JSON;
      * ``review_length``: number of space-separated pieces of ``text``;
      * ``year``: calendar year of the review timestamp;
      * ``category``: the ``category`` argument.

    Args:
        category: Category name used in the input and output file names.
        cleaned_dir: Folder containing the inputs and receiving the output.

    Returns:
        None. If either input file is missing, a message is printed and
        nothing else happens.

    Side effects:
        Uses a scratch database ``temp_duckdb.db`` inside ``cleaned_dir``.
        The connection is always closed and the scratch file removed
        afterwards, even when a query fails.
    """
    review_path = os.path.join(cleaned_dir, f"{category}_review.parquet")
    meta_path = os.path.join(cleaned_dir, f"{category}_meta.parquet")
    output_path = os.path.join(cleaned_dir, f"{category}_merged.parquet")

    if not os.path.exists(review_path) or not os.path.exists(meta_path):
        print(f"⚠️ Skipping {category} — one of the parquet files is missing.")
        return

    print(f"🔄 Merging: {category}")

    # Quote every interpolated value so an apostrophe in a path or category
    # name cannot terminate the SQL string early.
    review_sql = _sql_string_literal(review_path)
    meta_sql = _sql_string_literal(meta_path)
    output_sql = _sql_string_literal(output_path)
    category_sql = _sql_string_literal(category)

    db_path = os.path.join(cleaned_dir, "temp_duckdb.db")
    con = duckdb.connect(database=db_path)
    try:
        con.execute(f"""
            CREATE OR REPLACE TABLE review AS SELECT * FROM {review_sql};
        """)

        # Load metadata with brand extraction. json_extract_string returns the
        # bare text; casting json_extract's JSON result to a string would keep
        # the surrounding double quotes in the value.
        # NOTE(review): the lookup key is lowercase 'brand' - confirm it
        # matches the key casing actually used inside `details`.
        con.execute(f"""
            CREATE OR REPLACE TABLE meta AS
            SELECT *,
                CASE
                    WHEN try_cast(details AS JSON) IS NOT NULL
                         AND json_extract_string(details, '$.brand') IS NOT NULL
                    THEN json_extract_string(details, '$.brand')
                    ELSE 'Unknown'
                END AS brand
            FROM {meta_sql};
        """)

        # Clean and transform, without deduplication.
        # to_timestamp expects epoch seconds. Numeric values of 1e11 or more
        # (year 5138+ if read as seconds) are treated as epoch milliseconds
        # and divided by 1000; smaller values are handled as before.
        # NOTE(review): presumably the source timestamps are in milliseconds -
        # confirm against the dataset documentation.
        con.execute(f"""
            CREATE OR REPLACE TABLE cleaned AS
            SELECT
                r.user_id,
                r.asin,
                r.parent_asin,
                r.rating,
                r.text,
                r.verified_purchase,
                r.helpful_vote,
                array_length(string_split(r.text, ' ')) AS review_length,
                strftime('%Y',
                    CASE
                        WHEN typeof(r.timestamp) = 'VARCHAR' THEN CAST(r.timestamp AS TIMESTAMP)
                        ELSE CAST(to_timestamp(
                            CASE
                                WHEN abs(CAST(r.timestamp AS DOUBLE)) >= 1e11
                                THEN CAST(r.timestamp AS DOUBLE) / 1000
                                ELSE CAST(r.timestamp AS DOUBLE)
                            END
                        ) AS TIMESTAMP)
                    END
                )::INTEGER AS year,
                m.brand,
                m.main_category,
                m.title,
                m.average_rating,
                m.rating_number,
                m.price,
                {category_sql} AS category
            FROM review r
            LEFT JOIN meta m ON r.parent_asin = m.parent_asin
            WHERE r.rating BETWEEN 1 AND 5 AND r.text IS NOT NULL;
        """)

        # Save result to parquet.
        con.execute(f"""
            COPY cleaned TO {output_sql} (FORMAT PARQUET);
        """)
    finally:
        con.close()
        # The database only holds scratch tables; remove it and any
        # write-ahead log so it does not pile up next to the outputs.
        for leftover in (db_path, db_path + ".wal"):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
            except OSError as e:
                print(f"Could not remove temporary file {leftover}: {e}")

    print(f"Saved merged and cleaned data → {output_path}")


In [None]:
import os

# Read from the same cleaned-data folder the Parquet export step wrote to
# (BASE_DIR/cleaned_parquet). This cell previously pointed at
# raw_files\cleaned_parquet instead, so on a fresh top-to-bottom run the merge
# step found no input files and skipped every category.
CLEANED_DIR = os.path.join(BASE_DIR, "cleaned_parquet")

# Only look for categories with review + meta parquet files
def get_parquet_categories(cleaned_dir):
    """List the categories that have both a review and a meta Parquet file.

    A category name is the file name with ``_review.parquet`` or
    ``_meta.parquet`` removed.

    Args:
        cleaned_dir: Folder to scan (not recursive).

    Returns:
        Sorted list of category names present under both suffixes.
    """
    names_by_suffix = {"_review.parquet": set(), "_meta.parquet": set()}
    for suffix, names in names_by_suffix.items():
        for entry in os.listdir(cleaned_dir):
            if entry.endswith(suffix):
                names.add(entry.replace(suffix, ""))

    in_both = names_by_suffix["_review.parquet"].intersection(
        names_by_suffix["_meta.parquet"]
    )
    return sorted(in_both)

# Discover which categories have both Parquet inputs and show the result.
categories = get_parquet_categories(CLEANED_DIR)
print(f"Found categories: {categories}")


In [None]:
# Explicit list of categories to merge; it replaces whatever `categories`
# held before this cell. The garbage collector runs after each merge.
categories = ["Home_and_Kitchen", "Industrial_and_Scientific"]
for cat in categories:
    merge_parquet_to_duckdb(cat, CLEANED_DIR)
    gc.collect()
