In [3]:
import os
import pandas as pd
from datasets import load_dataset
from pathlib import Path
import gc

# Workspace layout shared by the cells below: archives and extracted Arrow
# folders are looked up in raw_files_dir, Parquet exports go to cleaned_pars.
# NOTE(review): this is an absolute, machine-specific path — consider reading
# it from an environment variable or a relative Path so the notebook can run
# on another machine.
base_directory = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\Code"
raw_files_dir = os.path.join(base_directory, "raw_files")
cleaned_pars = os.path.join(base_directory, "raw_files","cleaned_parquet")
# Create both folders up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(raw_files_dir, exist_ok=True)
os.makedirs(cleaned_pars, exist_ok=True)


In [2]:
import tarfile
import os

def extract_tar_bz2_files(directory):
    """Extract every ``.tar.bz2`` archive found directly inside ``directory``.

    Each archive ``<name>.tar.bz2`` is unpacked into a sibling folder
    ``<directory>/<name>``, which is created if it does not exist yet.
    Sub-directories of ``directory`` are not searched.

    Args:
        directory: Folder that contains the ``.tar.bz2`` archives.

    Raises:
        tarfile.TarError: If an archive is corrupt, or (when extraction
            filters are available) if a member is rejected as unsafe.
    """
    suffix = ".tar.bz2"
    # Extraction filters exist on Python 3.12+ and on patched older releases.
    # The "data" filter rejects members that would escape the target folder
    # (absolute paths, "..", unsafe links) and avoids the DeprecationWarning
    # raised when extractall() is called without an explicit filter.
    supports_filter = hasattr(tarfile, "data_filter")

    for file in os.listdir(directory):
        if not file.endswith(suffix):
            continue

        filepath = os.path.join(directory, file)
        # Strip only the trailing suffix; str.replace would also remove any
        # earlier occurrence of ".tar.bz2" inside the file name.
        extract_dir = os.path.join(directory, file[: -len(suffix)])
        os.makedirs(extract_dir, exist_ok=True)

        print(f"Extracting: {file}")
        with tarfile.open(filepath, "r:bz2") as tar:
            if supports_filter:
                tar.extractall(path=extract_dir, filter="data")
            else:
                tar.extractall(path=extract_dir)


In [9]:
# Unpack every .tar.bz2 archive that sits directly inside raw_files_dir.
extract_tar_bz2_files(raw_files_dir)


Extracting: raw_meta_Gift_Cards.tar.bz2


  tar.extractall(path=extract_dir)


Extracting: raw_review_Gift_Cards.tar.bz2


KeyboardInterrupt: 

In [None]:
from datasets import load_from_disk
import os


def arrow_to_parquet(category):
    """Export the 'full' split of one category's review and meta datasets to Parquet.

    The datasets are loaded from ``raw_review_<category>`` and
    ``raw_meta_<category>`` under ``raw_files_dir``; the Parquet files are
    written to ``cleaned_pars`` as ``<category>_review.parquet`` and
    ``<category>_meta.parquet``. If loading fails or a 'full' split is
    missing, a message is printed and the category is skipped.
    """
    kinds = ("review", "meta")

    try:
        # Load both datasets first, then pick the 'full' split of each.
        loaded = {
            kind: load_from_disk(os.path.join(raw_files_dir, f"raw_{kind}_{category}"))
            for kind in kinds
        }
        splits = {kind: loaded[kind]["full"] for kind in kinds}
    except Exception as e:
        print(f"Skipping {category} — failed to load arrow or 'full' split: {e}")
        return

    # Resolve both destinations before announcing the export.
    targets = {
        kind: os.path.join(cleaned_pars, f"{category}_{kind}.parquet")
        for kind in kinds
    }
    print(f"Exporting {category}...")

    for kind in kinds:
        splits[kind].to_parquet(targets[kind])

# Detect categories that have both a review folder and a meta folder.
review_prefix = "raw_review_"
categories = []
for entry in os.listdir(raw_files_dir):
    if not entry.startswith(review_prefix):
        continue
    if not os.path.isdir(os.path.join(raw_files_dir, entry)):
        continue
    stem = entry.replace(review_prefix, "")
    if os.path.isdir(os.path.join(raw_files_dir, f"raw_meta_{stem}")):
        categories.append(stem)

# Manual override: the detected list is replaced here, so only Gift_Cards
# is exported by the loop below.
categories = ['Gift_Cards']

for cat in categories:
    arrow_to_parquet(cat)


Exporting Gift_Cards...


Creating parquet from Arrow format:   0%|          | 0/153 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
import duckdb
import os

def merge_parquet_to_duckdb(category, cleaned_dir):
    """Join one category's review and meta Parquet files into a cleaned Parquet file.

    Reads ``<category>_review.parquet`` and ``<category>_meta.parquet`` from
    ``cleaned_dir``, left-joins reviews to metadata on ``parent_asin``, keeps
    reviews with a rating between 1 and 5 and a non-null text, and writes the
    result to ``<category>_merged.parquet`` in the same folder. No
    deduplication is performed. Intermediate tables live in the on-disk
    database ``temp_duckdb.db`` inside ``cleaned_dir``.

    Args:
        category: Category name used in the file names and stored in the
            ``category`` output column.
        cleaned_dir: Folder holding the input Parquet files; also receives
            the merged output and the temporary DuckDB database.

    Returns:
        None. If either input file is missing, a message is printed and
        nothing is written.
    """
    review_path = os.path.join(cleaned_dir, f"{category}_review.parquet")
    meta_path = os.path.join(cleaned_dir, f"{category}_meta.parquet")
    output_path = os.path.join(cleaned_dir, f"{category}_merged.parquet")

    if not (os.path.exists(review_path) and os.path.exists(meta_path)):
        print(f"Skipping {category} — one of the parquet files is missing.")
        return

    print(f"Merging: {category}")

    def sql_literal(value):
        # Double single quotes so the value stays one SQL string literal.
        return str(value).replace("'", "''")

    review_sql = sql_literal(review_path)
    meta_sql = sql_literal(meta_path)
    output_sql = sql_literal(output_path)
    category_sql = sql_literal(category)

    con = duckdb.connect(database=os.path.join(cleaned_dir, "temp_duckdb.db"))
    try:
        con.execute(f"CREATE OR REPLACE TABLE review AS SELECT * FROM '{review_sql}';")

        # json_extract_string returns the unquoted text of the value; casting
        # the result of json_extract to STRING would keep the JSON quotes
        # around string brands. COALESCE keeps 'Unknown' as the fallback when
        # the extracted value is NULL.
        con.execute(f"""
            CREATE OR REPLACE TABLE meta AS
            SELECT *,
                CASE
                    WHEN try_cast(details AS JSON) IS NOT NULL AND json_extract(details, '$.brand') IS NOT NULL
                    THEN COALESCE(json_extract_string(details, '$.brand'), 'Unknown')
                    ELSE 'Unknown'
                END AS brand
            FROM '{meta_sql}';
        """)

        # No deduplication
        # review_length counts space-separated tokens; year is derived from
        # the timestamp, which is either parsed as text or treated as epoch
        # milliseconds depending on the column type.
        con.execute(f"""
            CREATE OR REPLACE TABLE cleaned AS
            SELECT
                r.user_id,
                r.asin,
                r.parent_asin,
                r.rating,
                r.text,
                r.verified_purchase,
                r.helpful_vote,
                array_length(string_split(r.text, ' ')) AS review_length,
                strftime(
                    CASE
                        WHEN typeof(r.timestamp) = 'VARCHAR' THEN CAST(r.timestamp AS TIMESTAMP)
                        ELSE to_timestamp(CAST(r.timestamp AS DOUBLE) / 1000.0)
                    END,
                    '%Y'
                )::INTEGER AS year,
                m.brand,
                m.main_category,
                m.title,
                m.average_rating,
                m.rating_number,
                m.price,
                '{category_sql}' AS category
            FROM review r
            LEFT JOIN meta m ON r.parent_asin = m.parent_asin
            WHERE r.rating BETWEEN 1 AND 5 AND r.text IS NOT NULL;
        """)

        con.execute(f"COPY cleaned TO '{output_sql}' (FORMAT PARQUET);")
    finally:
        # Always release the database file, even when a statement fails.
        con.close()

    print(f"Saved merged and cleaned data → {output_path}")

In [5]:
import os


# Only look for categories with review + meta parquet files
def get_parquet_categories(cleaned_pars):
    """Return the sorted category names that have both Parquet exports.

    A category qualifies when ``cleaned_pars`` contains a file ending in
    ``_review.parquet`` and a file ending in ``_meta.parquet`` that reduce to
    the same name once that marker text is removed.
    """
    names_by_suffix = []
    for suffix in ("_review.parquet", "_meta.parquet"):
        stems = {
            entry.replace(suffix, "")
            for entry in os.listdir(cleaned_pars)
            if entry.endswith(suffix)
        }
        names_by_suffix.append(stems)

    with_review, with_meta = names_by_suffix
    return sorted(with_review.intersection(with_meta))

# Categories that already have both a review and a meta Parquet file on disk.
categories = get_parquet_categories(cleaned_pars)
print("Found categories:", categories)


Found categories: ['Automotive', 'Grocery_and_Gourmet_Food', 'Handmade_Products', 'Health_and_Household', 'Health_and_Personal_Care', 'Home_and_Kitchen', 'Industrial_and_Scientific', 'Kindle_Store', 'Magazine_Subscriptions', 'Movies_and_TV', 'Musical_Instruments']


In [None]:
# Manual override: merge a single category rather than the list detected by
# get_parquet_categories.
categories =['Home_and_Kitchen']
for cat in categories:
    merge_parquet_to_duckdb(cat, cleaned_pars)
    # Run a garbage-collection pass after each category.
    gc.collect()


🔄 Starting/Resuming merge for: Grocery_and_Gourmet_Food
📦 Writing chunk 1...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
def deduplicate_parquet(input_path, output_path):
    """Write a copy of a Parquet file with duplicate reviews removed.

    Rows are grouped by ``(user_id, asin, text)``; within each group the row
    that sorts first by ``year`` is kept. The query does not specify which
    row wins when several rows in a group share the same year.

    Args:
        input_path: Parquet file to read.
        output_path: Parquet file to write the deduplicated rows to.
    """
    import duckdb

    # Double single quotes so each path stays one SQL string literal.
    input_sql = str(input_path).replace("'", "''")
    output_sql = str(output_path).replace("'", "''")

    con = duckdb.connect()
    try:
        print(f"Deduplicating: {input_path}")

        con.execute(f"""
            CREATE TABLE deduplicated AS
            SELECT * EXCLUDE(row_num)
            FROM (
                SELECT *,
                    ROW_NUMBER() OVER (
                        PARTITION BY user_id, asin, text
                        ORDER BY year
                    ) AS row_num
                FROM read_parquet('{input_sql}')
            )
            WHERE row_num = 1;
        """)

        con.execute(f"COPY deduplicated TO '{output_sql}' (FORMAT PARQUET);")
    finally:
        # Release the connection even when a statement fails.
        con.close()

    print(f"Saved deduplicated data → {output_path}")


In [7]:
# NOTE(review): `input` shadows Python's built-in input() for the rest of the
# kernel session — consider a name such as `merged_path`.
# NOTE(review): the source file is read from a "temp_duckdb.db.tmp" subfolder,
# whereas merge_parquet_to_duckdb writes <category>_merged.parquet directly
# into the cleaned_parquet folder — confirm this path is the intended input.
input = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\Code\raw_files\cleaned_parquet\temp_duckdb.db.tmp\Automotive_merged.parquet"
output = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\Code\raw_files\cleaned_parquet\Automotive_dedup.parquet"


deduplicate_parquet(input, output)

🧼 Deduplicating: C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\Code\raw_files\cleaned_parquet\temp_duckdb.db.tmp\Automotive_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Saved deduplicated data → C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\Code\raw_files\cleaned_parquet\Automotive_dedup.parquet


In [9]:
import duckdb

def count_parquet_rows(parquet_path):
    """Count the rows of a Parquet file, print the total and return it.

    Args:
        parquet_path: Path of the Parquet file to count.

    Returns:
        The value of ``COUNT(*)`` over the file.
    """
    # Double single quotes so the path stays one SQL string literal.
    path_sql = str(parquet_path).replace("'", "''")

    con = duckdb.connect()
    try:
        result = con.execute(
            f"SELECT COUNT(*) FROM read_parquet('{path_sql}');"
        ).fetchone()[0]
    finally:
        # Release the connection even when the query fails.
        con.close()

    print(f"🧮 Total rows: {result}")
    return result

In [11]:
# Row count of the merged (not yet deduplicated) Automotive file.
# NOTE(review): absolute, machine-specific path — confirm it still exists
# before re-running.
path1 = r"C:\Users\maian\OneDrive - The University of the West Indies, St. Augustine\Desktop\Code\raw_files\cleaned_parquet\temp_duckdb.db.tmp\Automotive_merged.parquet"

count_parquet_rows(path1)


🧮 Total rows: 19955450


19955450