In [1]:
import glob
import os
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
deduplicated_categories = [
    'Electronics',
    'Health_and_Houehold',
    'Home_and_Kitchen',
    'Office_Products',
    'Patio_Lawn_and_Garden',
    'Pet_Supplies',
    'Software',
    'Sports_and_Outdoors',
    'Subscription_Boxes',
    'Tools_and_Home_Improvement',
    'Toys_and_Games',
    'Video_Games',
]

In [3]:
# Sample 20% of rows
def sample_rows(input_path, sample_fraction=0.2):
    # Read the parquet file with duckdb or pandas and take a sample
    con = duckdb.connect()
    df = con.execute(f"SELECT * FROM read_parquet('{input_path}') LIMIT 1000000").df()  # Load a subset to sample
    con.close()
    return df.sample(frac=sample_fraction, random_state=42)

In [4]:
# Deduplication process using DuckDB
def deduplicate_parquet(input_path, output_path, dedup_columns, order_column='year'):
    con = duckdb.connect()

    # Assuming the parquet is already sampled (20% of the rows)
    con.execute("DROP TABLE IF EXISTS deduplicated;")
    dedup_key = ', '.join(dedup_columns)

    con.execute(f"""
        CREATE TABLE deduplicated AS
        SELECT * EXCLUDE(row_num)
        FROM (
            SELECT *,
                ROW_NUMBER() OVER (
                    PARTITION BY {dedup_key}
                    ORDER BY {order_column}
                ) AS row_num
            FROM read_parquet('{input_path}', union_by_name=true)
        )
        WHERE row_num = 1;
    """)

    con.execute(f"COPY deduplicated TO '{output_path}' (FORMAT PARQUET);")
    con.close()

    print(f"Deduplicated and saved to: {output_path}\n")

In [5]:
def process_mixed_files(input_dir, output_dir, dedup_columns, order_column='year', chunk_threshold_gb=2.0):
    os.makedirs(output_dir, exist_ok=True)
    all_parquets = glob.glob(os.path.join(input_dir, "*_merged.parquet"))

    print(f"Found {len(all_parquets)} '_merged' files to process...\n")

    for input_path in all_parquets:
        filename = os.path.basename(input_path)
        name_wo_ext = filename.replace("_merged.parquet", "")
        category_name = name_wo_ext  # Assuming the category is embedded in the filename
        output_path = os.path.join(output_dir, f"{name_wo_ext}_deduped.parquet")

        # Check if the category has already been deduplicated
        if category_name in deduplicated_categories:
            print(f"Skipping (already deduplicated): {filename}")
            # Simply read the file into a DuckDB table and write it out again (without processing)
            con = duckdb.connect()
            con.execute(f"""
                CREATE TABLE temp_table AS SELECT * FROM read_parquet('{input_path}');
            """)
            con.execute(f"COPY temp_table TO '{output_path}' (FORMAT PARQUET);")
            con.close()
            continue

        # Deduplicate only if not in the deduplicated list
        print(f"Deduplicating: {filename} (Category: {category_name})")
        file_size_gb = os.path.getsize(input_path) / (1024 ** 3)

        # Sample 20% of the rows before deduplication
        sampled_df = sample_rows(input_path)

        # Save the sampled rows as a temporary parquet file for processing
        sampled_input_path = input_path.replace(".parquet", "_sampled.parquet")
        sampled_df.to_parquet(sampled_input_path)

        # Deduplicate the sampled rows
        deduplicate_parquet(sampled_input_path, output_path, dedup_columns, order_column)

        # Clean up the sampled file
        os.remove(sampled_input_path)

        # After successful deduplication, add the category to the deduplicated list
        deduplicated_categories.append(category_name)

    print("🏁 All deduplication complete.")


In [6]:
deduplicated_categories = [
    'Electronics',
    'Health_and_Houehold',
    'Home_and_Kitchen',
    'Office_Products',
    'Patio_Lawn_and_Garden',
    'Pet_Supplies',
    'Software',
    'Sports_and_Outdoors',
    'Subscription_Boxes',
    'Tools_and_Home_Improvement',
    'Toys_and_Games',
    'Video_Games',
]

In [7]:
# musical_instruments_path = os.path.join(input_dir, "Musical_Instruments" \
# "_merged.parquet")

# # Sample 20% of the rows
# musical_instruments_df = sample_rows(musical_instruments_path, sample_fraction=0.2)

# print(musical_instruments_df.head())
# musical_instruments_df

In [None]:
input_dir = "/root/deduped_mix"
output_dir = "/root/cleaned_parquets"
dedup_columns = ['user_id', 'asin', 'text']  # Specify your deduplication columns
process_mixed_files(input_dir, output_dir, dedup_columns)

Found 33 '_merged' files to process...

Skipping (already deduplicated): Video_Games_merged.parquet
Deduplicating: Amazon_Fashion_merged.parquet (Category: Amazon_Fashion)
Deduplicated and saved to: /root/cleaned_parquets/Amazon_Fashion_deduped.parquet

Skipping (already deduplicated): Software_merged.parquet
Deduplicating: Health_and_Personal_Care_merged.parquet (Category: Health_and_Personal_Care)
Deduplicated and saved to: /root/cleaned_parquets/Health_and_Personal_Care_deduped.parquet

Deduplicating: Arts_Crafts_and_Sewing_merged.parquet (Category: Arts_Crafts_and_Sewing)
Deduplicated and saved to: /root/cleaned_parquets/Arts_Crafts_and_Sewing_deduped.parquet

Skipping (already deduplicated): Home_and_Kitchen_merged.parquet
