In [None]:
import duckdb
import glob
import os

def sample_parquet_file(file_path, sample_pct=10):
    """Write a random sample of a merged parquet file next to the input.

    The input row count is read first, then ``int(total_rows * sample_pct / 100)``
    rows are drawn with DuckDB reservoir sampling and written to a path derived
    from ``file_path`` by replacing ``_merged.parquet`` with
    ``_sampled_{sample_pct}pct.parquet``. Problems are printed instead of raised,
    so a loop over many files keeps going after one failure.

    Args:
        file_path: Path of the parquet file to sample. It must contain
            ``_merged.parquet``; otherwise no distinct output path can be derived
            and the file is skipped.
        sample_pct: Percentage of the input rows to keep.

    Returns:
        None. The result is the sampled parquet file on disk.
    """
    file_name = os.path.basename(file_path)
    output_path = file_path.replace("_merged.parquet", f"_sampled_{sample_pct}pct.parquet")
    if output_path == file_path:
        # Without the marker the "output" would be the input itself, and COPY would
        # overwrite the source data with its own sample.
        print(f"Failed on {file_name}: path does not contain '_merged.parquet'")
        return

    temp_table = "temp_sample"
    # Both paths end up inside single-quoted SQL literals, so double any quote in them.
    source_sql = file_path.replace("'", "''")
    target_sql = output_path.replace("'", "''")
    print(f"Sampling {sample_pct}% from {file_name}")

    con = None
    try:
        con = duckdb.connect()

        total_rows = con.sql(f"SELECT COUNT(*) FROM read_parquet('{source_sql}')").fetchone()[0]
        sample_rows = int(total_rows * sample_pct / 100)

        # Materialize the sample in a scratch table, then export that table.
        con.execute(f"DROP TABLE IF EXISTS {temp_table};")
        con.execute(f"""
            CREATE TABLE {temp_table} AS
            SELECT *
            FROM read_parquet('{source_sql}')
            USING SAMPLE RESERVOIR({sample_rows});
        """)

        con.execute(f"COPY {temp_table} TO '{target_sql}' (FORMAT PARQUET);")
        print(f"‚úÖ Saved sample ‚Üí {output_path} ({sample_rows:,} rows)")
    except Exception as e:
        print(f"Failed on {file_name}: {e}")
    finally:
        # con stays None when duckdb.connect() itself failed; nothing to close then.
        if con is not None:
            try:
                con.close()
            except Exception as close_error:
                print(f"Could not close DuckDB connection for {file_name}: {close_error}")

In [7]:
# Directory containing the per-category "*_merged.parquet" files to sample.
CLEANED_DIR = "/root/Merged-Files"
# glob returns matches in arbitrary order; sort so every run processes (and logs)
# the files in the same sequence.
all_files = sorted(glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet")))

# Write a 10% sample next to each merged file.
for file in all_files:
    sample_parquet_file(file, sample_pct=10)

üé≤ Sampling 10% from Video_Games_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Video_Games_sampled_10pct.parquet (457,096 rows)
üé≤ Sampling 10% from Amazon_Fashion_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Amazon_Fashion_sampled_10pct.parquet (247,569 rows)
üé≤ Sampling 10% from Arts_Crafts_and_Sewing_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Arts_Crafts_and_Sewing_sampled_10pct.parquet (887,638 rows)
üé≤ Sampling 10% from Home_and_Kitchen_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Home_and_Kitchen_sampled_10pct.parquet (6,661,025 rows)
üé≤ Sampling 10% from Baby_Products_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Baby_Products_sampled_10pct.parquet (596,795 rows)
üé≤ Sampling 10% from Unknown_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Unknown_sampled_10pct.parquet (6,381,397 rows)
üé≤ Sampling 10% from Electronics_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Electronics_sampled_10pct.parquet (4,337,427 rows)
üé≤ Sampling 10% from CDs_and_Vinyl_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/CDs_and_Vinyl_sampled_10pct.parquet (477,207 rows)
üé≤ Sampling 10% from Digital_Music_merged.parquet
‚úÖ Saved sample ‚Üí /root/Merged-Files/Digital_Music_sampled_10pct.parquet (12,876 rows)
üé≤ Sampling 10% from Patio_Lawn_and_Garden_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Patio_Lawn_and_Garden_sampled_10pct.parquet (1,631,814 rows)
üé≤ Sampling 10% from Office_Products_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Office_Products_sampled_10pct.parquet (1,271,509 rows)
üé≤ Sampling 10% from Clothing_Shoes_and_Jewelry_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Clothing_Shoes_and_Jewelry_sampled_10pct.parquet (6,603,334 rows)
üé≤ Sampling 10% from Health_and_Household_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Health_and_Household_sampled_10pct.parquet (2,537,070 rows)
üé≤ Sampling 10% from Pet_Supplies_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Pet_Supplies_sampled_10pct.parquet (1,667,302 rows)
üé≤ Sampling 10% from Gift_Cards_merged.parquet
‚úÖ Saved sample ‚Üí /root/Merged-Files/Gift_Cards_sampled_10pct.parquet (15,098 rows)
üé≤ Sampling 10% from All_Beauty_merged.parquet
‚úÖ Saved sample ‚Üí /root/Merged-Files/All_Beauty_sampled_10pct.parquet (69,425 rows)
üé≤ Sampling 10% from Cell_Phones_and_Accessories_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Cell_Phones_and_Accessories_sampled_10pct.parquet (2,057,638 rows)
üé≤ Sampling 10% from Toys_and_Games_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Toys_and_Games_sampled_10pct.parquet (1,608,635 rows)
üé≤ Sampling 10% from Books_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Books_sampled_10pct.parquet (2,947,544 rows)
üé≤ Sampling 10% from Appliances_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Saved sample ‚Üí /root/Merged-Files/Appliances_sampled_10pct.parquet (210,594 rows)


In [12]:
import duckdb
import glob
import os

def combine_parquet_files(CLEANED_DIR, SAMPLE_PERCENT):
    """Combine all sampled parquet files in a directory into one parquet file.

    Every file in ``CLEANED_DIR`` matching ``*sampled_{SAMPLE_PERCENT}pct.parquet``
    (except the combined output itself) is stacked with ``UNION ALL`` and written
    to ``all_categories_sampled_{SAMPLE_PERCENT}pct.parquet`` in the same
    directory. The output has the union of all input columns in sorted name
    order; a column that a file lacks is filled with NULL for that file's rows.
    Problems are printed instead of raised.

    Args:
        CLEANED_DIR: Directory that holds the sampled files and receives the output.
        SAMPLE_PERCENT: Sample percentage that appears in the file names.

    Returns:
        None. The result is the combined parquet file on disk.
    """

    def sql_string(text):
        # Paths go inside single-quoted SQL literals, so double embedded quotes.
        return "'" + text.replace("'", "''") + "'"

    def sql_identifier(name):
        # Double-quote column names so spaces and reserved words stay valid SQL.
        return '"' + name.replace('"', '""') + '"'

    output_path = os.path.join(CLEANED_DIR, f"all_categories_sampled_{SAMPLE_PERCENT}pct.parquet")
    pattern = os.path.join(CLEANED_DIR, f"*sampled_{SAMPLE_PERCENT}pct.parquet")
    # The output name matches the input pattern too. Exclude it so a re-run does not
    # feed the previous combined file back into itself while it is being rewritten.
    # Sorting makes the row order of the result independent of glob's arbitrary order.
    output_abs = os.path.abspath(output_path)
    sampled_files = sorted(
        path for path in glob.glob(pattern) if os.path.abspath(path) != output_abs
    )

    print(f"üîó Combining {len(sampled_files)} sampled files...")

    if not sampled_files:
        # An empty UNION would produce a malformed "COPY ()" statement.
        print(f"‚ùå Failed to combine: no files match {pattern}")
        return

    con = None
    try:
        con = duckdb.connect()

        # Step 1: Read each file's column names once
        columns_by_file = {}
        for file in sampled_files:
            described = con.execute(
                f"DESCRIBE SELECT * FROM read_parquet({sql_string(file)})"
            ).fetchall()
            columns_by_file[file] = [row[0] for row in described]

        all_columns = sorted(set().union(*columns_by_file.values()))

        # Step 2: Build SELECT statements with aligned columns
        select_stmts = []
        for file in sampled_files:
            present = set(columns_by_file[file])

            # Normalize the columns: use real column or NULL AS column
            select_exprs = [
                sql_identifier(col) if col in present else f"NULL AS {sql_identifier(col)}"
                for col in all_columns
            ]

            select_stmt = f"SELECT {', '.join(select_exprs)} FROM read_parquet({sql_string(file)})"
            select_stmts.append(select_stmt)

        # Step 3: Combine and write to disk
        union_query = " UNION ALL ".join(select_stmts)
        con.execute(f"COPY ({union_query}) TO {sql_string(output_path)} (FORMAT PARQUET);")

        print(f"‚úÖ Combined file saved to {output_path}")

    except Exception as e:
        print(f"‚ùå Failed to combine: {e}")
    finally:
        # con stays None when duckdb.connect() itself failed; nothing to close then.
        if con is not None:
            try:
                con.close()
            except Exception as close_error:
                print(f"Could not close DuckDB connection: {close_error}")

In [None]:
# combine_parquet_files(CLEANED_DIR, SAMPLE_PERCENT=10)