In [6]:
import duckdb
import glob
import os

def sample_parquet_file(file_path, sample_pct=10):
    """Write a random row sample of one Parquet file into a ``Sampled`` subfolder.

    The source row count is read first; ``sample_pct`` percent of that count
    (rounded down) is then drawn with DuckDB reservoir sampling and written to
    ``<directory of file_path>/Sampled/<name>``, where ``<name>`` is the source
    file name with ``_merged.parquet`` replaced by
    ``_sampled_<sample_pct>pct.parquet``. The ``Sampled`` folder is created if
    it does not exist.

    Args:
        file_path: Path of the Parquet file to sample (``str`` or path-like).
        sample_pct: Percentage of rows to keep. Defaults to 10.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        raised inside the DuckDB section are caught and not re-raised.
    """
    file_path = os.fspath(file_path)
    source_name = os.path.basename(file_path)

    base_dir = os.path.dirname(file_path)
    filename = source_name.replace("_merged.parquet", f"_sampled_{sample_pct}pct.parquet")

    sampled_dir = os.path.join(base_dir, "Sampled")
    os.makedirs(sampled_dir, exist_ok=True)

    output_path = os.path.join(sampled_dir, filename)

    # Paths are embedded in SQL string literals, so a single quote in a file or
    # folder name must be doubled or it terminates the literal early.
    source_literal = file_path.replace("'", "''")
    output_literal = output_path.replace("'", "''")

    temp_table = "temp_sample"
    print(f"Sampling {sample_pct}% from {source_name}")

    # Bound before the try so the finally clause can tell whether a connection
    # was actually opened.
    con = None
    try:
        # A fresh connection is used per file, so the temp table never collides
        # with one left over from a previous call.
        con = duckdb.connect()

        total_rows = con.sql(
            f"SELECT COUNT(*) FROM read_parquet('{source_literal}')"
        ).fetchone()[0]
        sample_rows = int(total_rows * sample_pct / 100)

        con.execute(f"""
            CREATE TABLE {temp_table} AS
            SELECT *
            FROM read_parquet('{source_literal}')
            USING SAMPLE RESERVOIR({sample_rows});
        """)

        con.execute(f"COPY {temp_table} TO '{output_literal}' (FORMAT PARQUET);")

        print(f"✅ Saved sample → {output_path} ({sample_rows:,} rows)")
    except Exception as e:
        # Best effort by design: one bad file must not abort a batch loop.
        print(f"Failed on {source_name}: {e}")
    finally:
        if con is not None:
            con.close()

In [7]:
# Folder holding the per-category "*_merged.parquet" inputs.
CLEANED_DIR = "/root/test"

# Collect every merged file in that folder.
merged_pattern = os.path.join(CLEANED_DIR, "*_merged.parquet")
all_files = glob.glob(merged_pattern)

# Write a 10% sample of each one.
for file in all_files:
    sample_parquet_file(file, sample_pct=10)

Sampling 10% from Video_Games_merged.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

✅ Saved sample → /root/test/Sampled/Video_Games_sampled_10pct.parquet (457,096 rows)


In [12]:
import duckdb
import glob
import os

def combine_parquet_files(CLEANED_DIR, SAMPLE_PERCENT):
    """Concatenate all sampled Parquet files in a folder into one Parquet file.

    Input files are those in ``CLEANED_DIR`` matching
    ``*sampled_<SAMPLE_PERCENT>pct.parquet``. The result is written to
    ``CLEANED_DIR/all_categories_sampled_<SAMPLE_PERCENT>pct.parquet``. Because
    that output name also matches the input pattern, an existing output file is
    explicitly excluded from the inputs so that re-running does not fold the
    previous result back into itself.

    Files may have different columns: the output has the sorted union of all
    column names, and a file lacking a column contributes NULL for it.

    Args:
        CLEANED_DIR: Folder that holds the sampled files and receives the output.
        SAMPLE_PERCENT: Sample percentage used in the file names.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        raised inside the DuckDB section are caught and not re-raised.
    """
    output_path = os.path.join(CLEANED_DIR, f"all_categories_sampled_{SAMPLE_PERCENT}pct.parquet")
    output_abs = os.path.abspath(output_path)

    pattern = os.path.join(CLEANED_DIR, f"*sampled_{SAMPLE_PERCENT}pct.parquet")
    # Sorted for a deterministic input order; the previous output is dropped
    # because its name matches the same pattern.
    sampled_files = sorted(
        path for path in glob.glob(pattern)
        if os.path.abspath(path) != output_abs
    )

    print(f"🔗 Combining {len(sampled_files)} sampled files...")

    if not sampled_files:
        # Without inputs the UNION would be empty and COPY would fail with an
        # unhelpful parser error.
        print(f"❌ Failed to combine: no sampled files found in {CLEANED_DIR}")
        return

    def as_literal(path):
        # Double single quotes so the path is safe inside a SQL string literal.
        return str(path).replace("'", "''")

    def as_identifier(name):
        # Double-quote column names so spaces and reserved words are accepted.
        return '"' + name.replace('"', '""') + '"'

    con = None
    try:
        con = duckdb.connect()

        # Step 1: read each file's column names once.
        columns_by_file = {}
        for path in sampled_files:
            described = con.execute(
                f"DESCRIBE SELECT * FROM read_parquet('{as_literal(path)}')"
            ).fetchall()
            columns_by_file[path] = [row[0] for row in described]

        all_columns = sorted(set().union(*columns_by_file.values()))

        # Step 2: one SELECT per file, with every column in the same position.
        select_stmts = []
        for path in sampled_files:
            present = set(columns_by_file[path])
            select_exprs = [
                as_identifier(col) if col in present else f"NULL AS {as_identifier(col)}"
                for col in all_columns
            ]
            select_stmts.append(
                f"SELECT {', '.join(select_exprs)} FROM read_parquet('{as_literal(path)}')"
            )

        # Step 3: combine and write to disk.
        union_query = " UNION ALL ".join(select_stmts)
        con.execute(f"COPY ({union_query}) TO '{as_literal(output_path)}' (FORMAT PARQUET);")

        print(f"✅ Combined file saved to {output_path}")

    except Exception as e:
        print(f"❌ Failed to combine: {e}")
    finally:
        if con is not None:
            con.close()

In [None]:
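# sample_parquet_file writes its output to <CLEANED_DIR>/Sampled, so point the combiner at that folder:
# combine_parquet_files(os.path.join(CLEANED_DIR, "Sampled"), SAMPLE_PERCENT=10)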