In [None]:
import duckdb
import glob
import os

def sample_parquet_file(file_path, sample_pct=10):
    """Write a de-duplicated random sample of one Parquet file into a ``Sampled`` subfolder.

    About ``sample_pct`` percent of the rows are drawn with DuckDB reservoir
    sampling. Exact duplicate rows are then removed from the sample, and the
    sample is topped up with further distinct rows from the source file to
    replace the rows that de-duplication removed. The result is written to
    ``<directory of file_path>/Sampled/`` under the input's file name with
    ``_merged.parquet`` replaced by ``_sampled_<sample_pct>pct.parquet``.

    Note: this notebook defines ``sample_parquet_file`` in more than one cell;
    the most recently executed definition is the one later cells call.

    Args:
        file_path: Path to the source Parquet file.
        sample_pct: Percentage of the source rows to sample (default 10).

    Returns:
        None. Progress is reported with ``print``. Any exception raised while
        sampling is caught and printed, so a caller looping over many files
        carries on with the next one. Creating the output directory happens
        before that guard and can still raise.
    """
    base_dir = os.path.dirname(file_path)
    filename = os.path.basename(file_path).replace("_merged.parquet", f"_sampled_{sample_pct}pct.parquet")

    sampled_dir = os.path.join(base_dir, "Sampled")
    os.makedirs(sampled_dir, exist_ok=True)

    output_path = os.path.join(sampled_dir, filename)

    # The paths are embedded in SQL string literals below. Double every single
    # quote so a path such as "o'brien_merged.parquet" cannot break the statement.
    sql_source = file_path.replace("'", "''")
    sql_output = output_path.replace("'", "''")

    temp_sample_table = "temp_sample"
    temp_unique_table = "temp_unique"
    print(f"Sampling {sample_pct}% from {os.path.basename(file_path)}")

    # Bound before the try so the finally clause can tell "never connected"
    # apart from "connected and must be closed".
    con = None
    try:
        con = duckdb.connect()

        total_rows = con.sql(f"SELECT COUNT(*) FROM read_parquet('{sql_source}')").fetchone()[0]
        sample_rows = int(total_rows * sample_pct / 100)

        con.execute(f"DROP TABLE IF EXISTS {temp_sample_table};")
        con.execute(f"""
            CREATE TABLE {temp_sample_table} AS
            SELECT * FROM read_parquet('{sql_source}')
            USING SAMPLE RESERVOIR({sample_rows});
        """)

        con.execute(f"DROP TABLE IF EXISTS {temp_unique_table};")
        con.execute(f"""
            CREATE TABLE {temp_unique_table} AS
            SELECT DISTINCT * FROM {temp_sample_table};
        """)

        deduped_count = con.sql(f"SELECT COUNT(*) FROM {temp_unique_table}").fetchone()[0]
        num_duplicates = sample_rows - deduped_count
        final_count = deduped_count

        if num_duplicates > 0:
            print(f"⚠️ {num_duplicates} duplicates found. Replenishing from original dataset...")

            # EXCEPT keeps only source rows that are not already in the sample.
            con.execute(f"""
                INSERT INTO {temp_unique_table}
                SELECT * FROM (
                    SELECT * FROM read_parquet('{sql_source}')
                    EXCEPT
                    SELECT * FROM {temp_unique_table}
                )
                USING SAMPLE RESERVOIR({num_duplicates});
            """)

            # The source may not hold enough additional distinct rows, so count
            # what is really in the table instead of assuming the top-up succeeded.
            final_count = con.sql(f"SELECT COUNT(*) FROM {temp_unique_table}").fetchone()[0]
            if final_count < sample_rows:
                print(f"Only {final_count:,} distinct rows available; requested {sample_rows:,}.")

        con.execute(f"COPY {temp_unique_table} TO '{sql_output}' (FORMAT PARQUET);")
        print(f"Final sample saved → {output_path} ({final_count:,} rows)")

    except Exception as e:
        print(f"Failed on {os.path.basename(file_path)}: {e}")
    finally:
        if con is not None:
            try:
                con.close()
            except Exception as close_err:
                print(f"Could not close DuckDB connection for {os.path.basename(file_path)}: {close_err}")

In [None]:
import duckdb
import glob
import os

def sample_parquet_file(file_path, sample_pct=10):
    """Write a random sample of one Parquet file into a ``Sampled`` subfolder.

    ``int(total_rows * sample_pct / 100)`` rows are requested from DuckDB
    reservoir sampling and copied straight to a new Parquet file; no
    de-duplication is performed. The output goes to
    ``<directory of file_path>/Sampled/`` under the input's file name with
    ``_merged.parquet`` replaced by ``_sampled_<sample_pct>pct.parquet``.

    Note: this notebook defines ``sample_parquet_file`` in more than one cell;
    the most recently executed definition is the one later cells call.

    Args:
        file_path: Path to the source Parquet file.
        sample_pct: Percentage of the source rows to sample (default 10).

    Returns:
        None. Progress is reported with ``print``. Any exception raised while
        sampling is caught and printed, so a caller looping over many files
        carries on with the next one. Creating the output directory happens
        before that guard and can still raise.
    """
    base_dir = os.path.dirname(file_path)
    filename = os.path.basename(file_path).replace("_merged.parquet", f"_sampled_{sample_pct}pct.parquet")

    sampled_dir = os.path.join(base_dir, "Sampled")
    os.makedirs(sampled_dir, exist_ok=True)

    output_path = os.path.join(sampled_dir, filename)

    # The paths are embedded in SQL string literals below. Double every single
    # quote so a path such as "o'brien_merged.parquet" cannot break the statement.
    sql_source = file_path.replace("'", "''")
    sql_output = output_path.replace("'", "''")

    print(f"📦 Sampling {sample_pct}% from {os.path.basename(file_path)}")

    # Bound before the try so the finally clause can tell "never connected"
    # apart from "connected and must be closed".
    con = None
    try:
        con = duckdb.connect()

        total_rows = con.sql(f"SELECT COUNT(*) FROM read_parquet('{sql_source}')").fetchone()[0]
        sample_rows = int(total_rows * sample_pct / 100)

        con.execute(f"""
            COPY (
                SELECT * FROM read_parquet('{sql_source}')
                USING SAMPLE RESERVOIR({sample_rows})
            ) TO '{sql_output}' (FORMAT PARQUET);
        """)

        print(f"Sample saved → {output_path} ({sample_rows:,} rows)")

    except Exception as e:
        print(f"Failed on {os.path.basename(file_path)}: {e}")
    finally:
        if con is not None:
            try:
                con.close()
            except Exception as close_err:
                print(f"Could not close DuckDB connection for {os.path.basename(file_path)}: {close_err}")

In [None]:
# Directory that holds the "*_merged.parquet" inputs to be sampled.
CLEANED_DIR = "/root/Merged-Files"

# glob returns matches in arbitrary order; sort so every run processes the
# files (and prints its log lines) in the same order.
all_files = sorted(glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet")))

# Make an empty match visible instead of finishing silently with no output.
if not all_files:
    print(f"No *_merged.parquet files found in {CLEANED_DIR}")

for file in all_files:
    sample_parquet_file(file, sample_pct=10)