In [1]:
import polars as pl
import os
from io import StringIO

In [2]:
# CONFIG
INPUT_FILE = 'sales.csv'
OUTPUT_DIR = 'micro_partitions_polars'
MAX_FILE_SIZE_MB = 10
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
MAX_FILES = 10
CHUNK_SIZE = 1000  # how many rows to process per check

In [3]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# Load and prepare data
print("📥 Loading CSV with Polars...")
df = pl.read_csv(INPUT_FILE)

# Convert 'SalesDate' from string to datetime
df = df.with_columns(
    pl.col("SalesDate").str.to_datetime().alias("SalesDate")
)

# Extract the date part
df = df.with_columns(
    pl.col("SalesDate").dt.date().alias("SalesDate")
)

df = df.sort("SalesDate")
total_rows = df.shape[0]

# Init
file_counter = 1
rows_processed = 0
buffer = pl.DataFrame()

print(f"🚀 Starting Polars split | Target: {MAX_FILES} files @ {MAX_FILE_SIZE_MB}MB max each\n")

# Process in chunks
for i in range(0, total_rows, CHUNK_SIZE):
    chunk = df.slice(i, CHUNK_SIZE)
    buffer = pl.concat([buffer, chunk])

    csv_str = buffer.write_csv()
    current_size = len(csv_str.encode("utf-8"))

    rows_processed += chunk.shape[0]
    print(
        f"🔄 Processed: {rows_processed:,}/{total_rows:,} | "
        f"Buffer Size: {round(current_size / 1024 / 1024, 2)} MB | "
        f"File: {file_counter}/{MAX_FILES}",
        end="\r"
    )

    if current_size >= MAX_FILE_SIZE_BYTES:
        output_path = os.path.join(OUTPUT_DIR, f"sales_partition_{file_counter}.csv")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(csv_str)
        print(f"\n✅ Written {output_path} | Size: {round(current_size / 1024 / 1024, 2)} MB")

        file_counter += 1
        buffer = pl.DataFrame()

        if file_counter > MAX_FILES:
            print("🛑 Max file limit reached.")
            break

# Write leftover rows if any
if buffer.height > 0 and file_counter <= MAX_FILES:
    final_path = os.path.join(OUTPUT_DIR, f"sales_partition_{file_counter}.csv")
    with open(final_path, "w", encoding="utf-8") as f:
        f.write(buffer.write_csv())
    print(f"\n✅ Final file written: {final_path} | Rows: {buffer.height}")

print("\n🎉 Done! Files split using Polars.")

📥 Loading CSV with Polars...
🚀 Starting Polars split | Target: 10 files @ 10MB max each

🔄 Processed: 179,000/6,758,125 | Buffer Size: 10.06 MB | File: 1/10
✅ Written micro_partitions_polars\sales_partition_1.csv | Size: 10.06 MB
🔄 Processed: 347,000/6,758,125 | Buffer Size: 10.05 MB | File: 2/10
✅ Written micro_partitions_polars\sales_partition_2.csv | Size: 10.05 MB
🔄 Processed: 515,000/6,758,125 | Buffer Size: 10.05 MB | File: 3/10
✅ Written micro_partitions_polars\sales_partition_3.csv | Size: 10.05 MB
🔄 Processed: 683,000/6,758,125 | Buffer Size: 10.05 MB | File: 4/10
✅ Written micro_partitions_polars\sales_partition_4.csv | Size: 10.05 MB
🔄 Processed: 851,000/6,758,125 | Buffer Size: 10.05 MB | File: 5/10
✅ Written micro_partitions_polars\sales_partition_5.csv | Size: 10.05 MB
🔄 Processed: 1,019,000/6,758,125 | Buffer Size: 10.05 MB | File: 6/10
✅ Written micro_partitions_polars\sales_partition_6.csv | Size: 10.05 MB
🔄 Processed: 1,187,000/6,758,125 | Buffer Size: 10.05 MB | File