In [1]:
from pathlib import Path

import polars as pl

In [8]:
# Convert each of the smaller CSV exports into a Parquet file of the same name.
csv_files = [
    "allplayers",
    "batting",
    "discreps",
    "ejections",
    "fielding",
    "gameinfo",
    "pitching",
    "teamstats",
]

# Make sure the output directory exists so the writes below do not fail on a
# fresh checkout where `parquets/` has not been created yet.
Path("parquets").mkdir(parents=True, exist_ok=True)

for csv_file in csv_files:
    # infer_schema_length=None asks Polars to use the whole file, not just the
    # first rows, when inferring column dtypes.
    df = pl.scan_csv(f"csvs/{csv_file}.csv", infer_schema_length=None)
    # Materialise the frame in memory, then write it out as Parquet.
    df.collect().write_parquet(f"parquets/{csv_file}.parquet")

In [9]:
# Speed test: wall-clock time to fully load the batting table from each format.
import time

for label, load in (
    ("CSV", lambda: pl.scan_csv("csvs/batting.csv", infer_schema_length=None).collect()),
    ("Parquet", lambda: pl.scan_parquet("parquets/batting.parquet").collect()),
):
    # The timer wraps both building the lazy scan and collecting it.
    time_start = time.perf_counter()
    load()
    print(f"{label}: {time.perf_counter() - time_start:.2f}s")


CSV: 10.86s
Parquet: 0.20s


In [None]:
# Add the big one (plays): go from the CSV scan straight to a Parquet sink
# instead of collecting the frame first.
plays_csv = pl.scan_csv("csvs/plays.csv", infer_schema_length=None)
plays_csv.sink_parquet("parquets/plays.parquet")


More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  .sink_parquet("parquets/plays.parquet")


In [None]:
# Partition the plays file by year
# Step 1: collect the distinct year prefixes found in the `date` column.
# NOTE(review): assumes the first four characters of `date`, rendered as text,
# are the year (e.g. YYYYMMDD or YYYY-MM-DD) - confirm against the data.
unique_years = (
    pl.scan_parquet("parquets/plays.parquet")
    .select(pl.col("date").cast(pl.String).str.slice(0, 4).alias("year"))
    .unique()
    # A null `date` would yield a None entry here; comparing against None in a
    # later filter matches no rows, so drop it rather than emit an empty file.
    .drop_nulls()
    # `unique()` makes no ordering promise; sort so every run sees the same
    # year order.
    .sort("year")
    .collect()
    .to_series()
    .to_list()
)

In [7]:
# Step 2: write one Parquet file per year under parquets/plays/.
# Create the target directory up front so the first write does not fail when
# `parquets/plays/` does not exist yet.
Path("parquets/plays").mkdir(parents=True, exist_ok=True)

# Loop-invariant pieces, built once: the lazy scan of the full plays file and
# the expression extracting the four-character year prefix of `date`.
plays_lazy = pl.scan_parquet("parquets/plays.parquet")
year_of_play = pl.col("date").cast(pl.String).str.slice(0, 4)

for year in unique_years:
    (
        plays_lazy
        # Only the rows of a single year are materialised at a time.
        .filter(year_of_play == year)
        .collect()
        .write_parquet(f"parquets/plays/{year}.parquet")
    )