In [5]:
from pathlib import Path
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd

In [6]:
# Root of the partitioned usage data; the globs below expect the layout
# city=<name>/year=<int>/month=<int> beneath it.
BASE = Path("/workspaces/projekt-datascience/data/parquet/bronze/usage")


def _partition_value(directory):
    """Return the value part of a ``key=value`` directory name."""
    return directory.name.partition("=")[2]


# Collect one (city, year, month, directory) tuple per leaf partition,
# walking each level in sorted order so the result is deterministic.
parts = []
for city_dir in sorted(BASE.glob("city=*")):
    city = _partition_value(city_dir)
    for year_dir in sorted(city_dir.glob("year=*")):
        year = int(_partition_value(year_dir))
        for month_dir in sorted(year_dir.glob("month=*")):
            month = int(_partition_value(month_dir))
            parts.append((city, year, month, month_dir))

print(f"Gefundene Partitionen: {len(parts)}")

Gefundene Partitionen: 202


In [7]:
def head_parquet_partition(pdir, n=10):
    """Return the first ``n`` rows of a Parquet dataset as a pandas DataFrame.

    Parameters
    ----------
    pdir : str or os.PathLike
        Location passed to ``pyarrow.dataset.dataset`` and read as Parquet.
    n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows in scan order. The frame is empty (``df.empty`` is
        true) when ``n <= 0`` or when the dataset contains no rows.
    """
    dset = ds.dataset(str(pdir), format="parquet")

    # Nothing requested: keep returning a bare, column-less frame.
    if n <= 0:
        return pd.DataFrame()

    # Dataset.head performs the "read batches until n rows are collected"
    # loop internally, so there is no need to slice record batches by hand.
    # The small batch size keeps the amount of data decoded for a preview low.
    tbl = dset.head(n, batch_size=4096)
    return tbl.to_pandas()

# Print a 10-row preview for every discovered partition.
for city, year, month, pdir in parts:
    print(f"\n===== {city} {year:04d}-{month:02d} =====")
    df_head = head_parquet_partition(pdir, n=10)
    if not df_head.empty:
        # Attach the partition keys parsed from the directory names as
        # columns so each printed preview is self-describing.
        df_head = df_head.assign(city=city, year=year, month=month)
        print(df_head.to_string(index=False))
    else:
        print("(leer)")


===== montreal 2014-10 =====
      start_date start_station_code         end_date end_station_code duration_sec is_member     city  year  month
2014-10-01 00:00               6236 2014-10-01 00:12             6302          762         1 montreal  2014     10
2014-10-01 00:00               6221 2014-10-01 00:06             6199          316         1 montreal  2014     10
2014-10-01 00:01               6097 2014-10-01 00:07             6063          338         1 montreal  2014     10
2014-10-01 00:01               6094 2014-10-01 00:10             6173          517         1 montreal  2014     10
2014-10-01 00:01               6410 2014-10-01 00:12             6210          614         1 montreal  2014     10
2014-10-01 00:05               6136 2014-10-01 00:16             6243          685         0 montreal  2014     10
2014-10-01 00:02               6280 2014-10-01 00:12             6204          619         1 montreal  2014     10
2014-10-01 00:02               6012 2014-10-01 00: