In [9]:
!pip install pandas pyarrow --quiet

In [14]:
import os
from pathlib import Path
import pandas as pd

# Convert one parquet file to CSV.
#
# Inputs : `input_path`  - the parquet file to read (falls back to the first
#                          .parquet file in the same directory if it is missing).
# Outputs: `output_path` - the CSV that is (over)written.
# Raises : FileNotFoundError when no parquet file can be located.

# Paths (adjust if your data directory is different)
input_path = Path('../01-docker-terraform/data/green_tripdata_2020-01.parquet')
output_path = Path('../01-docker-terraform/data/green_tripdata.csv')

# If the expected file doesn't exist, look for any .parquet in the data dir
if not input_path.exists():
    data_dir = input_path.parent
    # Guard against a missing directory: iterdir() on a non-existent path would
    # raise a bare OS error instead of the actionable message below.
    if data_dir.is_dir():
        available = sorted(p for p in data_dir.iterdir() if p.suffix == '.parquet')
    else:
        available = []
    if available:
        print(f'Expected file {input_path} not found. Using first available parquet: {available[0]}')
        input_path = available[0]
        # adjust default output name to match chosen input file
        output_path = data_dir / (input_path.stem + '.csv')
    else:
        raise FileNotFoundError(f'No parquet files found in {data_dir} - please place your parquet file there or update the path.')

# Prefer pyarrow parquet streaming by row-group (lower memory).
# Fall back to pandas.read_parquet if pyarrow is not available or the
# streaming conversion fails for any reason (deliberate best-effort: the
# pandas path rewrites the CSV from scratch, replacing any partial output).
try:
    import pyarrow.parquet as pq
    pf = pq.ParquetFile(str(input_path))
    if pf.num_row_groups == 0:
        # No row groups means the loop below would never run and no CSV would
        # be produced; write a header-only CSV from the file's schema instead.
        df = pf.schema_arrow.empty_table().to_pandas()
        df.to_csv(output_path, index=False)
    mode = 'w'
    for rg in range(pf.num_row_groups):
        table = pf.read_row_group(rg)
        df = table.to_pandas()
        # First row-group truncates the file and writes the header;
        # subsequent row-groups append data rows only.
        df.to_csv(output_path, index=False, mode=mode, header=(mode == 'w'))
        mode = 'a'
    print(f'Converted {input_path} -> {output_path} using pyarrow (row-group streaming)')
except Exception as e:
    print('pyarrow conversion failed or unavailable, falling back to pandas.read_parquet:', e)
    # pandas will load the whole file into memory
    df = pd.read_parquet(input_path)
    df.to_csv(output_path, index=False)
    print(f'Converted {input_path} -> {output_path} using pandas')


Expected file ../01-docker-terraform/data/green_tripdata_2020-01.parquet not found. Using first available parquet: ../01-docker-terraform/data/green_tripdata_2025-11.parquet
Converted ../01-docker-terraform/data/green_tripdata_2025-11.parquet -> ../01-docker-terraform/data/green_tripdata_2025-11.csv using pyarrow (row-group streaming)
Converted ../01-docker-terraform/data/green_tripdata_2025-11.parquet -> ../01-docker-terraform/data/green_tripdata_2025-11.csv using pyarrow (row-group streaming)
