In [1]:
import pandas as pd
import pyarrow.parquet as pq
from minio import Minio
import io
import os
from collections import defaultdict

In [2]:
# Connection settings are read from the environment; the fallbacks below
# are the values used when the corresponding variable is unset.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio_access_key")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio_secret_key")

# Only the literal string "true" (case-insensitive) turns TLS on.
secure = os.environ.get("MINIO_SECURE", "False").lower() == "true"

# Shared MinIO client used by the cells below.
client = Minio(
    endpoint=minio_endpoint,
    access_key=minio_access_key,
    secret_key=minio_secret_key,
    secure=secure,
)

In [None]:
# Bucket whose Parquet objects are grouped by their parent prefix.
bucket_name = "silver"

# Recursive listing, so keys nested under prefixes are included.
objects = client.list_objects(bucket_name, recursive=True)

# Maps each folder prefix to the Parquet object keys found under it.
folder_files = defaultdict(list)

for entry in objects:
    key = entry.object_name
    if not key.endswith('.parquet'):
        continue

    parent = os.path.dirname(key)
    # Keys without a parent prefix (objects at the bucket root) are skipped.
    if not parent:
        continue

    folder_files[parent].append(key)

print(f"Found {len(folder_files)} folders with Parquet files.")

Found 6 folders with Parquet files.


In [None]:
# For every folder prefix, download its Parquet parts, merge them into one
# DataFrame, and write the result as "<folder with '/' replaced by '_'>.csv"
# in the current working directory.
for folder, files in folder_files.items():
    print(f"\nProcessing folder: {folder}")
    frames = []

    for parquet_file in files:
        response = client.get_object(bucket_name, parquet_file)
        try:
            data = response.read()
        finally:
            # Always close the stream and return the connection to the pool,
            # even if the read fails part-way through.
            response.close()
            response.release_conn()

        # Parse the downloaded bytes in memory; nothing is written to disk here.
        frames.append(pq.read_table(io.BytesIO(data)).to_pandas())
        print(f"Processed: {parquet_file}")

    # Concatenate once per folder: growing a DataFrame inside the loop
    # re-copies every previously loaded row on each iteration.
    folder_data = pd.concat(frames, ignore_index=True)

    csv_filename = f"{folder.replace('/', '_')}.csv"

    folder_data.to_csv(csv_filename, index=False)
    print(f"Saved {len(folder_data)} rows to {csv_filename}")

print("\nAll folders processed successfully!")


Processing folder: yellow_tripdata_2024-01
Processed: yellow_tripdata_2024-01/part-00001-0074987a-f48b-41e9-a464-bbf4648c7ab8-c000.snappy.parquet
Processed: yellow_tripdata_2024-01/part-00005-0074987a-f48b-41e9-a464-bbf4648c7ab8-c000.snappy.parquet
Processed: yellow_tripdata_2024-01/part-00009-0074987a-f48b-41e9-a464-bbf4648c7ab8-c000.snappy.parquet
Saved 2964624 rows to yellow_tripdata_2024-01.csv

Processing folder: yellow_tripdata_2024-02
Processed: yellow_tripdata_2024-02/part-00001-0d454169-579e-413a-b85a-aae2608dd1b1-c000.snappy.parquet
Processed: yellow_tripdata_2024-02/part-00005-0d454169-579e-413a-b85a-aae2608dd1b1-c000.snappy.parquet
Processed: yellow_tripdata_2024-02/part-00009-0d454169-579e-413a-b85a-aae2608dd1b1-c000.snappy.parquet
Saved 3007526 rows to yellow_tripdata_2024-02.csv

Processing folder: yellow_tripdata_2024-03
Processed: yellow_tripdata_2024-03/part-00001-9e451f7d-c885-4f1c-b16b-1186a1e071a7-c000.snappy.parquet
Processed: yellow_tripdata_2024-03/part-00004-9