In [1]:
import duckdb
import os
import pyarrow.parquet as pq

def _sql_literal(value):
    """Return ``value`` rendered as a single-quoted SQL string literal."""
    # Doubling embedded single quotes keeps a value such as "o'brien.csv"
    # from terminating the literal early.
    return "'" + str(value).replace("'", "''") + "'"


def _sql_identifier(name):
    """Return ``name`` rendered as a double-quoted SQL identifier."""
    # Quoting lets file-derived names containing hyphens, spaces or a leading
    # digit be used as table names.
    return '"' + str(name).replace('"', '""') + '"'


def _csv_types_argument(column_types):
    """Return the optional ``, types=...`` argument text for ``read_csv_auto``.

    An empty/missing value yields an empty string so no ``types`` argument is
    sent at all. A dict is rendered as ``{'col': 'TYPE', ...}``; any other
    iterable is rendered as a list ``['TYPE', ...]``.
    """
    if not column_types:
        return ""
    if isinstance(column_types, dict):
        pairs = ", ".join(
            f"{_sql_literal(column)}: {_sql_literal(sql_type)}"
            for column, sql_type in column_types.items()
        )
        return f", types={{{pairs}}}"
    items = ", ".join(_sql_literal(sql_type) for sql_type in column_types)
    return f", types=[{items}]"


def convert_file(file_path, output_folder, output_format='duckdb', file_name=None, table_name=None, json_options=None, csv_options=None):
    """Load a CSV, Parquet or JSON file into a DuckDB table and export it.

    The input is first materialised as a table in an on-disk DuckDB database
    inside ``output_folder`` and then, unless ``output_format`` is
    ``'duckdb'``, exported to the requested format.

    Errors are not re-raised: any exception is caught, reported with
    ``print`` together with troubleshooting hints, and the function returns.

    Args:
        file_path: Path of the input file; the extension (``.csv``,
            ``.parquet`` or ``.json``, case-insensitive) selects the reader.
        output_folder: Directory for the output; created if missing.
        output_format: One of ``'duckdb'``, ``'parquet'``, ``'csv'``,
            ``'json'``.
        file_name: Output file name inside ``output_folder``. Defaults to
            ``<input base name>.<output_format>``. It is used verbatim, so
            include the extension if one is wanted.
        table_name: Name of the DuckDB table. Defaults to the input base
            name. The name is quoted as a single identifier.
        json_options: Optional dict with ``format`` and ``records`` passed to
            ``read_json`` (both default to ``'auto'``).
        csv_options: Optional dict with ``sample_size`` (default 1000000),
            ``dateformat`` (default ``'%m/%d/%Y'``), ``all_varchar`` (default
            ``False``, interpreted by truthiness) and ``types`` (column type
            overrides, omitted from the query when empty).

    Returns:
        None.
    """
    supported_inputs = ('.csv', '.parquet', '.json')
    supported_outputs = ('duckdb', 'parquet', 'csv', 'json')

    # Bound before the try so the finally clause can always test it, even
    # when the failure happens before the connection is opened.
    conn = None
    try:
        file_ext = os.path.splitext(file_path)[1].lower()
        # Fail early with a clear message instead of a later "table does not
        # exist" error (input) or a false success message (output).
        if file_ext not in supported_inputs:
            raise ValueError(
                f"Unsupported input file type '{file_ext}'; expected one of {supported_inputs}"
            )
        if output_format not in supported_outputs:
            raise ValueError(
                f"Unsupported output format '{output_format}'; expected one of {supported_outputs}"
            )

        # Ensure output folder exists
        os.makedirs(output_folder, exist_ok=True)

        base_name = os.path.splitext(os.path.basename(file_path))[0]
        file_name = file_name or f"{base_name}.{output_format}"
        output_file_path = os.path.join(output_folder, file_name)
        table_name = table_name or base_name

        # Use an on-disk DuckDB file to minimize memory usage. When the
        # database itself is the requested output it is written to the
        # requested file name, so the path reported below really exists.
        if output_format == 'duckdb':
            duckdb_file_path = output_file_path
        else:
            duckdb_file_path = os.path.join(output_folder, f"{base_name}.duckdb")
        conn = duckdb.connect(database=duckdb_file_path)

        # Default CSV/JSON options if not provided
        csv_options = csv_options or {}
        json_options = json_options or {}

        source = _sql_literal(file_path)
        table = _sql_identifier(table_name)

        if file_ext == '.csv':
            sample_size = int(csv_options.get('sample_size', 1000000))
            dateformat = _sql_literal(csv_options.get('dateformat', '%m/%d/%Y'))
            all_varchar = 'true' if csv_options.get('all_varchar', False) else 'false'
            types_argument = _csv_types_argument(csv_options.get('types'))
            reader = (
                f"read_csv_auto({source}, sample_size={sample_size}, "
                f"dateformat={dateformat}, all_varchar={all_varchar}{types_argument})"
            )
        elif file_ext == '.parquet':
            reader = f"read_parquet({source})"
        else:  # '.json' -- the only remaining supported extension
            format_type = _sql_literal(json_options.get('format', 'auto'))
            records = _sql_literal(json_options.get('records', 'auto'))
            reader = f"read_json({source}, format={format_type}, records={records})"

        # OR REPLACE keeps re-runs working: the database file persists on
        # disk, so a plain CREATE TABLE fails the second time.
        conn.execute(f"CREATE OR REPLACE TABLE {table} AS SELECT * FROM {reader}")

        # Export to the desired format
        target = _sql_literal(output_file_path)
        if output_format == 'parquet':
            conn.execute(f"COPY (SELECT * FROM {table}) TO {target} (FORMAT PARQUET)")

        elif output_format == 'csv':
            conn.execute(f"COPY (SELECT * FROM {table}) TO {target} (FORMAT CSV, HEADER)")

        elif output_format == 'json':
            # Exported through pandas as newline-delimited records; note that
            # this fetches the whole table into memory.
            df = conn.execute(f"SELECT * FROM {table}").fetchdf()
            df.to_json(output_file_path, orient='records', lines=True)

        elif output_format == 'duckdb':
            # The table is already stored in the DuckDB file, no need for further actions
            print(f"DuckDB file created at {duckdb_file_path}")

        print(f"File successfully converted to {output_format} at {output_file_path}")

    except Exception as e:
        print(f"An error occurred during file conversion: {e}")
        print("Troubleshooting steps:")
        print(f"1. Ensure that the file at {file_path} exists.")
        print("2. Check if the file formatting is correct.")
        print("3. Try adjusting the options like `format`, `records`, or column types.")
    finally:
        if conn is not None:
            conn.close()




In [4]:
# Input Parquet file and the directory that receives the converted output
file_path = '/home/christianocean/quiv/nfl.parquet'
output_folder = '/home/christianocean/evidencenfl/data'

# Overrides for the default (input-derived) output file name and table name.
# NOTE: 'nfl' carries no extension, so the export is written to
# <output_folder>/nfl exactly as given.
custom_file_name = 'nfl'
custom_table_name = 'pbp'

# Arguments follow convert_file's positional order:
# file_path, output_folder, output_format, file_name, table_name
convert_file(file_path, output_folder, 'parquet', custom_file_name, custom_table_name)


DuckDB file created at /home/christianocean/evidencenfl/data/nfl.duckdb
File successfully converted to duckdb at /home/christianocean/evidencenfl/data/nfl


In [3]:
import polars as pl
import pyarrow.parquet as pq

def process_in_chunks(parquet_file: str, date_columns: list, output_parquet_file: str, date_format: str = None, chunk_size: int = 10000):
    """Parse string columns of a Parquet file as dates, one batch at a time.

    The input is read in batches of ``chunk_size`` rows. In every batch each
    column named in ``date_columns`` has surrounding spaces stripped and is
    parsed to ``pl.Date`` (non-strict parsing). Each processed batch is
    appended to the output file straight away, so only one batch is held in
    memory at a time.

    The output is written to a temporary sibling file and moved into place
    once complete, so ``output_parquet_file`` may be the same path as
    ``parquet_file`` and a failed run leaves no partial output behind.

    Args:
        parquet_file: Path of the Parquet file to read.
        date_columns: Names of the string columns to convert to dates.
        output_parquet_file: Path of the Parquet file to write.
        date_format: ``strptime`` format string; ``None`` is passed through
            to polars unchanged.
        chunk_size: Number of rows per batch.
    """
    import os  # local import: this cell's import block does not provide `os`

    def parse_dates(frame):
        # A single with_columns call converts every requested column at once.
        if not date_columns:
            return frame
        return frame.with_columns([
            pl.col(name)
            .str.strip_chars(" ")
            .str.strptime(pl.Date, format=date_format, strict=False)
            .alias(name)
            for name in date_columns
        ])

    parquet_reader = pq.ParquetFile(parquet_file)
    partial_path = f"{output_parquet_file}.partial"
    writer = None
    output_schema = None
    try:
        for batch in parquet_reader.iter_batches(batch_size=chunk_size):
            table = parse_dates(pl.from_arrow(batch)).to_arrow()
            if writer is None:
                # The first processed batch fixes the schema of the output.
                output_schema = table.schema
                writer = pq.ParquetWriter(partial_path, output_schema)
            elif table.schema != output_schema:
                table = table.cast(output_schema)
            writer.write_table(table)

        if writer is None:
            # No batches at all: still emit a file, carrying the (converted)
            # schema with zero rows.
            empty_input = pl.from_arrow(parquet_reader.schema_arrow.empty_table())
            table = parse_dates(empty_input).to_arrow()
            writer = pq.ParquetWriter(partial_path, table.schema)
            writer.write_table(table)

        writer.close()
        writer = None
        os.replace(partial_path, output_parquet_file)
    finally:
        if writer is not None:
            writer.close()
        # Only present here when something failed before the final move.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"New Parquet file saved at: {output_parquet_file}")

# Source Parquet file whose date columns are stored as strings
input_parquet_file = '/home/christianocean/oceandatachallengesdemo/data/testing/DOB_Permit_Issuance.parquet'

# Destination for the copy with parsed date columns
output_parquet_file = '/home/christianocean/oceandatachallengesdemo/data/testing/DOB_Permit_Issuance_converted_chunks.parquet'

# Columns to parse as dates
date_columns = [
    'Filing Date',
    'Issuance Date',
    'Expiration Date',
    'Job Start Date',
]

# strptime pattern used for parsing; set to None to let polars decide
date_format = "%Y-%m-%d"

# Convert the file batch by batch and write the result
process_in_chunks(
    parquet_file=input_parquet_file,
    date_columns=date_columns,
    output_parquet_file=output_parquet_file,
    date_format=date_format,
    chunk_size=10000,
)


: 

In [1]:
import polars as pl
from rich.console import Console
from rich.table import Table

def analyze_parquet(file_path):
    """Print the schema and row count of a Parquet file as a rich table.

    The schema is read with ``pl.read_parquet_schema`` and the row count is
    computed through a lazy scan, so the file's data is not loaded into a
    DataFrame just to describe it.

    Args:
        file_path: Path of the Parquet file to describe.
    """
    # Column name -> polars data type, without materialising the data
    schema = pl.read_parquet_schema(file_path)

    # Row count via a lazy scan instead of loading the whole file
    row_count = pl.scan_parquet(file_path).select(pl.len()).collect().item()

    # Create a console object for rich output
    console = Console()

    # Create a table to display the schema with enhanced colors
    table = Table(title="Parquet Schema", title_style="bold green")

    # Add columns for column names and data types
    table.add_column("Column Name", justify="left", style="bold yellow", no_wrap=True)
    table.add_column("Data Type", justify="left", style="bold cyan")

    # One row per column of the file
    for col_name, col_dtype in schema.items():
        table.add_row(col_name, str(col_dtype), style="white on black")

    # Print the schema table
    console.print(table)

    # Print the number of rows
    console.print(f"[bold magenta]\nNumber of rows:[/] [bold white]{row_count}[/]")

# Show the schema and row count of the DOB permit issuance dataset
file_path = '/home/christianocean/oceandatachallengesdemo/data/testing/DOB_Permit_Issuance.parquet'
analyze_parquet(file_path=file_path)
