In [1]:
import os
import pandas as pd
import fastavro

In [9]:
# Default schema for the order-payments CSV converted in this notebook.
# Every field is a union with 'null', so missing cells are written as nulls.
_DEFAULT_AVRO_SCHEMA = {
    'type': 'record',
    'name': 'MyRecord',
    'fields': [
        {'name': 'order_id', 'type': ['null', 'string']},
        {'name': 'payment_sequential', 'type': ['null', 'int']},
        {'name': 'payment_type', 'type': ['null', 'string']},
        {'name': 'payment_installments', 'type': ['null', 'int']},
        {'name': 'payment_value', 'type': ['null', 'float']}
    ]
}

_INTEGER_TYPES = ('int', 'long')
_FLOATING_TYPES = ('float', 'double')


def _integer_field_names(avro_schema):
    """Return names of record fields typed as int/long with no float/double branch."""
    if not isinstance(avro_schema, dict):
        return set()

    names = set()
    for field in avro_schema.get('fields', []):
        declared = field['type']
        branches = declared if isinstance(declared, list) else [declared]
        # Only plain primitive names are inspected; nested/complex types are skipped.
        primitives = [branch for branch in branches if isinstance(branch, str)]
        holds_int = any(name in _INTEGER_TYPES for name in primitives)
        holds_float = any(name in _FLOATING_TYPES for name in primitives)
        if holds_int and not holds_float:
            names.add(field['name'])
    return names


def _to_avro_records(df, integer_fields):
    """Turn a DataFrame into record dicts with NaN -> None and integral floats -> int."""
    records = []
    for row in df.to_dict(orient='records'):
        cleaned = {}
        for column, value in row.items():
            if pd.isna(value):
                # pandas marks empty cells as NaN; the Avro 'null' branch needs None.
                value = None
            elif (
                column in integer_fields
                and isinstance(value, float)
                and value.is_integer()
            ):
                # An integer column containing any empty cell is read as float64,
                # so whole-number floats are converted back to int here.
                value = int(value)
            cleaned[column] = value
        records.append(cleaned)
    return records


def csv_to_avro(input_csv, output_avro, avro_schema=None):
    """Convert a CSV file to an Avro file, skipping the work if the output exists.

    Parameters
    ----------
    input_csv : str or path-like
        Path of the CSV file to read with ``pd.read_csv``.
    output_avro : str or path-like
        Path of the Avro file to create. If it already exists, a message is
        printed and nothing is read or written.
    avro_schema : dict, optional
        Avro schema passed to ``fastavro.writer``. Defaults to the
        order-payments schema (``order_id``, ``payment_sequential``,
        ``payment_type``, ``payment_installments``, ``payment_value``).

    Returns
    -------
    None

    Raises
    ------
    Exception
        Any error raised while reading the CSV or writing the Avro data is
        propagated. In that case no file is left at ``output_avro``, so a
        later call retries the conversion instead of skipping it.
    """
    # Check if the Avro file already exists
    if os.path.exists(output_avro):
        print(f"Avro file {os.path.basename(output_avro)} already exists. Skipped.")
        return

    if avro_schema is None:
        avro_schema = _DEFAULT_AVRO_SCHEMA

    # Read CSV data and convert it to Avro-compatible records (dicts)
    df = pd.read_csv(input_csv)
    records = _to_avro_records(df, _integer_field_names(avro_schema))

    # Write to a temporary sibling first and promote it only on success, so a
    # failed write never leaves a partial file that the exists-check would skip.
    tmp_path = f"{output_avro}.tmp"
    try:
        with open(tmp_path, 'wb') as avro_file:
            fastavro.writer(avro_file, avro_schema, records)
        os.replace(tmp_path, output_avro)
    finally:
        # After a successful replace the temporary file is gone; this only
        # fires when the write failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"Converted {os.path.basename(input_csv)} to {os.path.basename(output_avro)}")



In [10]:
# Example usage: convert the order-payments CSV to an Avro file stored
# alongside it. Re-running this cell is a no-op once the Avro file exists.
input_csv_path, output_avro_path = (
    '../data/olist/olist_order_payments_dataset.csv',
    '../data/olist/olist_order_payments_dataset.avro',
)

csv_to_avro(input_csv=input_csv_path, output_avro=output_avro_path)

Converted olist_order_payments_dataset.csv to olist_order_payments_dataset.avro
