[Reference](https://towardsdatascience.com/stop-using-csvs-for-storage-here-are-the-top-5-alternatives-e3a7c9018de0)

In [1]:
# conda create --name file_formats python=3.8
# conda activate file_formats

# conda install -c conda-forge numpy pandas fastavro pyarrow feather-format jupyter jupyterlab

In [2]:
# jupyter lab

In [4]:
pip install fastavro

Collecting fastavro
  Downloading fastavro-1.4.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 5.4 MB/s 
[?25hInstalling collected packages: fastavro
Successfully installed fastavro-1.4.5


In [9]:
pip install pyarrow



In [10]:
import numpy as np
import pandas as pd

import feather
import pickle
import pyarrow as pa
import pyarrow.orc as orc 
from fastavro import writer, reader, parse_schema

# Seed NumPy's global RNG so the generated data is reproducible.
# (Assigning `np.random.seed = 42` would replace the seeding function with an
# integer instead of calling it, leaving the generator unseeded.)
np.random.seed(42)
df_size = 10_000_000

# Five columns ('a'..'e'), each holding `df_size` uniform random floats.
# Columns are generated in alphabetical order, so the RNG stream is consumed
# in the same order as an explicit a, b, c, d, e dictionary literal.
df = pd.DataFrame({
    column: np.random.rand(df_size)
    for column in ('a', 'b', 'c', 'd', 'e')
})
df.head()

Unnamed: 0,a,b,c,d,e
0,0.594589,0.023753,0.119105,0.215425,0.35481
1,0.151257,0.904785,0.756177,0.665009,0.619944
2,0.948392,0.582541,0.145244,0.965627,0.262283
3,0.421178,0.36837,0.266514,0.768328,0.11612
4,0.756303,0.38864,0.281472,0.603944,0.811477


In [12]:
# Convert the DataFrame to an Arrow table (without the pandas index),
# then persist that table as an ORC file.
table = pa.Table.from_pandas(
    df,
    preserve_index=False,
)
orc.write_table(table, '10M.orc')

In [13]:
# Load the ORC file back into a DataFrame.
df = pd.read_orc(path='10M.orc')

In [14]:
# 1. Define the schema
# The columns hold 64-bit floats, so they are declared as Avro 'double'.
# Avro 'float' is a 32-bit type and would silently drop precision on write.
schema = {
    'doc': 'Float data',
    'name': 'Data',
    'namespace': 'data',
    'type': 'record',
    'fields': [
        {'name': 'a', 'type': 'double'},
        {'name': 'b', 'type': 'double'},
        {'name': 'c', 'type': 'double'},
        {'name': 'd', 'type': 'double'},
        {'name': 'e', 'type': 'double'},
    ]
}
parsed_schema = parse_schema(schema)

# 2. Convert pd.DataFrame to records - list of dictionaries
#    (one dict per row, keyed by column name, as the Avro writer expects)
records = df.to_dict('records')

# 3. Write to Avro file (binary mode is required)
with open('10M.avro', 'wb') as out:
    writer(out, parsed_schema, records)

In [15]:
# 1. Open the Avro file and materialize every record into a list
with open('10M.avro', 'rb') as fo:
    avro_reader = reader(fo)
    avro_records = list(avro_reader)

# 2. Build the DataFrame from the list of record dictionaries
df = pd.DataFrame(avro_records)

In [16]:
# Persist the DataFrame as a Parquet file.
df.to_parquet(path='10M.parquet')

In [17]:
# Load the Parquet file back into a DataFrame.
df = pd.read_parquet(path='10M.parquet')

In [18]:
# Serialize the DataFrame to disk with pickle (file must be opened in binary mode).
with open('10M.pkl', mode='wb') as pkl_out:
    pickle.dump(df, pkl_out)

In [19]:
# Restore the DataFrame from the pickle file.
# NOTE: only unpickle files from a trusted source - unpickling can run arbitrary code.
with open('10M.pkl', mode='rb') as pkl_in:
    df = pickle.load(pkl_in)

In [20]:
# Write the DataFrame in Feather format using pandas' built-in writer
# (backed by pyarrow) rather than the deprecated standalone `feather` package.
df.to_feather('10M.feather')

In [21]:
# Read the Feather file using pandas' built-in reader (backed by pyarrow)
# rather than the deprecated standalone `feather` package.
df = pd.read_feather('10M.feather')