# Convert HDF5 to CSV

**Hardware:** n1-highmem-32, 32 vCPUs, 208 GB memory, 1 TB SSD

In [1]:
import gc
import os

import dask.bag as db
import pandas as pd
from dask.diagnostics import ProgressBar

In [2]:
# Directory holding the input HDF5 files that convert_dir() lists.
HDF5_DIR = '/data/hdf5'
# Directory that receives the generated CSV files.
CSV_DIR = '/data/csv'

In [3]:
def hdf5_to_csv(hdf5_file: str, csv_file: str) -> None:
    """Load one HDF5 file into a DataFrame, sort it, and dump it as CSV.

    The ``timestamp`` column is cast to ``int64`` and floor-divided by
    1,000,000 (presumably nanoseconds -> milliseconds; confirm against the
    dtype stored in the HDF5 files).

    Rows are ordered by ``timestamp`` when the input path contains the
    substring ``'BitMEX'``; otherwise ``trade_id`` is cast to ``int64`` and
    used as the sort key. The CSV is written without the index column.

    Args:
        hdf5_file: Path of the HDF5 file to read.
        csv_file: Path of the CSV file to write; must end with ``.csv``.

    Raises:
        AssertionError: If ``csv_file`` does not end with ``.csv``.
    """
    assert csv_file.endswith('.csv')

    frame = pd.read_hdf(hdf5_file)
    frame['timestamp'] = frame['timestamp'].astype('int64') // 1_000_000

    # The substring test runs against the whole path, not just the file name.
    is_bitmex = 'BitMEX' in hdf5_file
    if not is_bitmex:
        frame['trade_id'] = frame['trade_id'].astype('int64')
    sort_column = 'timestamp' if is_bitmex else 'trade_id'

    frame = frame.sort_values(sort_column, ignore_index=True)
    frame.to_csv(csv_file, index=False)

    # Drop the frame and force a collection so memory is released promptly
    # when many files are converted back to back.
    del frame
    gc.collect()

In [4]:
# Convert a single file on its own (presumably a trial run before the
# whole directory is processed).
hdf5_to_csv(
    hdf5_file='/data/hdf5/OKEx.Swap.XMR_USD.hdf5',
    csv_file='/data/csv/OKEx.Swap.XMR_USD.csv',
)

In [5]:
def convert_dir(hdf5_dir: str, csv_dir: str)->None:
    """Convert every ``.hdf5`` file in *hdf5_dir* to a CSV file in *csv_dir*.

    Each ``<name>.hdf5`` is written to ``<csv_dir>/<name>.csv`` via
    :func:`hdf5_to_csv`. The conversions are dispatched through a dask bag
    and a progress bar is shown while they run.

    Directory entries that are not regular files ending in ``.hdf5`` are
    skipped, so sub-directories and stray files are neither handed to
    ``pd.read_hdf`` nor given a mangled output name.

    Args:
        hdf5_dir: Directory containing the input HDF5 files.
        csv_dir: Directory that receives the CSV files; created if missing.
    """
    suffix = '.hdf5'

    # Create the output directory up front instead of failing inside every
    # worker when it does not exist yet.
    os.makedirs(csv_dir, exist_ok=True)

    # Sorted for a deterministic processing order; os.listdir order is
    # arbitrary.
    jobs = [
        (os.path.join(hdf5_dir, name),
         os.path.join(csv_dir, f'{name[:-len(suffix)]}.csv'))
        for name in sorted(os.listdir(hdf5_dir))
        if name.endswith(suffix)
        and os.path.isfile(os.path.join(hdf5_dir, name))
    ]
    if not jobs:
        # Nothing to convert; avoid building an empty bag.
        return

    with ProgressBar():
        db.from_sequence(jobs).map(
            lambda job: hdf5_to_csv(*job)
        ).compute()

In [6]:
# Convert the files found in HDF5_DIR, writing the CSVs into CSV_DIR.
convert_dir(hdf5_dir=HDF5_DIR, csv_dir=CSV_DIR)

[########################################] | 100% Completed |  5min  5.5s
