# Compare file formats - array

In [1]:
import os
import numpy as np
import pandas as pd

n = 1000  # side length of the square benchmark array

# Seed the generator so every run benchmarks exactly the same data.
seed = 0
rng = np.random.default_rng(seed)

# n x n array of uniform samples used as the payload for every format below.
data_array = rng.uniform(size=(n, n))

In [2]:
# Accumulates one row per file format:
# [format name, file size, write time, read time, exact-match flag].
dataset_comparison = list()

## CSV

Write speed:

In [3]:
# Benchmark writing the array as a text file. -o makes %timeit return its
# TimeitResult, which the next cell picks up through `_`.
%timeit -o np.savetxt('data_array.txt', data_array)

690 ms ± 73.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 690 ms ± 73.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [4]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
write_speed = _.average * 1000

Read speed:

In [5]:
# Benchmark reading the text file back into an array. The name assigned here is
# not relied on afterwards: the file is loaded again below for the equality check.
%timeit -o data_array_csv = np.loadtxt('data_array.txt')

294 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 294 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [6]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
read_speed = _.average * 1000

Datasets match:

In [7]:
# Reload the text file outside the timing loop and verify that every element
# round-trips exactly.
data_array_csv = np.loadtxt('data_array.txt')
csv_elements_equal = data_array == data_array_csv
data_array_match = np.all(csv_elements_equal)
data_array_match

True

Dataset size:

In [8]:
# On-disk size of the text file: bytes divided by 1024 * 1024.
data_array_size = os.path.getsize('data_array.txt') / 2**20

Add data to comparison list:

In [9]:
# Record the CSV results: name, size, write time, read time, exact match.
dataset_comparison.append(
    ['CSV', data_array_size, write_speed, read_speed, data_array_match]
)

## npy

Write speed:

In [10]:
# Benchmark writing the array in NumPy's .npy format. -o makes %timeit return
# its TimeitResult, which the next cell picks up through `_`.
%timeit -o np.save('data_array.npy', data_array)

13.8 ms ± 319 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 13.8 ms ± 319 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [11]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
write_speed = _.average * 1000

Read speed:

In [12]:
# Benchmark reading the .npy file back into an array. The name assigned here is
# not relied on afterwards: the file is loaded again below for the equality check.
%timeit -o data_array_npy = np.load('data_array.npy')

2.72 ms ± 80.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 2.72 ms ± 80.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [13]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
read_speed = _.average * 1000

Datasets match:

In [14]:
# Reload the .npy file outside the timing loop and verify that every element
# round-trips exactly.
data_array_npy = np.load('data_array.npy')
npy_elements_equal = data_array == data_array_npy
data_array_match = np.all(npy_elements_equal)
data_array_match

True

Dataset size:

In [15]:
# On-disk size of the .npy file: bytes divided by 1024 * 1024.
data_array_size = os.path.getsize('data_array.npy') / 2**20

Add data to comparison list:

In [16]:
# Record the npy results: name, size, write time, read time, exact match.
dataset_comparison.append(
    ['npy', data_array_size, write_speed, read_speed, data_array_match]
)

## HDF5

In [17]:
import h5py

Write speed:

In [18]:
%%timeit -o

h5_file = h5py.File('data_array.h5', 'w')
h5_file.create_dataset('data_array', data=data_array)
h5_file.close()

27 ms ± 5.95 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 27 ms ± 5.95 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [19]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
write_speed = _.average * 1000

Read speed:

In [20]:
%%timeit -o

h5_file = h5py.File('data_array.h5', 'r')
data_array_h5 = h5_file['data_array'][()]
h5_file.close()

3.97 ms ± 264 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 3.97 ms ± 264 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [21]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
read_speed = _.average * 1000

Datasets match:

In [22]:
# Reload the dataset outside the timing loop. The context manager guarantees
# the file is closed even if the read fails.
with h5py.File('data_array.h5', 'r') as h5_file:
    data_array_h5 = h5_file['data_array'][()]

# Verify that every element round-trips exactly.
data_array_match = np.all(data_array == data_array_h5)
data_array_match

True

Dataset size:

In [23]:
# On-disk size of the HDF5 file: bytes divided by 1024 * 1024.
data_array_size = os.path.getsize('data_array.h5') / 2**20

Add data to comparison list:

In [24]:
# Record the HDF5 results: name, size, write time, read time, exact match.
dataset_comparison.append(
    ['HDF5', data_array_size, write_speed, read_speed, data_array_match]
)

## NetCDF4

In [25]:
import xarray as xr

Write speed:

In [26]:
# Benchmark wrapping the array in an xarray DataArray and writing it to a
# NetCDF file with the h5netcdf engine. -o makes %timeit return its
# TimeitResult, which the next cell picks up through `_`.
%timeit -o xr.DataArray(data_array).to_netcdf('data_array.nc', engine='h5netcdf')

28.8 ms ± 7.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


<TimeitResult : 28.8 ms ± 7.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)>

In [27]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
write_speed = _.average * 1000

Read speed:

In [28]:
%timeit -o data_array_xarray = xr.open_dataarray('data_array.nc', engine='h5netcdf').to_numpy()

12.2 ms ± 379 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 12.2 ms ± 379 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [29]:
# `_` is the TimeitResult shown by the preceding timing cell, so this cell must
# run directly after it. .average is in seconds; convert to milliseconds.
read_speed = _.average * 1000

Datasets match:

In [30]:
# Reload the NetCDF file outside the timing loop. load_dataarray reads the data
# into memory and closes the file instead of leaving the handle open.
data_array_xarray = xr.load_dataarray('data_array.nc', engine='h5netcdf').to_numpy()

# Verify that every element round-trips exactly.
data_array_match = np.all(data_array == data_array_xarray)
data_array_match

True

Dataset size:

In [31]:
# On-disk size of the NetCDF file: bytes divided by 1024 * 1024.
data_array_size = os.path.getsize('data_array.nc') / 2**20

Add data to comparison list:

In [32]:
# Record the NetCDF4 results: name, size, write time, read time, exact match.
dataset_comparison.append(
    ['NetCDF4', data_array_size, write_speed, read_speed, data_array_match]
)

# Compare file formats - summary

In [33]:
# Show the raw result rows (one list per file format) before tabulating them.
dataset_comparison

[['CSV', 23.84185791015625, 690.147840143514, 294.35324600074506, True],
 ['npy', 7.6295166015625, 13.79394823858872, 2.7185198442748515, True],
 ['HDF5', 7.63134765625, 27.043306650001405, 3.9708429342863387, True],
 ['NetCDF4', 7.637290954589844, 28.81735994295533, 12.23481194998645, True]]

In [34]:
# Tabulate the collected rows; the column order mirrors the order in which
# each row was assembled when it was appended.
dataset_comparison_df = pd.DataFrame(
    data=dataset_comparison,
    columns=[
        'File format',
        'File size [MB]',
        'Write time [ms]',
        'Read time [ms]',
        'Data matches exactly',
    ],
)

In [35]:
# Display the summary table.
dataset_comparison_df

Unnamed: 0,File format,File size [MB],Write time [ms],Read time [ms],Data matches exactly
0,CSV,23.841858,690.14784,294.353246,True
1,npy,7.629517,13.793948,2.71852,True
2,HDF5,7.631348,27.043307,3.970843,True
3,NetCDF4,7.637291,28.81736,12.234812,True


In [36]:
# Export the summary table without the index column; floats are written with
# three significant digits.
dataset_comparison_df.to_csv(
    '../content/format_comparison_array.csv',
    index=False,
    float_format='%.3g',
)