# Compare file formats - array

In [1]:
import os
import numpy as np
import pandas as pd

n = 1000

data_array = np.random.uniform(size=(n,n))

In [2]:
dataset_comparison = []

## CSV

Write speed:

In [3]:
%timeit -o np.savetxt('data_array.txt', data_array)

920 ms ± 167 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 920 ms ± 167 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [4]:
write_speed = _.average

Read speed:

In [5]:
%timeit -o data_array_csv = np.loadtxt('data_array.txt')

870 ms ± 94.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


<TimeitResult : 870 ms ± 94.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)>

In [6]:
read_speed = _.average

Datasets match:

In [7]:
data_array_csv = np.loadtxt('data_array.txt')
data_array_match = np.all(data_array == data_array_csv)
data_array_match

True

Dataset size:

In [8]:
data_array_size = os.path.getsize('data_array.csv') / (1024. * 1024.)

Add data to comparison list:

In [9]:
dataset_comparison.append(['CSV', data_array_size, write_speed, read_speed, data_array_match])

## npy

Write speed:

In [10]:
%timeit -o np.save('data_array.npy', data_array)

13.4 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 13.4 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [11]:
write_speed = _.average

Read speed:

In [12]:
%timeit -o data_array_npy = np.load('data_array.npy')

2.77 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 2.77 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [13]:
read_speed = _.average

Datasets match:

In [14]:
data_array_npy = np.load('data_array.npy')
data_array_match = np.all(data_array == data_array_npy)
data_array_match

True

Dataset size:

In [15]:
data_array_size = os.path.getsize('data_array.npy') / (1024. * 1024.)

Add data to comparison list:

In [16]:
dataset_comparison.append(['npy', data_array_size, write_speed, read_speed, data_array_match])

## NetCDF4

Write speed:

In [17]:
import xarray as xr

In [18]:
%timeit -o xr.DataArray(data_array).to_netcdf('data_array.nc')

16.9 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 16.9 ms ± 1.53 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [19]:
write_speed = _.average

Read speed:

In [20]:
%timeit -o data_array_xarray = xr.open_dataarray('data_array.nc').to_numpy()

7.09 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


<TimeitResult : 7.09 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)>

In [21]:
read_speed = _.average

Datasets match:

In [22]:
data_array_xarray = xr.open_dataarray('data_array.nc').to_numpy()
data_array_xarray

array([[0.81557478, 0.87582501, 0.33012117, ..., 0.07885366, 0.81177184,
        0.35970538],
       [0.71333504, 0.19451126, 0.95683144, ..., 0.96126088, 0.10705617,
        0.69977953],
       [0.20439815, 0.940112  , 0.0353687 , ..., 0.28519847, 0.37865593,
        0.58364631],
       ...,
       [0.47693547, 0.5508281 , 0.63747723, ..., 0.52337387, 0.91112467,
        0.37682465],
       [0.31090162, 0.01634938, 0.86742023, ..., 0.63394604, 0.14486111,
        0.65799037],
       [0.57867758, 0.08889103, 0.31783146, ..., 0.62367464, 0.84158192,
        0.23872403]])

In [23]:
data_array_xarray = xr.open_dataarray('data_array.nc').to_numpy()
data_array_match = np.all(data_array == data_array_xarray)
data_array_match

True

Dataset size:

In [24]:
data_array_size = os.path.getsize('data_array.nc') / (1024. * 1024.)

Add data to comparison list:

In [25]:
dataset_comparison.append(['NetCDF4', data_array_size, write_speed, read_speed, data_array_match])

# Compare file formats

In [26]:
dataset_comparison

[['CSV', 23.84185791015625, 0.9199028684275358, 0.870454854997141, True],
 ['npy', 7.6295166015625, 0.013367865845793858, 0.00277258609853951, True],
 ['NetCDF4', 7.63720703125, 0.01692108380142599, 0.007093449790014088, True]]

In [27]:
dataset_comparison_df = pd.DataFrame(columns=['File format', 'File size [MB]', 'Write time [ms]', 'Read time [ms]', 'Data matches exactly'], data=dataset_comparison)

In [28]:
dataset_comparison_df

Unnamed: 0,File format,File size [MB],Write time [ms],Read time [ms],Data matches exactly
0,CSV,23.841858,0.919903,0.870455,True
1,npy,7.629517,0.013368,0.002773,True
2,NetCDF4,7.637207,0.016921,0.007093,True
