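# Comparing serialization formats

Write time, read time, and on-disk file size for pickle, npy, npz, HDF5 (PyTables), Feather, Jay, and Parquet, all storing the same random data.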

In [1]:
import numpy as np
import pandas as pd
import time
import _pickle as pkl
#import cPickle as pkl
import os
#import tables

# Benchmark dimensions: `batch_size` copies of one
# (embedding_size, sample_num + 1) matrix are written by every format below.
batch_size = 10
embedding_size = 512
sample_num = 1000

# One batch: `sample_num` random columns plus a single trailing label column.
a1 = np.random.normal(size=[embedding_size, sample_num])
label = np.random.normal(size=[embedding_size, 1])
all_batch = np.concatenate([a1, label], 1)

# ndarray payload (used by the pickle / npy / npz / h5 cells):
# the same batch repeated `batch_size` times along a new leading axis.
all_batches = np.array([all_batch] * batch_size)
print(all_batches.shape)


# dataframe
# DataFrame payload (used by the feather / jay / parquet cells): the same batch
# stacked row-wise `batch_size` times. A single concat over a list replaces the
# previous concat-inside-a-loop, which re-copied the growing frame on every
# iteration and started from an empty DataFrame.
df = pd.DataFrame(all_batch)
df1 = pd.concat([df] * batch_size)

(10, 512, 1001)


## 1. Pickle

In [2]:
# Round-trip `all_batches` through pickle; report write time, read time, file size.
pkl_name = "a.pkl"

# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
s_t = time.perf_counter()
with open(pkl_name, "wb") as f:
    pkl.dump(all_batches, f)
pkl_in_time = time.perf_counter() - s_t
print("pkl dump costs {} sec".format(pkl_in_time))

s_t = time.perf_counter()
# NOTE: unpickling can execute arbitrary code. It is acceptable here only
# because the file was written by this very cell.
with open(pkl_name, 'rb') as f:
    new_a = pkl.load(f)
pkl_out_time = time.perf_counter() - s_t
print("pkl load costs {} sec".format(pkl_out_time))

pkl_size = os.path.getsize(pkl_name)
print("pkl file size: {} byte, {} mb".format(pkl_size, float(pkl_size)/(1024*1024)))

pkl dump costs 0.3502919673919678 sec
pkl load costs 0.09722304344177246 sec
pkl file size: 41001122 byte, 39.10171699523926 mb


## 2. npy

In [3]:
# Round-trip `all_batches` through NumPy's .npy format; report write time,
# read time, and file size.
npy_name = "a.npy"

# perf_counter() (monotonic, high resolution) instead of the wall-clock
# time.time(), which is not meant for timing short intervals.
s_t = time.perf_counter()
with open(npy_name, "wb") as f:
    np.save(f, arr=all_batches)
npy_in_time = time.perf_counter() - s_t
print("npy save costs {} sec".format(npy_in_time))

s_t = time.perf_counter()
with open(npy_name, 'rb') as f:
    new_a = np.load(f)
npy_out_time = time.perf_counter() - s_t
print("npy load costs {} sec".format(npy_out_time))

npy_size = os.path.getsize(npy_name)
print("npy file size: {} byte, {} mb".format(npy_size, float(npy_size) / (1024*1024)))

npy save costs 0.036824703216552734 sec
npy load costs 0.04144906997680664 sec
npy file size: 41001088 byte, 39.1016845703125 mb


## 3. npz

In [4]:
# Round-trip `all_batches` through NumPy's .npz archive (np.savez, i.e. the
# non-compressed variant); report write time, read time, and file size.
npz_name = "a.npz"

# perf_counter() (monotonic, high resolution) instead of the wall-clock
# time.time(), which is not meant for timing short intervals.
s_t = time.perf_counter()
with open(npz_name, "wb") as f:
    np.savez(f, arr=all_batches)
npz_in_time = time.perf_counter() - s_t
print("npz save costs {} sec".format(npz_in_time))

s_t = time.perf_counter()
# np.load on an .npz returns a lazy NpzFile: the array is only read when it is
# indexed, so the lookup must stay inside the timed region. Using the NpzFile
# as a context manager closes the archive handle it holds.
with open(npz_name, 'rb') as f, np.load(f) as npz_f:
    new_a = npz_f["arr"]
npz_out_time = time.perf_counter() - s_t
print("npz load costs {} sec".format(npz_out_time))

npz_size = os.path.getsize(npz_name)
print("npz file size: {} byte, {} mb".format(npz_size, float(npz_size) /(1024*1024)))

npz save costs 0.3072493076324463 sec
npz load costs 0.11807441711425781 sec
npz file size: 41001220 byte, 39.101810455322266 mb


## 4. h5

In [5]:
import tables

# Round-trip `all_batches` through an HDF5 file via PyTables; report write
# time, read time, and file size.
table_name = "a.h5"

# perf_counter() (monotonic, high resolution) instead of the wall-clock
# time.time(), which is not meant for timing short intervals.
s_t = time.perf_counter()
with tables.open_file(table_name, 'w') as f:
    # Chunked array with the same dtype and shape as the source ndarray;
    # no `filters` argument is passed to create_carray.
    atom = tables.Atom.from_dtype(all_batches.dtype)
    ds = f.create_carray(f.root, 'test_a', atom, all_batches.shape)
    ds[:] = all_batches
table_in_time = time.perf_counter() - s_t
print("table save costs {} sec".format(table_in_time))

s_t2 = time.perf_counter()
with tables.open_file(table_name, 'r') as f:
    # Slicing with [:] reads the whole node into an in-memory array.
    hdf5_data = f.root.test_a[:]
table_out_time = time.perf_counter() - s_t2
print("table load costs {} sec".format(table_out_time))

table_size = os.path.getsize(table_name)
print("table file size: {} byte, {} MB".format(table_size, float(table_size) / (1024*1024)))

table save costs 0.2012922763824463 sec
table load costs 0.06366372108459473 sec
table file size: 41024624 byte, 39.12413024902344 MB


## 5. feather
Install feather:
pip install feather-format

In [6]:
import feather

# Round-trip the DataFrame `df1` through Feather; report write time, read
# time, and file size.
feather_name = 'a.feather'

# perf_counter() (monotonic, high resolution) instead of the wall-clock
# time.time(), which is not meant for timing short intervals.
s_t = time.perf_counter()
with open(feather_name, 'wb') as f:
    feather.write_dataframe(df1, f)
feather_in_time = time.perf_counter() - s_t
# Message spelling fixed ("faether" -> "feather").
print("feather write costs {} sec".format(feather_in_time))

s_t2 = time.perf_counter()
with open(feather_name, 'rb') as f:
    readFeather = feather.read_dataframe(f)
feather_out_time = time.perf_counter() - s_t2

print("feather load costs {} sec".format(feather_out_time))
feather_size = os.path.getsize(feather_name)
print("feather file size: {} byte, {} MB".format(feather_size, float(feather_size) / (1024*1024)))

faether write costs 0.2934298515319824 sec
faether load costs 0.09006571769714355 sec
feather file size: 4650922 byte, 4.435464859008789 MB


## 6. Jay
!pip install datatable

In [7]:
import datatable as dt

# Round-trip the DataFrame `df1` through datatable's Jay format; report write
# time, read time, and file size.
jay_name = 'a.jay'

# perf_counter() (monotonic, high resolution) instead of the wall-clock
# time.time(), which is not meant for timing short intervals.
s_t = time.perf_counter()
# The pandas -> datatable.Frame conversion happens inside the timed region,
# so the reported write time covers conversion plus serialization.
dt.Frame(df1).to_jay(jay_name)
jay_in_time = time.perf_counter() - s_t
print("jay write costs {} sec".format(jay_in_time))

s_t2 = time.perf_counter()
jay_data = dt.fread(jay_name)
jay_out_time = time.perf_counter() - s_t2
print("jay read costs {} sec".format(jay_out_time))

jay_size = os.path.getsize(jay_name)
print("jay file size: {} byte, {} MB".format(jay_size, float(jay_size) / (1024*1024)))

jay write costs 64.61445426940918 sec
jay read costs 0.08871698379516602 sec
jay file size: 41045088 byte, 39.143646240234375 MB


## 7. Parquet


In [8]:
import pyarrow.parquet as pq
import pyarrow as pa

# Round-trip the DataFrame `df1` through Parquet (written with pyarrow, read
# back with pandas); report write time, read time, and file size.
Parquet_name = 'a.parquet'

# perf_counter() (monotonic, high resolution) instead of the wall-clock
# time.time(), which is not meant for timing short intervals.
s_t = time.perf_counter()
# The pandas -> Arrow table conversion is inside the timed region, so the
# reported write time covers conversion plus serialization.
table = pa.Table.from_pandas(df1)
pq.write_table(table, Parquet_name)
Parquet_in_time = time.perf_counter() - s_t
print("Parquet write costs {} sec".format(Parquet_in_time))

s_t2 = time.perf_counter()
parquet_data = pd.read_parquet(Parquet_name)
Parquet_out_time = time.perf_counter() - s_t2
print("Parquet read costs {} sec".format(Parquet_out_time))

parquet_size = os.path.getsize(Parquet_name)
print("Parquet file size: {} byte, {} MB".format(parquet_size, float(parquet_size) / (1024*1024)))

Parquet write costs 0.42542076110839844 sec
Parquet read costs 0.20275330543518066 sec
Parquet file size: 5662560 byte, 5.400238037109375 MB
