## Chapter 5 data index search and storage

In [9]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(42)

In [3]:
a = np.random.randn(3,4)

In [4]:
a

array([[ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986],
       [-0.23415337, -0.23413696,  1.57921282,  0.76743473],
       [-0.46947439,  0.54256004, -0.46341769, -0.46572975]])

In [5]:
a[2][2] = np.nan

In [6]:
a

array([[ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986],
       [-0.23415337, -0.23413696,  1.57921282,  0.76743473],
       [-0.46947439,  0.54256004,         nan, -0.46572975]])

In [7]:
np.savetxt('np.csv',a,fmt='%.2f',delimiter=',',header='#1,#2,#3,#4')

In [10]:
df = pd.DataFrame(a)

In [11]:
df

Unnamed: 0,0,1,2,3
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,,-0.46573


In [12]:
df.to_csv('pd.csv',float_format='%.2f',na_rep='NAN!')

In [13]:
from tempfile import NamedTemporaryFile

In [14]:
from os.path import getsize

In [15]:
getsize('pd.csv')

80

In [16]:
a= np.random.randn(365,4)

In [17]:
a

array([[ 0.24196227, -1.91328024, -1.72491783, -0.56228753],
       [-1.01283112,  0.31424733, -0.90802408, -1.4123037 ],
       [ 1.46564877, -0.2257763 ,  0.0675282 , -1.42474819],
       ...,
       [-0.13845598, -1.22429824, -0.20902326, -0.85052045],
       [-0.58052345,  0.5885784 ,  1.6699045 ,  0.39467153],
       [-1.19588306,  0.44460267,  1.19663149, -0.6097829 ]])

In [19]:
tmpf = NamedTemporaryFile()

In [21]:
tmpf.name

'/var/folders/35/vkjljcjn5t994nllqfv_7mm80000gn/T/tmp4jfoliyo'

In [22]:
np.savetxt(tmpf,a,delimiter=',')
print(getsize(tmpf.name))

36690


In [23]:
tmpf = NamedTemporaryFile()
tmpf.name

'/var/folders/35/vkjljcjn5t994nllqfv_7mm80000gn/T/tmpom2e_876'

In [24]:
np.save(tmpf,a)
print(getsize(tmpf.name))

11808


In [26]:
tmpf.seek(0)
loaded = np.load(tmpf)
loaded.shape

(365, 4)

In [27]:
loaded

array([[ 0.24196227, -1.91328024, -1.72491783, -0.56228753],
       [-1.01283112,  0.31424733, -0.90802408, -1.4123037 ],
       [ 1.46564877, -0.2257763 ,  0.0675282 , -1.42474819],
       ...,
       [-0.13845598, -1.22429824, -0.20902326, -0.85052045],
       [-0.58052345,  0.5885784 ,  1.6699045 ,  0.39467153],
       [-1.19588306,  0.44460267,  1.19663149, -0.6097829 ]])

In [28]:
df = pd.DataFrame(a)

In [29]:
df.to_pickle(tmpf.name)

In [30]:
getsize(tmpf.name)

12309

In [31]:
pd.read_pickle(tmpf.name)

Unnamed: 0,0,1,2,3
0,0.241962,-1.913280,-1.724918,-0.562288
1,-1.012831,0.314247,-0.908024,-1.412304
2,1.465649,-0.225776,0.067528,-1.424748
3,-0.544383,0.110923,-1.150994,0.375698
4,-0.600639,-0.291694,-0.601707,1.852278
...,...,...,...,...
360,0.240753,2.601683,0.565510,-1.760763
361,0.753342,0.381158,1.289753,0.673181
362,-0.138456,-1.224298,-0.209023,-0.850520
363,-0.580523,0.588578,1.669905,0.394672


In [38]:
df.iloc[0,1]

-1.913280244657798

In [41]:
df.columns

RangeIndex(start=0, stop=4, step=1)

In [42]:
df.values

array([[ 0.24196227, -1.91328024, -1.72491783, -0.56228753],
       [-1.01283112,  0.31424733, -0.90802408, -1.4123037 ],
       [ 1.46564877, -0.2257763 ,  0.0675282 , -1.42474819],
       ...,
       [-0.13845598, -1.22429824, -0.20902326, -0.85052045],
       [-0.58052345,  0.5885784 ,  1.6699045 ,  0.39467153],
       [-1.19588306,  0.44460267,  1.19663149, -0.6097829 ]])

#### PyTables

In [43]:
a

array([[ 0.24196227, -1.91328024, -1.72491783, -0.56228753],
       [-1.01283112,  0.31424733, -0.90802408, -1.4123037 ],
       [ 1.46564877, -0.2257763 ,  0.0675282 , -1.42474819],
       ...,
       [-0.13845598, -1.22429824, -0.20902326, -0.85052045],
       [-0.58052345,  0.5885784 ,  1.6699045 ,  0.39467153],
       [-1.19588306,  0.44460267,  1.19663149, -0.6097829 ]])

In [44]:
import tables

In [45]:
tmpf = NamedTemporaryFile()

In [48]:
h5file = tables.File(tmpf.name,mode='w',title='Numpy Array')

In [49]:
root = h5file.root

In [51]:
h5file.create_array(root,"array",a)

/array (Array(365, 4)) ''
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [52]:
h5file.close()

In [53]:
h5file = tables.File(tmpf.name,'r')

In [54]:
getsize(tmpf.name)

13728

In [55]:
for node in h5file.iter_nodes(h5file.root):
    b = node.read()
    print(type(b))
    print(b.shape)
    print(b)

<class 'numpy.ndarray'>
(365, 4)
[[ 0.24196227 -1.91328024 -1.72491783 -0.56228753]
 [-1.01283112  0.31424733 -0.90802408 -1.4123037 ]
 [ 1.46564877 -0.2257763   0.0675282  -1.42474819]
 ...
 [-0.13845598 -1.22429824 -0.20902326 -0.85052045]
 [-0.58052345  0.5885784   1.6699045   0.39467153]
 [-1.19588306  0.44460267  1.19663149 -0.6097829 ]]


In [56]:
h5file.close()