# Reading Binary Files

In [1]:
# imports required but not shown in the video lecture.
from numpy import array, dtype, int32, memmap

In [2]:
# Create the binary fixture files used by the rest of the notebook.
#
# Layout (little-endian):
#   bytes 0-7  header: two int32 values, rows=6 and cols=5
#   bytes 8-   30 float64 values 0.0 .. 29.0, stored row by row
#
# The payload has to be a bytes literal: a file opened in 'wb' mode rejects
# str objects on Python 3 (TypeError). The b'' prefix is also accepted by
# Python 2.6+, so the cell runs unchanged on either interpreter.
content = (b'\x06\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?'
           b'\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x10@\x00'
           b'\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00'
           b'\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00&@'
           b'\x00\x00\x00\x00\x00\x00(@\x00\x00\x00\x00\x00\x00*@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00'
           b'\x00.@\x00\x00\x00\x00\x00\x000@\x00\x00\x00\x00\x00\x001@\x00\x00\x00\x00\x00\x002@\x00\x00\x00'
           b'\x00\x00\x003@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x005@\x00\x00\x00\x00\x00\x006@\x00'
           b'\x00\x00\x00\x00\x007@\x00\x00\x00\x00\x00\x008@\x00\x00\x00\x00\x00\x009@\x00\x00\x00\x00\x00\x00:@'
           b'\x00\x00\x00\x00\x00\x00;@\x00\x00\x00\x00\x00\x00<@\x00\x00\x00\x00\x00\x00=@'
          )
# The same payload goes into both files: 'data.bin' for the first section
# and 'writable.bin' for the section that opens a writable memory map.
for filename in ('data.bin', 'writable.bin'):
    with open(filename, 'wb') as f:
        f.write(content)

## Working With File Header

Create a dtype to represent the header.

In [3]:
# Header record: two consecutive int32 fields, 'rows' followed by 'cols'.
header_dtype = dtype({'names': ['rows', 'cols'],
                      'formats': [int32, int32]})

Create a memory mapped array using this dtype. Note the shape is empty.

In [6]:
# Map the start of the file read-only as a single header record;
# shape=() makes the result a zero-dimensional structured memmap.
header = memmap(
    'data.bin',
    dtype=header_dtype,
    mode='r',
    shape=(),
)
header

memmap((6, 5), 
      dtype=[('rows', '<i4'), ('cols', '<i4')])

Read the row and column sizes from this structured array.

In [5]:
# Pull both dimensions out of the header record by field name.
rows, cols = header['rows'], header['cols']
rows, cols

(memmap(6, dtype=int32), memmap(5, dtype=int32))

Create a memory map to the data segment, using rows, cols for shape information and the header size to determine the correct offset.

In [7]:
# Map the float64 payload; the offset skips the header bytes so that
# element [0, 0] is the first value stored after the header.
data = memmap(
    'data.bin',
    dtype='float64',
    mode="r+",
    offset=header_dtype.itemsize,
    shape=(rows, cols),
)

In [8]:
# Display the mapped data segment.
data

memmap([[  0.,   1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   8.,   9.],
       [ 10.,  11.,  12.,  13.,  14.],
       [ 15.,  16.,  17.,  18.,  19.],
       [ 20.,  21.,  22.,  23.,  24.],
       [ 25.,  26.,  27.,  28.,  29.]])

## Memory Maps with ndarray

`mmap` is a standard Python module for working with memory maps.

In [9]:
import mmap 
import numpy

Create a dtype to represent the header.

In [10]:
# Same two-field int32 header layout as before, built through the numpy namespace.
header_dtype = numpy.dtype({'names': ['rows', 'cols'],
                            'formats': [int32, int32]})

Open a file for read/write access in binary mode.

In [11]:
# Open the file for read/write access in binary mode. The handle is kept in
# `f` because the following cells pass f.fileno() to mmap.mmap.
f = open('writable.bin', 'r+b')

Create a read-only memory map from the opened file with the correct size to read the header of the file.

In [12]:
# Read-only map covering exactly the header bytes at the start of the file.
mm = mmap.mmap(
    f.fileno(),
    length=header_dtype.itemsize,
    access=mmap.ACCESS_READ,
)

Create a new array using the ndarray constructor. The first argument is the shape, and we pass in the data type and the memory buffer to use (mm) as keyword arguments.

In [13]:
# Wrap the mapped header bytes in a zero-dimensional structured array,
# then read the two dimensions out of it by field name.
header = numpy.ndarray(shape=(), dtype=header_dtype, buffer=mm)
rows, cols = header['rows'], header['cols']

In [14]:
# Show the row and column counts read from the header.
rows, cols

(array(6, dtype=int32), array(5, dtype=int32))

Create a writable memory map to use for the data array. The map starts at the beginning of the file, so its size in bytes is the header size plus the size of a float64 (8) x rows x columns.

In [None]:
# Writable map spanning the header plus the whole float64 payload.
#
# `rows` and `cols` are int32 values read from the header, so multiplying
# them directly is done in 32-bit arithmetic and can wrap around for payloads
# above 2 GiB. Converting to Python ints first keeps the byte count exact, and
# taking the element size from the dtype avoids hard-coding 8.
mm = mmap.mmap(
    f.fileno(),
    header_dtype.itemsize + numpy.dtype('float64').itemsize * int(rows) * int(cols),
    access=mmap.ACCESS_WRITE,
)

Create our data array using this new memory map. Start the array's data at the memory location directly after the header using offset.

In [None]:
# View the mapped bytes after the header as a (rows, cols) float64 array.
data = numpy.ndarray(
    shape=(rows, cols),
    dtype='float64',
    buffer=mm,
    offset=header_dtype.itemsize,
)

In [15]:
# Display the array backed by the writable memory map.
data

array([[  0.,   1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   8.,   9.],
       [ 10.,  11.,  12.,  13.,  14.],
       [ 15.,  16.,  17.,  18.,  19.],
       [ 20.,  21.,  22.,  23.,  24.],
       [ 25.,  26.,  27.,  28.,  29.]])

## Structured Arrays

| Name         |      Time |       Value |
|:-------------|----------:|------------:|
| __char[12]__ | __int64__ | __float32__ |
| MSFT_profit  |        10 |        6.20 |
| GOOG_profit  |        12 |       -1.08 |
| MSFT_profit  |        18 |        8.40 |
| INTC_profit  |        25 |       -0.20 |
| ...          |       ... |         ... |
| ...          |       ... |         ... |
| GOOG_profit  |   1000325 |        3.20 |
| GOOG_profit  |   1000350 |        4.50 |
| INTC_profit  |   1000385 |       -1.05 |
| MSFT_profit  |   1000390 |        5.60 |


### memmap single array
Elements of an array can be any fixed-size data structure!

In [16]:
import numpy as np

# Record layout matching the table above: a 12-byte name, an int64
# timestamp and a float32 value, packed back to back.
fmt = np.dtype({
    'names': ['name', 'time', 'value'],
    'formats': ['S12', np.int64, np.float32],
})

# A few sample rows, one (name, time, value) tuple per record.
v = [
    ('MSFT_profit', 10, 6.20),
    ('GOOG_profit', 12, -1.08),
    ('INTC_profit', 1000385, -1.05),
    ('MSFT_profit', 1000390, 5.60),
]

In [17]:
# Convert the list of tuples into a structured array with the record dtype.
arr = np.array(v, fmt)
arr

array([('MSFT_profit', 10, 6.199999809265137),
       ('GOOG_profit', 12, -1.0800000429153442),
       ('INTC_profit', 1000385, -1.0499999523162842),
       ('MSFT_profit', 1000390, 5.599999904632568)], 
      dtype=[('name', 'S12'), ('time', '<i8'), ('value', '<f4')])

Save the data to disk.

In [18]:
# Write the records to 'db.dat'. The cells that read the file back pass
# dtype=fmt again to interpret its contents.
arr.tofile('db.dat')

And read it back with:

In [19]:
# Load the file back into an array, interpreting it with the same record dtype.
arr2 = np.fromfile('db.dat', dtype=fmt)
arr2

array([('MSFT_profit', 10, 6.199999809265137),
       ('GOOG_profit', 12, -1.0800000429153442),
       ('INTC_profit', 1000385, -1.0499999523162842),
       ('MSFT_profit', 1000390, 5.599999904632568)], 
      dtype=[('name', 'S12'), ('time', '<i8'), ('value', '<f4')])

In [20]:
# Alternatively, memory-map the file instead of reading it with fromfile.
# NOTE: mode='c' is NumPy's copy-on-write mode: assignments to arr3 affect
# the in-memory data only and are not written back to db.dat.
arr3 = np.memmap('db.dat', dtype=fmt, mode='c')
arr3

memmap([('MSFT_profit', 10, 6.199999809265137),
       ('GOOG_profit', 12, -1.0800000429153442),
       ('INTC_profit', 1000385, -1.0499999523162842),
       ('MSFT_profit', 1000390, 5.599999904632568)], 
      dtype=[('name', 'S12'), ('time', '<i8'), ('value', '<f4')])