# NumPy ndarray
> avoid *from numpy import ** , which will cause confusion and conflict with other modules

In [1]:
import numpy as np
x = np.array([[1, 2, 3], [2, 3, 4]])
print(x)

[[1 2 3]
 [2 3 4]]


* Use **ndim** attribute to get the dimmension of an array

In [2]:
x.ndim

2

In [3]:
x.shape

(2, 3)

## Array indexing and slicing

In [3]:
x = np.random.random((3, 4, 5))
x

array([[[ 0.26437847,  0.80164233,  0.82625471,  0.59607897,  0.83895227],
        [ 0.76398099,  0.71879652,  0.37390207,  0.25422914,  0.68812365],
        [ 0.93781487,  0.65187547,  0.72149537,  0.79652745,  0.71108631],
        [ 0.73442666,  0.13090776,  0.70074357,  0.74525315,  0.6073456 ]],

       [[ 0.42210814,  0.91178805,  0.06946539,  0.73285394,  0.15558986],
        [ 0.64783686,  0.77577682,  0.748246  ,  0.38964352,  0.58910809],
        [ 0.59608485,  0.55435544,  0.80492638,  0.74173767,  0.5881487 ],
        [ 0.81669809,  0.301606  ,  0.49532253,  0.96698657,  0.06939879]],

       [[ 0.68660551,  0.36002865,  0.35094605,  0.52450347,  0.0518192 ],
        [ 0.09791141,  0.83985057,  0.78365145,  0.39651623,  0.78734257],
        [ 0.48637775,  0.55254813,  0.88448288,  0.21418904,  0.5891296 ],
        [ 0.59833343,  0.80132861,  0.05866252,  0.74726322,  0.67708837]]])

In [10]:
x[(1, 2, 3)]

0.63885411730593111

In [17]:
x[1, 1:, 1]

array([ 0.45043811,  0.73449053,  0.97101124])

In [18]:
y = np.random.random((3, 4))
y

array([[ 0.35275133,  0.54398307,  0.94688182,  0.12506365],
       [ 0.66073845,  0.9606453 ,  0.34734895,  0.44136876],
       [ 0.26642123,  0.28365086,  0.45140476,  0.3218879 ]])

In [19]:
y[1, :] // 2nd row

array([ 0.66073845,  0.9606453 ,  0.34734895,  0.44136876])

In [21]:
y[1, :-1]  // 2nd row

array([ 0.66073845,  0.9606453 ,  0.34734895])

In [22]:
y[:, 1]  // second column

array([ 0.54398307,  0.9606453 ,  0.28365086])

In [24]:
y[:, ::-1]  // reverse column

array([[ 0.12506365,  0.94688182,  0.54398307,  0.35275133],
       [ 0.44136876,  0.34734895,  0.9606453 ,  0.66073845],
       [ 0.3218879 ,  0.45140476,  0.28365086,  0.26642123]])

In [25]:
y[::-1, :]  // reverse row

array([[ 0.26642123,  0.28365086,  0.45140476,  0.3218879 ],
       [ 0.66073845,  0.9606453 ,  0.34734895,  0.44136876],
       [ 0.35275133,  0.54398307,  0.94688182,  0.12506365]])

In [5]:
x[1, 2]

array([ 0.59608485,  0.55435544,  0.80492638,  0.74173767,  0.5881487 ])

## Memory layout of ndarray
**flags** attribute holds information about the memory layout of the array.
* C_CONTIGUOUS indicates whether the array was C-style array: row-major indexing
* F_CONTIGUOUS indicates whether the array was Fortran-style array: column-major indexing 

It is very important to know the difference, which can speed up your program.

In [6]:
x.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

**Example**:

In [18]:
c_array = np.random.rand(10000, 10000)
f_array = np.asfortranarray(c_array)
def sum_row(x):
    return np.sum(x[0, :])
def sum_column(x):
    return np.sum(x[:, 0])

%timeit sum_row(c_array)
%timeit sum_row(f_array)
%timeit sum_column(c_array)
%timeit sum_column(f_array)

The slowest run took 5.90 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 6.99 µs per loop
The slowest run took 5.25 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 19.3 µs per loop
10000 loops, best of 3: 90 µs per loop
100000 loops, best of 3: 7.16 µs per loop


## Views and copies
2 ways of accessing data by slicing and indexing - copies and views: you can either access elements directly or create a copy of the array that contains only the accessed elements.
Use **may_share_memory** to check whether two arrays are copies or views of each other. While this method does the job in most cases, it is not always reliable, since it uses heuristics.

In [2]:
x = np.random.rand(100, 10)

In [3]:
y = x[:5, :]

In [4]:
np.may_share_memory(x, y)  # y is a view of x

True

* y is a view(a reference to x) of x, if we change y, x will be changed too.

In [6]:
y [:] = 0
x[:5,:]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

* y is a copy of x, which is independent from x. Changing y won't affect x

In [13]:
x = np.random.rand(100, 10)
y = np.empty([5, 10])
y[:] = x[:5, :]
print('y: ', y)
y[:] = 0
print('x: ', x[:5, :])
print('y: ', y)

y:  [[ 0.31446063  0.40567077  0.52808131  0.92165188  0.71207395  0.40662951
   0.56544913  0.9139226   0.13805357  0.8929419 ]
 [ 0.63991351  0.31606078  0.39307608  0.79875712  0.00159579  0.90698844
   0.07898949  0.21082609  0.16894868  0.35943743]
 [ 0.85636721  0.56875043  0.28457974  0.42140064  0.18784317  0.05623059
   0.81073741  0.98003296  0.09038954  0.57298738]
 [ 0.10199465  0.56411442  0.0921287   0.78169324  0.29586504  0.02769341
   0.88835059  0.72881954  0.17525631  0.68224582]
 [ 0.85123524  0.18607454  0.84929094  0.67369407  0.04138161  0.1624432
   0.6355854   0.44047373  0.61986576  0.81339425]]
x:  [[ 0.31446063  0.40567077  0.52808131  0.92165188  0.71207395  0.40662951
   0.56544913  0.9139226   0.13805357  0.8929419 ]
 [ 0.63991351  0.31606078  0.39307608  0.79875712  0.00159579  0.90698844
   0.07898949  0.21082609  0.16894868  0.35943743]
 [ 0.85636721  0.56875043  0.28457974  0.42140064  0.18784317  0.05623059
   0.81073741  0.98003296  0.09038954  0.57

## Creating Arrays
Arrays can be created from:
* instance from other data structure
* reading files on disk
* web
In this section, we will use list or functions in numpy

### Creating arrays from lists
To create a valid array object, arguments to array functions need to adhere to at least one of the following conditions:
* It has to be a valid iterable value or sequence, which may be nested
* It must have an **__array__** method that returns a valid numpy array
> The np.array() function will normally cast all input elements into the most suitable data type required for the array.

In [19]:
x = np.array([1, 2, 3, 'hello'])  # all to string
y = np.array(['hello', 'world'])  # all string
z = np.array([1, 2, 3, 4.5, 6.9, 'hello'])  # all to string
a = np.array((1, 2, 3, 4.5, 6.9, 'hello'))  # all to string
print(x, y, z, a)

['1' '2' '3' 'hello'] ['hello' 'world'] ['1' '2' '3' '4.5' '6.9' 'hello'] ['1' '2' '3' '4.5' '6.9' 'hello']


In [21]:
np.arange(5)  # range creates array

array([0, 1, 2, 3, 4])

In [22]:
np.array([[1, 2, 3, 4], [1, 2, 3, '5']])  # nested list creates 2 dimentaion array

array([['1', '2', '3', '4'],
       ['1', '2', '3', '5']], 
      dtype='<U21')

### Creating random arrays
* Create random arrays
* Create random permutations of arrays
* Generate arrays with specific probability distributions

In [27]:
x = np.random.rand(2, 2, 3)
x.shape

(2, 2, 3)

In [28]:
y = np.random.random((2, 2, 3))
y.shape

(2, 2, 3)

> rand is a convenience function for random. these two functions can only create arrays of floats.

Use **randint()** to create arrays of integers

In [34]:
LOW, HIGH = 1, 11
SIZE = (5, 10)
np.random.randint(LOW, HIGH, SIZE)

array([[ 7,  7,  7,  9,  7,  2,  7,  5,  7, 10],
       [ 8,  4,  5,  4,  6,  2,  7, 10,  9,  6],
       [ 5,  4,  2,  8,  8,  6,  4,  6,  8,  7],
       [ 6,  6,  9,  5, 10,  6,  4, 10,  8,  6],
       [ 5, 10,  9,  9,  1,  5,  3,  4,  8,  5]])

### Other arrays
zeros(), ones(), eye(), and others

In [4]:
np.zeros((2, 5))

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [5]:
np.ones((2, 4))

array([[ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])

In [9]:
np.eye(5)

array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1.]])

## Array data types
**dtype** attribute

In [10]:
x = np.random.random((10, 10))

In [11]:
x.dtype

dtype('float64')

In [12]:
x = np.arange(10)

In [13]:
x.dtype

dtype('int64')

In [14]:
x = np.array(['hello', 'numpy'])

In [15]:
x.dtype

dtype('<U5')

In [16]:
x = np.ones((10, 10), dtype=np.int)

In [17]:
x.dtype

dtype('int64')

In [20]:
x = np.zeros((10, 10), dtype='<U5')

In [21]:
x.dtype

dtype('<U5')

# Using numpy arrays
You can use array indexing and slicing to quickly access your data or perform a computation while keeping the efficiency as the C arrays.
* Basic operations and attributes of numpy arrays
* Universal functions and helper functions
* Broadcasting rules and shape manipulation
* Masking numpy arrays

## Vectorized operations
All numpy operations are vectorized, where you apply operations to the whole array instead of on each element individually. This is not only neat and handy but also improves the performance of computation compared to using loops.  
  
Start with a simple scalar operation:

In [24]:
x = np.array([1, 2, 3, 4])

In [25]:
x + 1

array([2, 3, 4, 5])

The elements in a numpy array all have the same **dtype**; which can save time on checking the type of each element at runtime, which, ordinarily, is done by Python.

In [40]:
y = np.array([-1, 3.1, 2, 0])
y.dtype

dtype('float64')

In [43]:
(x * y).dtype

dtype('float64')

The arithmetic operations between two numpy arrays are not matrix multiplications. A matrix multiplication will use **numpy.dot()**

In [45]:
np.dot(x, y)

11.199999999999999

Logic comparision between 2 arrays

In [46]:
x == y

array([False, False, False, False], dtype=bool)

Prove improved performance of NumPy. The improvement is due to a consistent **dtype** in a NumPy array.

In [52]:
x = np.arange(1000000)
%timeit x + 1

The slowest run took 16.67 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 1.13 ms per loop


In [53]:
y = range(1000000)
%timeit [i + 1 for i in y]

10 loops, best of 3: 54.4 ms per loop


> **Tip:**  
Always consider **dtype** before you apply any operation.

Huge difference - same operation with different **dtype**

In [55]:
x = np.arange(1, 9)
x.dtype

dtype('int64')

In [57]:
x = x / 10.0
x

array([ 0.01,  0.02,  0.03,  0.04,  0.05,  0.06,  0.07,  0.08])

In [58]:
x.dtype

dtype('float64')

In [62]:
y = np.arange(1, 9)
y /= 10

TypeError: ufunc 'true_divide' output (typecode 'd') could not be coerced to provided output parameter (typecode 'l') according to the casting rule ''same_kind''

> **Note:**  
When x is divided by a float, a new array is created with *dtype = numpy.float64* x points to a totally new object(NumPy Array)  
y uses /= sign, which always honors the **dtype** of the y array. **TypeError** is raised.

## Universal functions(ufuncs)
Use NumPy universal functions to eliminate as many loops as you can to optimize your code.
### Basic ufuncs

In [2]:
x = np.arange(5, 10)

In [3]:
np.square(x)

array([25, 36, 49, 64, 81])

In [4]:
y = np.ones(5) * 10
np.mod(y, x)  # y mod  x

array([ 0.,  4.,  3.,  2.,  1.])

In [7]:
np.log2(np.array([2, 4, 8, 16]))

array([ 1.,  2.,  3.,  4.])

In [9]:
np.minimum(x, 7)  # 7 it is broadcast to [7, 7, 7, 7, 7]

array([5, 6, 7, 7, 7])

In [10]:
np.min(x)

5

### Work with advanced ufuncs

In [11]:
z = np.repeat(x, 3).reshape(5, 3)
z

array([[5, 5, 5],
       [6, 6, 6],
       [7, 7, 7],
       [8, 8, 8],
       [9, 9, 9]])

In [12]:
np.median(z)

7.0

In [13]:
np.median(z, axis=0)  # based on column

array([ 7.,  7.,  7.])

In [14]:
np.median(z, axis=1)  # based on row

array([ 5.,  6.,  7.,  8.,  9.])

In [15]:
np.add.accumulate(x)

array([ 5, 11, 18, 26, 35])

In [16]:
np.multiply.outer(x, x)

array([[25, 30, 35, 40, 45],
       [30, 36, 42, 48, 54],
       [35, 42, 49, 56, 63],
       [40, 48, 56, 64, 72],
       [45, 54, 63, 72, 81]])

## Broadcasting adn shape manipulation
NumPy provides the flexibility to broadcast a smaller-sized array across a larger one.
### Broadcasting rules
* 2 arrays should be of equal dimensions
* One of them is 1

If the preceding conditions are not met, a **ValueError** exception will be thrown to indicate that the arrays have incompatible shapes.

In [17]:
x = np.array([
        [0, 0, 0],
        [10, 10, 10],
        [20, 20, 20]
    ])
y = np.array([1, 2, 3])
x + y

array([[ 1,  2,  3],
       [11, 12, 13],
       [21, 22, 23]])

![img](image_03_001.jpg)

In [19]:
x = np.array([[0], [10], [20]])
x + y

array([[ 1,  2,  3],
       [11, 12, 13],
       [21, 22, 23]])

![img](image_03_002.jpg)

In [20]:
x = np.array([
        [0, 0, 0],
        [10, 10, 10],
        [20, 20, 20]
    ])
y = np.array([1, 5])
x + y

ValueError: operands could not be broadcast together with shapes (3,3) (2,) 

> 2 arrays can't meet the requirement of broadcasting rules, they have different shapes in row and none of them are equal to 1

In [25]:
y = np.array([
        [1, 2, 3],
        [2, 3, 4]
    ])
x + y

ValueError: operands could not be broadcast together with shapes (3,3) (2,3) 

### Reshape NumPy arrays
I is common for you to create a NumPy array in just one dimension, reshaping it to a multidimension later, or vice versa.  
You can change the shape of your array, but the number of elements should not be changed.

In [31]:
x = np.arange(30)
x.shape = (2, 3, -1)
x

array([[[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]],

       [[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]])

> -1 means the remaining shape size of the transferred array

In [35]:
x = np.arange(1000000)
x.shape = (100, 100, 100)
%timeit x.flatten()

1000 loops, best of 3: 1.02 ms per loop


In [33]:
%timeit x.ravel()

The slowest run took 29.57 times longer than the fastest. This could mean that an intermediate result is being cached.
10000000 loops, best of 3: 182 ns per loop


> **np.flatten()** creates a copy from the original array, while **np.ravel()** just changes the view.  
**np.flatten()**: Return a copy of the array collapsed into one dimension.

In [37]:
x = np.arange(100)
x.shape = (10, 10)
x

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [38]:
x.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [39]:
x.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

### Vector stacking
How do we construct a two or multidimensional array by equally-sized row vectors?

In [48]:
x = np.arange(0, 10, 2)
y = np.arange(0, -5, -1)
np.vstack([x, y])

array([[ 0,  2,  4,  6,  8],
       [ 0, -1, -2, -3, -4]])

> **numpy.vstack()** constructs the new array by vertically stacking two input arrays. the new array is two-dimensional

**numpy.hstack()** combines two arrays horizontally, the new array is still one-dimensional, whick works like concat

In [49]:
np.hstack([x, y])

array([ 0,  2,  4,  6,  8,  0, -1, -2, -3, -4])

**numpy.dstack()** stacks the arrays in sequence depth-wise along the third dimension so that *the new array is three-dimensional*

In [50]:
np.dstack([x, y])

array([[[ 0,  0],
        [ 2, -1],
        [ 4, -2],
        [ 6, -3],
        [ 8, -4]]])

** ①numpy.resize vs. ②ndarray.resize**  
* ① enlarge the array, it will repeat itself until it reaches the new size; otherwise, it will truncate the array to the new size.
* ② the enlarging part will bbe zero(s), not repeating the array itself.

You can't use **ndarray.resize()** if you have assigned the array to another variable. **numpy.resize()** creates a new array with specified shapes, which have fewer limitations than **ndarray.resize()**, and is a more preferrable operation to use to change the size of your numpy array.

In [51]:
x = np.arange(3)
np.resize(x, (8, ))

array([0, 1, 2, 0, 1, 2, 0, 1])

In [53]:
x.resize(8)
x

array([0, 1, 2, 0, 0, 0, 0, 0])

## A boolean mask

In [54]:
# build a boolean mask
x = np.array([1, 3, -1, 5, 7, -11])
mask = (x < 0)
mask

array([False, False,  True, False, False,  True], dtype=bool)

In [55]:
x[mask] = 0
x

array([1, 3, 0, 5, 7, 0])

> **Note**:  
Using mask, we gain the ability to access or replace any element value in our arrays without knowing their index.

In [60]:
x = np.random.random(10)
x

array([ 0.13751945,  0.92503243,  0.20294173,  0.77961942,  0.52968962,
        0.78644437,  0.88993952,  0.0824877 ,  0.52401805,  0.57873771])

In [61]:
(x > .5).sum()

7

> **Note**:  
There are 7 elements in x are larger than 0.5

## Helper functions
Like **help()** and **dir()** in Python, NumPy provides **numpy.lookfor()** help function to help you find right function you need.

In [62]:
np.lookfor('resize')

Search results for 'resize'
---------------------------
numpy.ma.resize
    Return a new masked array with the specified size and shape.
numpy.chararray.resize
    Change shape and size of array in-place.
numpy.resize
    Return a new array with the specified shape.
numpy.memmap
    Create a memory-map to an array stored in a *binary* file on disk.
numpy.chararray
    chararray(shape, itemsize=1, unicode=False, buffer=None, offset=0,
numpy.ma.timer_comparison.ModuleTester.test_3
    Tests resize/repeat
numpy.ma.MaskedArray.resize


In [42]:
x = np.array([
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]
    ])
y = np.array([
        [2, 3, 4, 5],
        [2, 3, 4, 5],
        [2, 3, 4, 5]
    ])
x = np.matrix(x)
y = np.matrix(y)
x * y

matrix([[12, 18, 24, 30],
        [12, 18, 24, 30],
        [12, 18, 24, 30]])

In [46]:
y.shape = (4, 3)
y

matrix([[2, 3, 4],
        [5, 2, 3],
        [4, 5, 2],
        [3, 4, 5]])

# NumPy core and libs submodules
* The core of NumPy arrays: memory layout
* Structure arrays(record arrays)
* Date-time in NumPy arrays
* File I/O in NumPy arrays

## Introducing strides
Strides are the indexing scheme in Numpy arrays, and indicate the number of bytes to jump to find the next element.  
The performance improvements of NumPy come from a homogeneous multi-dimensional array object with fixed-size items, the **numpy.ndarray** object.  
We already covered:
* shape(dimension)
* data type
* order(the C-style row-major indexing arrays and the Fortran style column-major arrays)  

Create a numpy array and take a look at the strides

In [63]:
x = np.arange(8, dtype=np.int8)
x

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int8)

In [64]:
x.strides

(1,)

In [65]:
str(x.data)

'<memory at 0x7f069540d408>'

> The strides represent the tuple of bytes to step in each dimension when traversion an array.

In [67]:
x.shape = 2, 4
x

array([[0, 1, 2, 3],
       [4, 5, 6, 7]], dtype=int8)

In [68]:
x.strides

(4, 1)

In [69]:
str(x.data)

'<memory at 0x7f06954b3558>'

In [71]:
x.shape = 1, 4, 2
x

array([[[0, 1],
        [2, 3],
        [4, 5],
        [6, 7]]], dtype=int8)

In [72]:
x.strides

(8, 2, 1)

In [73]:
str(x.data)

'<memory at 0x7f06955d08b8>'

> **Notes**:  
strides (4, 1) - means the elements in the first dimension are four bytes apart, and the array need to jump four bytes to find the next row, but the elements in the second dimension are still 1 byte apart, jumping one byte to find the next column.

Same content with different strides

In [8]:
x = np.ones((10000,))
y = np.ones((10000 * 1000, ))[::100]
x.shape, y.shape

((10000,), (100000,))

In [9]:
x == y

  if __name__ == '__main__':


False

In [10]:
x.strides, y.strides

((8,), (800,))

In [11]:
x.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [12]:
y.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

> **Note**:  
x array is continuous in both the C and Fortran order while y is not. x is created continuously, so in the same dimension each element is eight bytes apart; y is created from a subset of 10000 * 100 for every 100 elements, so the index schema in the memory layout is not continuous.

The memory layout does affect the performance

In [13]:
%timeit x.sum()

The slowest run took 298.74 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 5.16 µs per loop


In [14]:
%timeit y.sum()

The slowest run took 11.68 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 167 µs per loop


## Why memory layout affects performance
Typically with fixed cache size, when the stride size gets larger, the hit rate will be lower, while the miss rate will be higher. The cache hit time and miss time compose the average data access time.  
x with smaller stride is faster than the larger strides of y. The reason is that the CPU is pulling data from the main memory to its cache in blocks, and the smaller stride means fewer transfers are needed.  
See the following picture: it is obvious that if x and y are both required, 100 blue boxes of data, the required cache time for x will be less.
![img](image_04_001.jpg)

## Structured arrays
NumPy provides capabilities to create arrays of records, as multiple data types live in one NumPy array. But one principle in NumPy that still needs to be honored is that the data type in each field(column) needs to be homogeneous.

In [29]:
x = np.empty((2,), dtype=('i4,f4,a10'))
x[:] = [(1, 0.5, 'NumPy'), (10, -0.5, 'Essential')]
x

array([(1, 0.5, b'NumPy'), (10, -0.5, b'Essential')], 
      dtype=[('f0', '<i4'), ('f1', '<f4'), ('f2', 'S10')])

> **Note**:  
* i4 - i stands for signed integer, 4 means 4 bytes(32-bit)
* f4 - f stands for float, 4 means 4 bytes(32-bit)
* a10 - a string of length less than or equal to 10  
* < stands for byteorder *big-endian* ( indicating the memory address increase order)

>f0, f1, f2 are the default field name, you can specify your field names.

In [17]:
x[0]

(1, 0.5, b'NumPy')

In [18]:
x['f2']

array([b'NumPy', b'Essential'], 
      dtype='|S10')

> **Note**:  
Use both index and field name to obtain the value of certain field.

In [30]:
y = x['f0']
x, y

(array([(1, 0.5, b'NumPy'), (10, -0.5, b'Essential')], 
       dtype=[('f0', '<i4'), ('f1', '<f4'), ('f2', 'S10')]),
 array([ 1, 10], dtype=int32))

In [31]:
y[:] = y * 10
x, y

(array([(10, 0.5, b'NumPy'), (100, -0.5, b'Essential')], 
       dtype=[('f0', '<i4'), ('f1', '<f4'), ('f2', 'S10')]),
 array([ 10, 100], dtype=int32))

In [32]:
y[:] = y + 0.5
x, y

(array([(10, 0.5, b'NumPy'), (100, -0.5, b'Essential')], 
       dtype=[('f0', '<i4'), ('f1', '<f4'), ('f2', 'S10')]),
 array([ 10, 100], dtype=int32))

> **Note**:  
y is the view of field f0 in x. In the record arrays, the characteristics of NumPy arrays still remain. When multiply the scalar 10, it still applies to whole array of y(the broadcasting rule), and it always honors the data type. We add 0.5 to y, but since the data type of field f0 is a 32-bit integer, the result is till [10, 100]. Also, y is a view of f0 in x, so they share the same memory block. The values in x have also changed.
![img](image_05_001.jpg)

You can also prefix the string arguments with a repeated number or a shape to define the dimension of the field, but it's still considered as just one field in the record array.

In [33]:
z = np.ones((2,), dtype=('3i4, (2, 3)f4'))
z

array([([1, 1, 1], [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]),
       ([1, 1, 1], [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])], 
      dtype=[('f0', '<i4', (3,)), ('f1', '<f4', (2, 3))])

Access field names

In [36]:
x.dtype.names

('f0', 'f1', 'f2')

In [37]:
x.dtype.names = ('id', 'value', 'note')
x

array([(10, 0.5, b'NumPy'), (100, -0.5, b'Essential')], 
      dtype=[('id', '<i4'), ('value', '<f4'), ('note', 'S10')])

In [38]:
list_ex = np.zeros((2, ), dtype=[('id', 'i4'), ('value', 'f4', (2, ))])
list_ex

array([(0, [0.0, 0.0]), (0, [0.0, 0.0])], 
      dtype=[('id', '<i4'), ('value', '<f4', (2,))])

In [39]:
dict_ex = np.zeros((2, ), dtype={'names': ['id', 'value'], 'formats': ['i4', '2f4']})
dict_ex

array([(0, [0.0, 0.0]), (0, [0.0, 0.0])], 
      dtype=[('id', '<i4'), ('value', '<f4', (2,))])

> **Note**:  
* tupe - (field name, data type, shape), shape is optional, which also can be set with the data type argument
* dict - two required keys, and each key has an equally sized list of values: {'names': [], 'formats': []}

In [40]:
x[['id', 'note']]

array([(10, b'NumPy'), (100, b'Essential')], 
      dtype=[('id', '<i4'), ('note', 'S10')])

### Dates and time in NumPy
To differentiate from the **datetime** object in Python, the data type is called **datetime64**.

In [43]:
# create
x = np.datetime64('2015-04-01')
y = np.datetime64('2016-12')
x.dtype, y.dtype

(dtype('<M8[D]'), dtype('<M8[M]'))

In [45]:
y = np.datetime64('2016-12', 'D')
y, y.dtype

(numpy.datetime64('2016-12-01'), dtype('<M8[D]'))

In [46]:
x = np.arange('2015-01', '2015-04', dtype='datetime64[M]')
x

array(['2015-01', '2015-02', '2015-03'], dtype='datetime64[M]')

In [49]:
x = np.arange('2015-01', '2015-04', dtype='datetime64[D]')
x

array(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
       '2015-01-05', '2015-01-06', '2015-01-07', '2015-01-08',
       '2015-01-09', '2015-01-10', '2015-01-11', '2015-01-12',
       '2015-01-13', '2015-01-14', '2015-01-15', '2015-01-16',
       '2015-01-17', '2015-01-18', '2015-01-19', '2015-01-20',
       '2015-01-21', '2015-01-22', '2015-01-23', '2015-01-24',
       '2015-01-25', '2015-01-26', '2015-01-27', '2015-01-28',
       '2015-01-29', '2015-01-30', '2015-01-31', '2015-02-01',
       '2015-02-02', '2015-02-03', '2015-02-04', '2015-02-05',
       '2015-02-06', '2015-02-07', '2015-02-08', '2015-02-09',
       '2015-02-10', '2015-02-11', '2015-02-12', '2015-02-13',
       '2015-02-14', '2015-02-15', '2015-02-16', '2015-02-17',
       '2015-02-18', '2015-02-19', '2015-02-20', '2015-02-21',
       '2015-02-22', '2015-02-23', '2015-02-24', '2015-02-25',
       '2015-02-26', '2015-02-27', '2015-02-28', '2015-03-01',
       '2015-03-02', '2015-03-03', '2015-03-04', '2015-

In [51]:
y = np.datetime64('2016-12-20', 's')
y, y.dtype

(numpy.datetime64('2016-12-20T00:00:00'), dtype('<M8[s]'))

**timedelta64** in action

In [52]:
x = np.arange('2015-01', '2015-04', dtype='datetime64[M]')
x

array(['2015-01', '2015-02', '2015-03'], dtype='datetime64[M]')

In [53]:
y = np.datetime64('2015-01-01')
x - y

array([ 0, 31, 59], dtype='timedelta64[D]')

In [54]:
np.datetime64('2015', 'Y') + np.timedelta64(12, 'M')

numpy.datetime64('2016-01')

In [55]:
np.timedelta64(1, 'W') / np.timedelta64(1, 'D')

7.0

### File I/O and NumPy

* Create a record array

In [109]:
id = np.arange(1000)
value = np.random.random(1000)
day = np.random.randint(0, 365, 1000) * np.timedelta64(1, 'D')
date = np.datetime64('2015-01-01') + day
rec_array = np.core.records.fromarrays([id, value, date], names='id, value, date', formats='i4, f4, a10')


TypeError: string operation on non-string array

In [82]:
np.savetxt('record.csv', rec_array, fmt='%i,%.4f,%s')

* Read data from file into an array

In [83]:
read_array = np.genfromtxt('record.csv', dtype='i4, f4, a10', delimiter=',', skip_header=0)
read_array[:5]

array([(0, 0.9501000046730042, b"b'2015-02-"),
       (1, 0.11909999698400497, b"b'2015-03-"),
       (2, 0.7250000238418579, b"b'2015-10-"),
       (3, 0.29030001163482666, b"b'2015-07-"),
       (4, 0.6521999835968018, b"b'2015-06-")], 
      dtype=[('f0', '<i4'), ('f1', '<f4'), ('f2', 'S10')])

In [68]:
a = b'2015-09-20'
a.decode('utf-8')

'2015-09-20'