# Advanced NumPy

In [2]:
from __future__ import division
from numpy.random import randn
from pandas import Series
import numpy as np
np.set_printoptions(precision=4)
import sys

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the final one
# (IPython's default for this option is 'last_expr').
InteractiveShell.ast_node_interactivity = "all"

## ndarray object internals

### NumPy dtype hierarchy

In [2]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
# issubdtype tests a concrete dtype against an abstract parent type in NumPy's dtype hierarchy.
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

True

In [3]:
# List the parent classes of np.float64, from most specific to most general.
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

## Advanced array manipulation

### Reshaping arrays

In [5]:
arr = np.arange(8)
arr
# reshape returns the same 8 values laid out as 4 rows x 2 columns; arr keeps its 1-D shape.
arr.reshape((4, 2))

array([0, 1, 2, 3, 4, 5, 6, 7])

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [6]:
# A second reshape re-flows the same row-major sequence into 2 x 4.
# Note: this is NOT a transpose (the transpose of the 4 x 2 array would start with [0, 2, 4, 6]).
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [7]:
arr = np.arange(15)
# -1 lets NumPy infer that dimension from the array size (15 / 5 = 3 columns).
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [9]:
other_arr = np.ones((3, 5))
other_arr.shape
# A shape tuple taken from another array can be passed straight to reshape.
arr.reshape(other_arr.shape)

(3, 5)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [10]:
arr = np.arange(15).reshape((5, 3))
arr
# ravel flattens to 1-D and only copies the data when it has to (it returns a view where possible).
arr.ravel()

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [11]:
# flatten also produces a 1-D result, but it always returns a copy of the data.
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C vs. Fortran order

C 순서는 행(row) 우선, Fortran 순서는 열(column) 우선이다. <br>
아래 3 x 4 예제에서 C 순서는 행 단위로 0 1 2 3 4 ... 순으로 읽고 <br>
Fortran 순서는 열 단위로 0 4 8 1 5 9 ... 순으로 읽는다.

In [12]:
arr = np.arange(12).reshape((3, 4))
arr
# With no argument ravel uses 'C' (row-major) order.
arr.ravel()
# 'F' walks the array column by column instead.
arr.ravel('F')
# Explicit 'C' gives the same result as the default call.
arr.ravel('C')

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [13]:
# order defaults to 'C' when omitted; with order='F' the values fill the new shape column by column.
arr2 = np.arange(12).reshape((3,4), order='F')
arr2

array([[ 0,  3,  6,  9],
       [ 1,  4,  7, 10],
       [ 2,  5,  8, 11]])

### Concatenating and Splitting arrays

행열 붙이고 자르기 

In [14]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)  # join along axis 0: arr2's rows are appended below arr1
np.concatenate([arr1, arr2], axis=1)  # join along axis 1: arr2's columns are appended to the right

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [15]:
# Stacking helpers for 2-D arrays: vstack gives the axis=0 result, hstack the axis=1 result.
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [16]:
# The split points [1, 3] cut arr into arr[:1], arr[1:3] and arr[3:].
from numpy.random import randn
arr = randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
second
third

array([[-1.6611, -0.0741],
       [ 0.6484,  0.6166],
       [ 0.6967, -0.4309],
       [ 0.0563, -0.4311],
       [-0.5453, -0.7002]])

array([[-1.6611, -0.0741]])

array([[ 0.6484,  0.6166],
       [ 0.6967, -0.4309]])

array([[ 0.0563, -0.4311],
       [-0.5453, -0.7002]])

#### Stacking helpers: 

In [19]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = randn(3, 2)

arr1
arr2
# r_ stacks its arguments as rows, c_ joins them as columns; the integer and
# float inputs are converted to a common dtype (float here, as the output shows).
np.r_[arr1, arr2]
# The 1-D arr (length 6) is appended as one extra column to the 6-row stack.
np.c_[np.r_[arr1, arr2], arr]
np.c_[arr1, arr2]

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[-1.0822, -2.1326],
       [ 0.9882,  0.3896],
       [ 1.7349, -0.8208]])

array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [-1.0822, -2.1326],
       [ 0.9882,  0.3896],
       [ 1.7349, -0.8208]])

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [-1.0822, -2.1326,  3.    ],
       [ 0.9882,  0.3896,  4.    ],
       [ 1.7349, -0.8208,  5.    ]])

array([[ 0.    ,  1.    , -1.0822, -2.1326],
       [ 2.    ,  3.    ,  0.9882,  0.3896],
       [ 4.    ,  5.    ,  1.7349, -0.8208]])

In [20]:
# Slices given to c_ are expanded into integer ranges, one range per column.
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating elements: tile and repeat
반복 

In [4]:
arr = np.arange(3)
# Repeat every element three times in a row.
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [7]:
arr = randn(2, 2)
arr
# With axis=0 each row is repeated twice.
arr.repeat(2, axis=0)
# repeat returned a new array; arr itself is unchanged.
arr

array([[-0.4549, -0.8227],
       [ 0.2493,  0.4801]])

array([[-0.4549, -0.8227],
       [-0.4549, -0.8227],
       [ 0.2493,  0.4801],
       [ 0.2493,  0.4801]])

array([[-0.4549, -0.8227],
       [ 0.2493,  0.4801]])

In [8]:
# A list of counts repeats each slice along the axis a different number of times:
# the first row (or column) twice, the second three times.
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)

array([[-0.4549, -0.8227],
       [-0.4549, -0.8227],
       [ 0.2493,  0.4801],
       [ 0.2493,  0.4801],
       [ 0.2493,  0.4801]])

array([[-0.4549, -0.4549, -0.8227, -0.8227, -0.8227],
       [ 0.2493,  0.2493,  0.4801,  0.4801,  0.4801]])

In [9]:
# tile lays out whole copies of the array side by side, like tiles.
arr
np.tile(arr, 2)

array([[-0.4549, -0.8227],
       [ 0.2493,  0.4801]])

array([[-0.4549, -0.8227, -0.4549, -0.8227],
       [ 0.2493,  0.4801,  0.2493,  0.4801]])

In [10]:
arr
np.tile(arr, (2, 1))  # 2 copies along the rows, 1 along the columns
np.tile(arr, (3, 2))  # 3 copies along the rows, 2 along the columns

array([[-0.4549, -0.8227],
       [ 0.2493,  0.4801]])

array([[-0.4549, -0.8227],
       [ 0.2493,  0.4801],
       [-0.4549, -0.8227],
       [ 0.2493,  0.4801]])

array([[-0.4549, -0.8227, -0.4549, -0.8227],
       [ 0.2493,  0.4801,  0.2493,  0.4801],
       [-0.4549, -0.8227, -0.4549, -0.8227],
       [ 0.2493,  0.4801,  0.2493,  0.4801],
       [-0.4549, -0.8227, -0.4549, -0.8227],
       [ 0.2493,  0.4801,  0.2493,  0.4801]])

### Fancy indexing equivalents: take and put
원소를 뽑을때 정수값을 이용 하는 방법 

In [12]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr
# Fancy indexing: pick the elements at the listed integer positions, in that order.
arr[inds]

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

array([700, 100, 200, 600])

In [13]:
# take: equivalent to arr[inds] here, so it is rarely needed on its own.
arr.take(inds)
# put: writes in place; a scalar value is stored at every listed index.
arr.put(inds, 42)
arr
# With a list of values, each index receives the value at the matching position.
arr.put(inds, [40, 41, 42, 43])
arr

array([700, 100, 200, 600])

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [14]:
inds = [2, 0, 2, 1]
arr = randn(2, 4)
arr
arr.take(inds, axis=1)  # for each of the 2 rows, pick the columns at the positions in inds

array([[-0.8443,  0.935 ,  0.8904, -0.8638],
       [-0.1668, -0.1   ,  1.2965,  0.2639]])

array([[ 0.8904, -0.8443,  0.8904,  0.935 ],
       [ 1.2965, -0.1668,  1.2965, -0.1   ]])

## Broadcasting

(vectorizing)<br>
numpy 가 산술연산시 형상이 다른 배열을 처리하는 방법 <br>
차원이 큰 것에 맞춰서 연산함 (차원을 맞추고, 길이를 맞추고)

예) <br>
A = 8 x 1 x 6 x 1 (4차원)<br>
B = ______  7 x 1 x 5 (3차원)<br>
result = 8 x 7 x 6 x 5

뒤쪽(마지막) 차원부터 두 값을 비교했을 때, 값이 서로 같거나 둘 중 하나가 1이어야 브로드캐스팅이 가능하다. (예: A의 1과 B의 5를 비교하면 5로 확장된다.)


In [15]:
arr = np.arange(5)
arr
# The scalar 4 is broadcast to every element of arr.
arr * 4

array([0, 1, 2, 3, 4])

array([ 0,  4,  8, 12, 16])

In [16]:
arr = randn(4, 3)
# Column means: shape (3,), which lines up with the trailing dimension of arr.
arr.mean(0)
# The (3,) means are broadcast across all 4 rows.
demeaned = arr - arr.mean(0)
demeaned
# Column means of the result are zero up to floating-point rounding.
demeaned.mean(0)

array([ 0.0894,  0.8736, -0.6714])

array([[-1.3342,  0.3819, -1.1316],
       [-0.5956, -0.0325,  0.8409],
       [ 1.0679,  0.2255,  0.0609],
       [ 0.862 , -0.5749,  0.2298]])

array([ -5.5511e-17,   0.0000e+00,  -2.7756e-17])

In [17]:
arr
row_means = arr.mean(1)
# Row means have shape (4,); reshaping to (4, 1) makes them broadcast across the 3 columns.
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([[-1.2448,  1.2556, -1.8031],
       [-0.5062,  0.8411,  0.1695],
       [ 1.1573,  1.0992, -0.6105],
       [ 0.9514,  0.2987, -0.4416]])

array([[-0.5974],
       [ 0.1681],
       [ 0.5487],
       [ 0.2695]])

array([  0.0000e+00,   9.2519e-18,   0.0000e+00,  -3.7007e-17])

### Broadcasting over other axes

In [18]:
# Intentional failure: shapes (4, 3) and (4,) cannot be broadcast together, so this raises ValueError.
arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [19]:
# Reshaping the row means to (4, 1) makes the subtraction broadcast across the columns.
arr - arr.mean(1).reshape((4, 1))

array([[ -6.4737e-01,   1.8530e+00,  -1.2056e+00],
       [ -6.7435e-01,   6.7296e-01,   1.3876e-03],
       [  6.0867e-01,   5.5052e-01,  -1.1592e+00],
       [  6.8189e-01,   2.9221e-02,  -7.1111e-01]])

In [20]:
# np.newaxis inserts a new length-1 axis at that position of the index (here in the middle).
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape

(4, 1, 4)

In [21]:
arr_1d = np.random.normal(size=3)
# Trailing newaxis -> column of shape (3, 1); leading newaxis -> row of shape (1, 3).
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]

array([[ 0.3105],
       [ 0.7659],
       [ 0.5964]])

array([[ 0.3105,  0.7659,  0.5964]])

In [23]:
arr = randn(3, 4, 5)
arr
# Means over the last axis have shape (3, 4).
depth_means = arr.mean(2)
depth_means
# Re-insert the reduced axis as length 1 so the means broadcast against (3, 4, 5).
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[[-0.1627, -0.3471, -1.6603, -1.0015,  0.7074],
        [-1.4649, -0.0533, -1.3971,  1.7916, -0.3879],
        [ 0.1275, -0.0235,  0.8029,  0.4632,  0.4793],
        [ 0.3362, -0.7203, -0.9913,  0.144 ,  0.4918]],

       [[-0.5236, -0.6149,  0.2624,  0.6757, -1.0799],
        [-0.6959, -0.5137, -1.3415,  1.0566,  0.1961],
        [-0.4206, -0.4819,  1.5886, -0.0901, -0.0466],
        [ 0.6782,  0.0044,  1.1511, -0.7488,  0.5402]],

       [[-1.1964, -2.6732, -1.4008, -0.4171, -0.0545],
        [-0.851 ,  0.5707,  1.5874,  1.9146, -0.2462],
        [-0.0558,  0.2023, -0.4823, -1.3268, -1.6883],
        [-0.1373, -2.8082,  1.2845, -1.376 ,  0.3344]]])

array([[-0.4929, -0.3023,  0.3699, -0.1479],
       [-0.2561, -0.2597,  0.1099,  0.325 ],
       [-1.1484,  0.5951, -0.6702, -0.5405]])

array([[ -8.8818e-17,   4.4409e-17,   1.1102e-17,   0.0000e+00],
       [  0.0000e+00,  -1.1102e-17,  -1.6653e-17,  -3.3307e-17],
       [ -4.4409e-17,  -2.2204e-17,   0.0000e+00,   6.6613e-17]])

In [24]:
def demean_axis(arr, axis=0):
    """Subtract the mean taken along ``axis`` from ``arr``.

    Parameters
    ----------
    arr : ndarray
        Input array; only its ``mean`` method and ``ndim`` attribute are used.
    axis : int, default 0
        Axis along which the means are computed and removed.

    Returns
    -------
    ndarray
        Array with the same shape as ``arr`` whose mean along ``axis`` is
        zero up to floating-point rounding.
    """
    means = arr.mean(axis)

    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # A multi-dimensional index must be a tuple: indexing with a plain list
    # that holds slices/None is treated as fancy indexing by current NumPy
    # and raises an error instead of inserting the new axis.
    return arr - means[tuple(indexer)]

In [33]:
# Remove the axis-0 means from arr with the demean_axis helper.
demeaned =demean_axis(arr, 0)
demeaned

array([[-1.6648, -1.6648, -1.6648],
       [ 0.2142,  0.2142,  0.2142],
       [ 0.1452,  0.1452,  0.1452],
       [ 1.3053,  1.3053,  1.3053]])

### Setting array values by broadcasting

In [25]:
arr = np.zeros((4, 3))
arr[:] = 5  # the scalar is broadcast to fill the whole 4 x 3 array
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [34]:
col = np.array([1.28, -0.42, 0.44, 1.6])
col
# Assigning a (4, 1) column broadcasts it across all 3 columns of arr.
arr[:] = col[:, np.newaxis]
arr
# The same works for a slice: a (2, 1) value fills the first two rows.
arr[:2] = [[-1.37], [0.509]]
arr

array([ 1.28, -0.42,  0.44,  1.6 ])

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## Advanced ufunc usage

### Ufunc instance methods

In [None]:
arr = np.arange(10)
# reduce applies the ufunc cumulatively along an axis; for this 1-D array
# np.add.reduce gives the same total as arr.sum().
np.add.reduce(arr)
arr.sum()

In [None]:
# Fix NumPy's global random seed so the random arrays drawn after this point are repeatable.
np.random.seed(12346)

In [None]:
arr = randn(5, 5)
arr[::2].sort(1) # sort a few rows
# Compare each element with its right-hand neighbour ...
arr[:, :-1] < arr[:, 1:]
# ... and AND the comparisons along each row: True marks rows in strictly ascending order.
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

In [None]:
arr = np.arange(15).reshape((3, 5))
# accumulate keeps the intermediate results: here a running sum along each row.
np.add.accumulate(arr, axis=1)

In [None]:
arr = np.arange(3).repeat([1, 2, 2])
arr
# outer computes the product of every element of arr with every element of np.arange(5).
np.multiply.outer(arr, np.arange(5))

In [None]:
# The shape of an outer result is the two input shapes joined: (3, 4) + (5,) -> (3, 4, 5).
result = np.subtract.outer(randn(3, 4), randn(5))
result.shape

In [None]:
arr = np.arange(10)
# reduceat reduces between consecutive offsets: sums of arr[0:5], arr[5:8] and arr[8:].
np.add.reduceat(arr, [0, 5, 8])

In [None]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
# Along axis 1: sum columns 0:2, 2:4 and 4: of every row.
np.add.reduceat(arr, [0, 2, 4], axis=1)

### Custom ufuncs

In [None]:
def add_elements(x, y):
    """Return ``x + y`` for one pair of operands."""
    total = x + y
    return total
# frompyfunc wraps the Python function as a ufunc taking 2 inputs and producing 1 output;
# the results come back as an array of Python objects (dtype=object).
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

In [None]:
# vectorize is the alternative wrapper; otypes declares the output dtype (float64 here).
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

In [None]:
arr = randn(10000)
# Time the Python-level wrapper against the built-in np.add on the same data.
# NOTE(review): the wrapper calls add_elements once per element, so it is expected to be
# far slower than np.add -- no timings are recorded in this notebook to confirm.
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

## Structured and record arrays

In [None]:
# A structured dtype is a list of (field name, field type) pairs;
# each element of sarr is one record with a float64 'x' and an int32 'y'.
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

In [None]:
# Indexing by position returns one record; its fields are then accessed by name.
sarr[0]
sarr[0]['y']

In [None]:
# Indexing by field name returns that field for every record.
sarr['x']

### Nested dtypes and multidimensional fields

In [None]:
# A third tuple item gives the field a shape: each record's 'x' holds 3 int64 values.
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr

In [None]:
# The 'x' field of a single record is a length-3 array.
arr[0]['x']

In [None]:
# Across all 4 records the 'x' field forms a (4, 3) array.
arr['x']

In [None]:
# Fields can themselves be structured: 'x' has sub-fields 'a' (float64) and 'b' (float32).
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
# Nested fields are reached by chaining the field names.
data['x']['a']

### Why use structured arrays?

### Structured array manipulations: numpy.lib.recfunctions

## More about sorting

In [None]:
arr = randn(6)
# ndarray.sort sorts in place (and returns None), so arr is displayed on its own line.
arr.sort()
arr

In [None]:
arr = randn(3, 5)
arr
# arr[:, 0] is a view, so sorting it in place reorders the first column of arr itself.
arr[:, 0].sort()  # Sort first column values in-place
arr

In [None]:
arr = randn(5)
arr
# np.sort returns a sorted copy ...
np.sort(arr)
# ... and leaves the original array untouched.
arr

In [None]:
arr = randn(3, 5)
arr
# Sort every row independently, in place.
arr.sort(axis=1)
arr

In [None]:
# Reverse the column order; with rows sorted ascending by the previous cell this shows them descending.
arr[:, ::-1]

### Indirect sorts: argsort and lexsort

In [None]:
values = np.array([5, 0, 1, 3, 2])
# argsort returns the indices that would sort the array rather than the sorted values.
indexer = values.argsort()
indexer
# Indexing with those indices yields the values in sorted order.
values[indexer]

In [None]:
arr = randn(3, 5)
arr[0] = values
arr
# Reorder whole columns so that the first row ends up sorted.
arr[:, arr[0].argsort()]

In [None]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
# lexsort uses the LAST key as the primary sort key: order by last name,
# breaking ties by first name.
sorter = np.lexsort((first_name, last_name))
# zip is lazy on Python 3, so materialize the pairs to display them
# (otherwise the cell only shows a zip object).
sorted_names = list(zip(last_name[sorter], first_name[sorter]))
sorted_names

### Alternate sort algorithms

In [None]:
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
# mergesort is stable: elements with equal keys keep their original relative order.
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)

### numpy.searchsorted: Finding elements in a sorted array

In [None]:
arr = np.array([0, 1, 7, 12, 15])
# Position at which 9 would have to be inserted to keep arr sorted.
arr.searchsorted(9)

In [None]:
# An array of values yields an array of insertion positions.
arr.searchsorted([0, 8, 11, 16])

In [None]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
# Default side='left': position before the first element equal to the value.
arr.searchsorted([0, 1])
# side='right': position after the last element equal to the value.
arr.searchsorted([0, 1], side='right')

In [None]:
# 50 whole-number values drawn uniformly from [0, 10000), plus the bin edges used to group them.
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

In [None]:
# The insertion position of each value among the bin edges labels the interval it falls in.
labels = bins.searchsorted(data)
labels

In [None]:
# Group the values by their bin label with pandas and average each group.
Series(data).groupby(labels).mean()

In [None]:
# np.digitize also assigns bin indices. With its default right=False it corresponds to
# bins.searchsorted(data, side='right'), so it differs from the labels above only for
# values exactly equal to a bin edge.
np.digitize(data, bins)

## NumPy matrix class

In [None]:
X =  np.array([[ 8.82768214,  3.82222409, -1.14276475,  2.04411587],
               [ 3.82222409,  6.75272284,  0.83909108,  2.08293758],
               [-1.14276475,  0.83909108,  5.01690521,  0.79573241],
               [ 2.04411587,  2.08293758,  0.79573241,  6.24095859]])
# Integer indexing drops the axis; slicing with :1 keeps the column as a (4, 1) array.
X[:, 0]  # one-dimensional
y = X[:, :1]  # two-dimensional by slicing
X
y

In [None]:
# The quadratic form y^T X y with plain ndarrays requires nested np.dot calls.
np.dot(y.T, np.dot(X, y))

In [None]:
# With np.matrix, selecting a single column still yields a 2-D object
# and * means matrix multiplication.
Xm = np.matrix(X)
ym = Xm[:, 0]
Xm
ym
ym.T * Xm * ym

In [None]:
# .I is the matrix inverse, so this product should be the identity matrix up to rounding error.
Xm.I * X

## Advanced array input and output

### Memory-mapped files

In [None]:
# mode='w+' creates (or overwrites) the file 'mymmap' in the current directory.
# 10000 x 10000 float64 values occupy 800 MB on disk.
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap

In [None]:
# Take the first 5 rows of the memory map; the next cell assigns into this slice.
section = mmap[:5]

In [None]:
# Fill the 5-row section with random values, then flush pending changes to the file on disk.
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
# Drop the name so the map can be re-opened from the file in the next cell.
del mmap

In [None]:
# Re-open the existing file. No mode is given, so np.memmap uses its default ('r+').
# dtype and shape must be supplied again because the raw file stores no metadata.
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

In [None]:
# Clean up: %xdel deletes the variable and tries to clear IPython's references to it;
# then the backing file is removed from disk.
# NOTE(review): `!rm` is a POSIX shell command and will not work in a plain Windows shell.
%xdel mmap
!rm mymmap

### HDF5 and other array storage options

## Performance tips

### The importance of contiguous memory

In [None]:
# Two arrays with identical contents but different memory layouts:
# 'C' is row-major, 'F' is column-major.
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
# .flags reports the layout information of each array.
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous

In [None]:
# Time row sums on each layout.
# NOTE(review): summing along rows is expected to favour the C-ordered array, whose rows are
# contiguous in memory -- no timings are recorded in this notebook to confirm.
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)

In [None]:
# copy('C') returns a new array laid out in C order; inspect its flags.
arr_f.copy('C').flags

In [None]:
# A block of leading rows of a C-ordered array is still one contiguous chunk of memory ...
arr_c[:50].flags.contiguous
# ... whereas a block of leading columns skips part of every row; inspect its flags.
arr_c[:, :50].flags

In [None]:
# Release the two large arrays.
%xdel arr_c
%xdel arr_f
# NOTE(review): this changes the kernel's working directory to the parent directory; no matching
# earlier `%cd` is visible in this notebook, so confirm it is intended before re-running cells.
%cd ..

## Other speed options: Cython, f2py, C

```cython
from numpy cimport ndarray, float64_t

def sum_elements(ndarray[float64_t] arr):
    # Sum the elements of a float64 ndarray with an explicit, C-typed loop.
    # The buffer declaration on `arr` fixes the element type to float64_t.

    # Loop index and length are declared as C integers (Py_ssize_t).
    cdef Py_ssize_t i, n = len(arr)
    # The accumulator is a C double, matching the element type.
    cdef float64_t result = 0

    for i in range(n):
        result += arr[i]

    return result
```