# `numpy` tutorial for my penguin

大千世界，其实都是信息，而信息都可以表示为二进制数字，所以数字和数组就非常重要了。`numpy`是 Numerical Python 的简称，是专门与“数”打交道的工具。之前说过 Python 语言的唯一缺点就是太慢，为了达到快速运算的目的，`numpy`的核心是用C语言写成的。

## 数组 Array
`numpy`中最重要的是数组 (array)。我们先认识一下数组。

In [None]:
import numpy as np

In [None]:
# A plain Python list next to a NumPy array built from the same values.
py_list = [1, 2, 3]
# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24; passing the builtin gives the same float64 dtype.
np_list = np.array([1, 2, 3], dtype=float)

In [None]:
py_list, np_list

In [None]:
# dtype is the single element type shared by every entry of the array.
np_list.dtype

In [None]:
# A 3x3 array built from Python booleans; NumPy infers a boolean dtype.
mask = np.array([[True, True, False],
                 [False, True, True],
                 [True, False, False]])
mask.dtype

In [None]:
# astype returns a converted copy: True becomes 1 and False becomes 0.
mask = mask.astype(int)
mask

### 建立数组的几个方便办法

In [None]:
# np.zeros takes the shape as a sequence; here a list meaning 3 rows x 2 columns.
zero_array = np.zeros([3, 2])
zero_array

In [None]:
# The shape can also be given as a tuple: 3 rows x 2 columns of zeros.
zero_array = np.zeros((3, 2))
zero_array

In [None]:
print(zero_array.shape) # (rows, columns)
print(zero_array.ndim) # number of dimensions (axes), not the number of columns
print(len(zero_array)) # length of the first axis, i.e. the number of rows
print(zero_array.size) # total number of elements

In [None]:
# ones_like builds an array of ones with the same shape and dtype as its argument.
one_array = np.ones_like(zero_array)
one_array

In [None]:
# Given a 1-D input, np.diag builds a square matrix with those values on the main diagonal.
print(np.diag((1, 1, 3, 5)))
print('\n')
# np.eye(3) is the 3x3 identity matrix.
print(np.eye(3))

In [None]:
# Like range(), but the step may be fractional: 0, 0.5, ..., 9.5 (the stop value 10 is excluded).
np.arange(0, 10, 0.5)

In [None]:
# 20 evenly spaced samples starting at 0; endpoint=False leaves out the stop value 10,
# so the spacing is 0.5.
np.linspace(0, 10, 20, endpoint=False)

### 数组的变形 (reshape)

In [None]:
# A 2-D array of zeros with 10 rows and 5 columns.
array = np.zeros([10, 5])
print(array.shape) # (row, col), consistent with linear algebra convention
array

In [None]:
# .T is the transpose: the axes are reversed, so a (rows, cols) shape becomes (cols, rows).
print(array.T.shape)
array.T

In [None]:
# linspace returns a 1-D array, so its shape is the one-element tuple (20,).
array = np.linspace(0, 10, 20, endpoint=False)
array.shape

In [None]:
# Transposing a 1-D array changes nothing: there is only one axis to reverse.
array.T

In [None]:
# Reshape into an explicit 2-D row vector: 1 row, 20 columns.
array = array.reshape((1, 20))
array.shape

In [None]:
array = np.linspace(0, 10, 20, endpoint=False)
# np.newaxis inserts a length-1 axis at the position where it appears.
print(array[np.newaxis, :])  # shape (1, 20): a row vector
print(array[:, np.newaxis])  # shape (20, 1): a column vector

In [None]:
# Indexing with np.newaxis produced new arrays and left `array` itself 1-D,
# so .T still returns it unchanged.
array.T

In [None]:
array = np.linspace(0, 10, 20, endpoint=False)
# The 20 values are laid out row by row into 4 rows of 5 columns.
array.reshape((4, 5))

In [None]:
# The same 20 values as 5 rows of 4 columns; reshape returns a new array object
# and leaves the shape of `array` itself unchanged.
array.reshape((5, 4))

### 数组的切割 (slice)

In [None]:
# 20 values from 0 to 9.5 in steps of 0.5, used for the slicing examples.
x = np.linspace(0, 10, 20, endpoint=False)
print(x)

In [None]:
# Indices are zero-based: these are the 2nd, 3rd and 4th elements.
x[1], x[2], x[3]

In [None]:
# start:stop excludes the stop index, so this yields the elements at indices 1, 2 and 3.
x[1:4]

In [None]:
# Negative indices count from the end: the last and the second-to-last element.
x[-1], x[-2]

In [None]:
# Elements at positions -4, -3 and -2; the stop index -1 (the last element) is excluded.
x[-4:-1]

In [None]:
# x[14:-1] stops before the last element, whereas x[14:] runs through the end.
x[14:-1], x[14:]

In [None]:
# start:stop:step - every third element, beginning at index 1.
x[1::3]

In [None]:
x[::4] # every 4th element from index 0 ("seasonal": presumably one sample per four-period cycle)

In [None]:
# 0..19 arranged as 4 rows x 5 columns, then transposed to 5 rows x 4 columns.
y = np.arange(0, 20, 1).reshape(4, 5).T
y

In [None]:
# Two indices address [row, column]: first row, second column.
y[0, 1]

In [None]:
# ':' selects every row; -1 picks the last column.
y[:, -1]

In [None]:
# Every second row (starting with row 0) and the columns 1 and 2.
y[::2, 1:3]

In [None]:
# Assigning through an index modifies the array in place.
y[0, 0] = 99
y

In [None]:
# Two 5x4 integer arrays: y holds 0..19 (built as 4x5 and transposed),
# z holds 20..39 filled row by row.
y = np.arange(20).reshape((4, 5)).T
z = np.arange(20, 40).reshape((5, 4))
y

In [None]:
# axis=0 collapses the rows: one sum per column.
np.sum(y, axis=0)

In [None]:
# axis=1 collapses the columns: one sum per row.
np.sum(y, axis=1)

In [None]:
# Without an axis argument, every element is summed into a single scalar.
np.sum(y)

In [None]:
# Join along axis 0: z's rows are appended below y's rows (two 5x4 arrays give 10x4).
np.concatenate([y, z], axis=0)

In [None]:
# vstack stacks vertically; for 2-D inputs this matches concatenate with axis=0.
np.vstack([y, z])

In [None]:
# Join along axis 1: z's columns are appended to the right of y's (two 5x4 arrays give 5x8).
np.concatenate([y, z], axis=1)

In [None]:
# hstack stacks horizontally; for 2-D inputs this matches concatenate with axis=1.
np.hstack([y, z])

In [None]:
y

In [None]:
# Split along the rows at indices 1 and 3: a = row 0, b = rows 1-2, c = rows 3 onward.
a, b, c = np.split(y, [1, 3], axis=0)
print(a)
print(b)
print(c)

In [None]:
# vsplit splits along the rows (axis 0): a = rows 0-1, b = the remaining rows.
a, b = np.vsplit(y, [2])
print(a)
print(b)

In [None]:
# hsplit splits along the columns (axis 1): a = columns 0-1, b = the remaining columns.
a, b = np.hsplit(y, [2])
print(a)
print(b)

### 一些运算

In [None]:
# Simulated IQ scores of the same group in three different epochs: 3 rows x 50 columns
# drawn from a normal distribution with mean 100 and standard deviation 25.
iq = np.random.normal(loc=100, scale=25, size=(3, 50))
# Preview the first four columns of every row.
iq[:, :4]

In [None]:
# Descriptive stats of the whole sample: mean, median, sample standard deviation
# (ddof=1 divides by N - 1), and the mean recomputed by hand as sum / element count.
np.mean(iq), np.median(iq), np.std(iq, ddof=1), np.sum(iq) / np.size(iq)

In [None]:
# axis=1 reduces across the columns, giving one mean and one standard deviation per row.
print('Mean IQ = {0} \n std IQ = {1}'.format(
        np.mean(iq, axis=1), np.std(iq, axis=1, ddof=1)))

In [None]:
# ndarray offers a .mean() method but no .median() method (iq.median() raises
# AttributeError), so the median has to come from the np.median function.
iq.mean(), np.median(iq)

In [None]:
# Stack the per-row means and standard deviations, then transpose so that each
# row of the result is (mean, std) for one row of iq.
np.array([np.mean(iq, axis=1), np.std(iq, axis=1, ddof=1)]).T

In [None]:
# Overall maximum, followed by the maximum of each row.
np.max(iq), np.max(iq, axis=1)

In [None]:
from scipy.stats import describe
# Summary statistics computed along axis 1, i.e. separately for each row of iq.
describe(iq, axis=1)

In [None]:
# Smallest value in the first column.
np.min(iq[:, 0])

In [None]:
# argmin returns the position (here, the row index) of the smallest value, not the value itself.
np.argmin(iq[:, 0])

In [None]:
# The comparison runs over every element of iq; np.where on the resulting boolean
# array returns the indices of its True entries, one index array per axis.
np.where(iq==np.min(iq[:, 0]))

### Mask 

==; <=; >=; >; <; !=

In [None]:
# Comparing an array with a scalar yields a boolean array of the same shape.
zz = iq < 60
zz

In [None]:
# np.any is True as soon as at least one element of the mask is True.
if np.any(zz):
    print('We have idiots.')
    # Summing a boolean mask counts its True entries: first in total, then per row.
    print('We have {} idiots'.format(np.sum(zz)))
    print('We have {} idiots'.format(np.sum(zz, axis=1)))
else:
    print("We don't have idiots.")

In [None]:
# '&' on boolean arrays is an elementwise AND: True where both rows are True.
zz[0] & zz[1]

In [None]:
# Function form of the elementwise AND of the first two rows.
np.logical_and(zz[0], zz[1])

In [None]:
# .reduce folds logical_and over the listed arrays, ANDing all three rows elementwise.
np.logical_and.reduce([zz[0], zz[1], zz[2]])

In [None]:
import pandas as pd
# Load the JSON file into a DataFrame; the path is relative to the current working directory.
df = pd.read_json('./meander.json')

In [None]:
# Boolean mask selecting the rows dated on or after 2020-01-01.
# `pd.datetime` was removed in pandas 2.0; pd.Timestamp is the supported way
# to build the value being compared against.
new_year_mask = (df['date'] >= pd.Timestamp(2020, 1, 1))
# .loc applies the row mask and the column selection in one step instead of
# chained indexing.
df.loc[new_year_mask, 'year'].median()

In [None]:
# Look up the entry labelled 0 in the 'date' column and read its .year attribute.
df['date'][0].year

In [None]:
# Keep only the rows that actually have a comment. notna() treats both None and
# NaN as missing, whereas an elementwise `== None` comparison only catches None.
# to_numpy() keeps comm_mask a plain boolean ndarray, as before.
comm_mask = df['comment'].notna().to_numpy()
# .loc applies the row mask and the column selection in one step.
df.loc[comm_mask, ['movie_name', 'comment']]

## Broadcasting

In [None]:
# Broadcasting demo: 20 values as a 1-D array, and the same values as a column.
x = np.arange(-5, 5, 0.5)
y = x[:, np.newaxis]  # column vector, shape (20, 1)
# (20,) times (20, 1) broadcasts to the full (20, 20) table of products.
(x * y).shape

In [None]:
# Python's builtin min iterates over the 1-D array element by element;
# it returns the same value as x.min().
min(x)

In [None]:
import matplotlib.pyplot as plt
# x * y broadcasts the 1-D x against the column vector y into a 2-D grid of products.
# The axis limits use the array methods .min()/.max(): the builtin min/max would
# iterate over the rows of the 2-D y and return 1-element arrays instead of scalars.
plt.imshow(x * y, origin='lower',
           extent=[x.min(), x.max(), y.min(), y.max()])
plt.xlabel('x')
plt.ylabel('y')
plt.show()

## Fancy indexing

In [None]:
# 30 random integers in [0, 100) (high is exclusive), arranged as 6 rows x 5 columns.
x = np.random.randint(low=0, high=100, size=30).reshape(6, 5)

In [None]:
x

In [None]:
# Fancy indexing: from row 2, pick the columns 0, 2 and 4.
x[2, [0, 2, 4]]

In [None]:
# flatten returns a 1-D copy of the array, read row by row.
x.flatten()

In [None]:
# Indexing with an integer array gives a result shaped like the index array:
# here a 2x2 array holding the flattened elements at positions 1, 2, 3 and 2.
ind = np.array([[1, 2],
                [3, 2]])
x.flatten()[ind]

In [None]:
# A single index array applied to a 2-D array selects whole rows, so each entry
# of ind is replaced by the corresponding 5-element row: result shape (2, 2, 5).
x[ind]

In [None]:
# Paired index lists select individual elements: (row 1, col 0) and (row 3, col 2).
x[[1, 3], [0, 2]]

In [None]:
x

In [None]:
# row is a (3, 1) column holding the row indices 0, 1 and 2.
row = np.array([0, 1, 2]).reshape(-1, 1)
# The boolean mask selects columns 0, 2 and 3; broadcast against the column of
# row indices it yields the 3x3 block of rows 0-2 at those columns.
x[row, np.array([True, False, True, True, False])]

In [None]:
# For each column (axis=0), the row indices that would sort that column in ascending order.
np.argsort(x, axis=0)