# 什麼是 NumPy?
- 用於數值資料（多維陣列）的處理與運算
- 底層以 C 和 Fortran 語言實作，執行速度遠快於純 Python 迴圈
- 支援向量化 (vectorized) 運算；部分線性代數運算可透過底層 BLAS 函式庫平行處理

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt

## Array
### ndim, shape, dtype

In [None]:
# Build a one-dimensional array from a Python list, inspect it, then modify it in place.
a = np.array([1, 2, 3])   # Create a rank 1 array
print(type(a))            # Prints "<class 'numpy.ndarray'>"
print(a.shape)            # Prints "(3,)"
print(a[0], a[1], a[2])   # Prints "1 2 3"
a[0] = 5                  # Change an element of the array
print(a)                  # Prints "[5 2 3]" (NumPy prints elements without commas)

In [None]:
# A rank 2 array is built from a nested list: one inner list per row.
b = np.array([[1,2,3],[4,5,6]])    # Create a rank 2 array
print(b.shape)                     # Prints "(2, 3)": 2 rows, 3 columns
print(b[0, 0], b[0, 1], b[1, 0])   # Prints "1 2 4" (indexing is [row, column])

In [None]:
# Compare ndim (number of axes), shape (length of each axis) and dtype
# (element type) for three arrays of different size.
np1 = np.array([1])          # 1 axis, shape (1,)
np2 = np.array([3, 4, 5])    # 1 axis, shape (3,)
np3 = np.array([[1, 2, 3], 
               [4, 5, 6]])   # 2 axes, shape (2, 3)
print(f"np1 維度 : {np1.ndim}, shape : {np1.shape}, type : {np1.dtype}")
print(f"np2 維度 : {np2.ndim}, shape : {np2.shape}, type : {np2.dtype}")
print(f"np3 維度 : {np3.ndim}, shape : {np3.shape}, type : {np3.dtype}")

## reshape

In [None]:
# reshape keeps the same elements but arranges them as 3 rows x 2 columns;
# the total element count must stay the same (here 6, assuming np3 is the
# 2x3 array from the previous cell).
np3 = np3.reshape([3, 2])
print(f"np3 維度 : {np3.ndim}, shape : {np3.shape}, type : {np3.dtype}")

In [None]:
# Echo the reshaped array (a notebook displays the value of a cell's last expression).
np3

## Other way to create arrays

In [None]:
# Convenience constructors: build arrays from a shape instead of listing elements.
a = np.zeros(shape=(2, 2))                # every element is 0.0 (float dtype by default)
print(a)                                  # Prints "[[0. 0.]
                                          #          [0. 0.]]"

b = np.ones(shape=(1, 2))                 # every element is 1.0
print(b)                                  # Prints "[[1. 1.]]"

c = np.full(shape=(2, 2), fill_value=7)   # every element equals fill_value
print(c)                                  # Prints "[[7 7]
                                          #          [7 7]]" (integer dtype, taken from the fill value)

d = np.eye(N=2)                           # 2x2 identity matrix
print(d)                                  # Prints "[[1. 0.]
                                          #          [0. 1.]]"

e = np.random.random(size=(2, 2))         # uniform samples in [0.0, 1.0); differs on every run
print(e)                                  # Might print "[[0.91940167 0.08143941]
                                          #               [0.68744134 0.87236687]]"

## 索引 & Mask

In [None]:
# 3x4 grid holding the integers 1..12, filled row by row.
a = np.reshape(np.arange(1, 13), (3, 4))
a

In [None]:
# Use slicing to pull out the subarray consisting of the first 2 rows
# and columns 1 and 2; b is the following array of shape (2, 2):
# [[2 3]
#  [6 7]]
# NOTE: a basic slice is a view onto a's data, not a copy, so assigning
# into b would also change a.
b = a[:2, 1:3]
b

In [None]:
# Two ways of accessing the data in the middle row of the array.
# Mixing integer indexing with slices yields an array of lower rank,
# while using only slices yields an array of the same rank as the
# original array:
row_r1 = a[1, :]                                     # Rank 1 view of the second row of a
row_r2 = a[1:2, :]                                   # Rank 2 view of the second row of a
print(f"row_r1 : {row_r1}, shape : {row_r1.shape}")  # Prints "row_r1 : [5 6 7 8], shape : (4,)"
print(f"row_r2 : {row_r2}, shape : {row_r2.shape}")  # Prints "row_r2 : [[5 6 7 8]], shape : (1, 4)"

In [None]:
# We can make the same distinction when accessing columns of an array:
col_r1 = a[:, 1]     # integer index on the column axis -> rank 1 result
col_r2 = a[:, 1:2]   # slice on the column axis -> rank 2 result (a single column)
print(f"col_r1 : {col_r1}, shape : {col_r1.shape}")  # Prints "col_r1 : [ 2  6 10], shape : (3,)"
print(f"col_r2 : {col_r2}, shape : {col_r2.shape}")  # Prints "col_r2 : [[ 2]
                                                     #           [ 6]
                                                     #           [10]], shape : (3, 1)"

In [None]:
# 3x2 array used to demonstrate integer array indexing.
a = np.array([[1,2], [3, 4], [5, 6]])
a

In [None]:
# An example of integer array indexing.
# The first list gives row indices and the second gives column indices; they
# are paired element by element, so the returned array has shape (3,) and
# holds a[0, 0], a[1, 1] and a[2, 0].
print(a[[0, 1, 2], [0, 1, 0]])  # Prints "[1 4 5]"

In [None]:
# The above example of integer array indexing is equivalent to picking each
# element individually and collecting the results into a new array:
print(np.array([a[0, 0], a[1, 1], a[2, 0]]))  # Prints "[1 4 5]"

In [None]:
# When using integer array indexing, you can reuse the same
# element from the source array (here a[0, 1] is selected twice):
print(a[[0, 0], [1, 1]])  # Prints "[2 2]"

In [None]:
# Equivalent to the previous integer array indexing example,
# written as two explicit element lookups:
print(np.array([a[0, 1], a[0, 1]]))  # Prints "[2 2]"

In [None]:
# Source array for the row-wise selection examples: 4 rows x 3 columns
# holding the integers 1..12 in row order.
a = np.arange(1, 13).reshape(4, 3)

print(a)  # Prints "[[ 1  2  3]
          #          [ 4  5  6]
          #          [ 7  8  9]
          #          [10 11 12]]"

In [None]:
# Create an array of indices: b[i] is the column to pick from row i
b = np.array([0, 2, 0, 1])

# Select one element from each row of a using the indices in b;
# np.arange(4) supplies the row indices 0..3 that are paired with b.
print(a[np.arange(4), b])  # Prints "[ 1  6  7 11]"

In [None]:
# Mutate one element from each row of a using the indices in b.
# The update happens in place, so a itself is changed.
a[np.arange(4), b] += 10

print(a)  # Prints "[[11  2  3]
          #          [ 4  5 16]
          #          [17  8  9]
          #          [10 21 12]]"

In [None]:
# Build a boolean mask by comparing every element of a against a scalar.
a = np.array([[1,2], [3, 4], [5, 6]])

bool_idx = (a > 2)   # Find the elements of a that are bigger than 2;
                     # this returns a numpy array of Booleans of the same
                     # shape as a, where each slot of bool_idx tells
                     # whether that element of a is > 2.
bool_idx             # [[False False] [ True  True] [ True  True]]

In [None]:
# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx (the 2-D layout is not preserved in the result)
print(a[bool_idx])  # Prints "[3 4 5 6]"

In [None]:
# We can do all of the above in a single concise statement,
# building the mask inline instead of naming it first:
print(a[a > 2])     # Prints "[3 4 5 6]"

## 運算

In [None]:
# Arithmetic operators act elementwise on arrays of the same shape.
A = np.array([1,2,3])
B = np.array([4,5,6])
A + B   # array([5, 7, 9])

In [None]:
%%time

# Baseline for a timing comparison: multiply two 1,000,000-element arrays one
# element at a time in a Python loop. The products are deliberately discarded;
# only the elapsed time reported by the %%time cell magic matters.
A = np.arange(1000000)
B = np.random.random(1000000)
# Vectorized equivalent of the loop below: A*B

for i in range(len(A)):
    A[i] * B[i]

In [None]:
%%time
# Same multiplication as a single vectorized expression; compare the reported
# time with the explicit Python loop over the same A and B.
A * B

In [None]:
# Two 2x2 matrices; dtype=np.float64 forces floating-point elements even
# though the literals are integers.
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

In [None]:
# Elementwise sum; the + operator and np.add both produce the array
# [[ 6.  8.]
#  [10. 12.]]
print(x + y)
print(np.add(x, y))

In [None]:
# Elementwise difference; the - operator and np.subtract both produce the array
# [[-4. -4.]
#  [-4. -4.]]
print(x - y)
print(np.subtract(x, y))

In [None]:
# Elementwise product (NOT a matrix product); the * operator and np.multiply
# both produce the array
# [[ 5. 12.]
#  [21. 32.]]
print(x * y)
print(np.multiply(x, y))

In [None]:
# Elementwise division; the / operator and np.divide both produce the array
# [[0.2        0.33333333]
#  [0.42857143 0.5       ]]
print(x / y)
print(np.divide(x, y))

In [None]:
# Elementwise square root; produces the array
# [[1.         1.41421356]
#  [1.73205081 2.        ]]
print(np.sqrt(x))

In [None]:
# Two length-2 integer vectors used by the dot-product examples.
v = np.array([9,10])
w = np.array([11, 12])

In [None]:
# Inner product of vectors; the method and the function form both produce 219
# (9*11 + 10*12)
print(v.dot(w))
print(np.dot(v, w))

In [None]:
# Matrix / vector product; both produce the rank 1 array [29. 67.]
# (floating point, assuming x is still the float64 matrix defined above)
print(x.dot(v))
print(np.dot(x, v))

In [None]:
# Matrix / matrix product; both produce the rank 2 array
# [[19. 22.]
#  [43. 50.]]
# (floating point, assuming x and y are still the float64 matrices defined above)
print(x.dot(y))
print(np.dot(x, y))

In [None]:
# 4 rows x 50 columns holding 0..199.
x = np.arange(200).reshape(4, 50)
# axis=1 sums along each row, giving one total per row (shape (4,)).
np.sum(x, axis=1)

In [None]:
# axis=0 sums down each column, giving one total per column (shape (50,) for the 4x50 x).
np.sum(x, axis=0)

In [None]:
# The .T attribute gives the transpose: rows and columns are swapped.
x = np.array([[1,2], [3,4]])
print(x)     # Prints "[[1 2]
             #          [3 4]]"
print(" ")   # spacer line between the two matrices
print(x.T)   # Prints "[[1 3]
             #          [2 4]]"

## Broadcasting

In [None]:
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
print('x :')
print(x)
print('v :')
print(v)

# Allocate y with the same shape and dtype as x before filling it row by row.
# Without this, y is either undefined or still an array of a different shape
# left over from an earlier cell, and the row assignment below fails.
y = np.empty_like(x)

print('y:')
# Add the vector v to each row of the matrix x with an explicit loop
for i in range(x.shape[0]):
    y[i, :] = x[i, :] + v
print(y)  # Prints "[[ 2  2  4]
          #          [ 5  5  7]
          #          [ 8  8 10]
          #          [11 11 13]]"

In [None]:
# Broadcasting done by hand: replicate v once per row of x, then add the two
# equally shaped arrays elementwise.
x = np.arange(1, 13).reshape(4, 3)
v = np.array([1, 0, 1])
vv = np.tile(v, (4, 1))   # 4 stacked copies of v -> shape (4, 3)
print('vv :')
print(vv)                 # Prints "[[1 0 1]
                          #          [1 0 1]
                          #          [1 0 1]
                          #          [1 0 1]]"
y = np.add(x, vv)         # same as x + vv
print('y :')
print(y)  # Prints "[[ 2  2  4]
          #          [ 5  5  7]
          #          [ 8  8 10]
          #          [11 11 13]]"

In [None]:
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y.
# x has shape (4, 3) and v has shape (3,); broadcasting treats v as if it
# were repeated for every row, without building the repeated array.
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = x + v  # Add v to each row of x using broadcasting
print('y :')
print(y)  # Prints "[[ 2  2  4]
          #          [ 5  5  7]
          #          [ 8  8 10]
          #          [11 11 13]]"

In [None]:
# Outer product via broadcasting: reshape v into a column of shape (3, 1),
# then multiply by w of shape (2,); the result has shape (3, 2).
v = np.array([1,2,3])  # v has shape (3,)
w = np.array([4,5])    # w has shape (2,)

print(np.reshape(v, (3, 1)) * w)  # Prints "[[ 4  5]
                                  #          [ 8 10]
                                  #          [12 15]]"

In [None]:
# Add a vector to each row of a matrix
x = np.array([[1,2,3], [4,5,6]])   # x has shape (2, 3)

# Assumes v is still the shape-(3,) vector [1 2 3] from the previous cell;
# it is broadcast across both rows.
print(x + v)  # Prints "[[2 4 6]
              #          [5 7 9]]"

In [None]:
# Add a vector to each column of x: x.T has shape (3, 2), which broadcasts
# against w (assumed to be the shape-(2,) vector [4 5] defined earlier);
# transposing the sum restores shape (2, 3).
print((x.T + w).T)  # Prints "[[ 5  6  7]
                    #          [ 9 10 11]]"

In [None]:
# Same result without transposing: reshape w into a column of shape (2, 1)
# so it broadcasts across the columns of x.
print(x + np.reshape(w, (2, 1)))

In [None]:
# A scalar is broadcast to every element of the array.
print(x * 2)

## copy

In [None]:
# Source array for the alias-versus-copy demonstration: [0 1 2 3 4]
a = np.arange(5)
a

In [None]:
b = a          # plain assignment: b is another name for the same array object as a
c = b          # c is therefore also the same object as a
d = a.copy()   # copy() allocates a new, independent array with the same values

In [None]:
# Writing through b changes the single array that a, b and c all refer to
# (assuming b was bound with `b = a` as above); the copy d is not affected.
b[0] = 99
b

In [None]:
# `is` tests object identity rather than equality of values.
# With b = a, c = b and d = a.copy() this evaluates to (True, True, False).
a is b, a is c , a is d

In [None]:
# Writing to the copy changes only d; index -1 addresses the last element.
d[-1] = 99
d

In [None]:
# a shows the change made through its alias b but not the change made to the
# copy d (expected [99 1 2 3 4] if the cells above were run in order).
a

## Numpy i/o

In [None]:
# Write the same array to disk in two formats, both in the current working directory.
a = np.array([1, 2, 3])
np.save('./array.npy', a)    # NumPy binary .npy format
np.savetxt('array.txt', a)   # plain text; the default format writes each value in floating-point notation

In [None]:
# Read the binary .npy file back into an array.
b = np.load('./array.npy')
b

In [None]:
# Read the text file back; np.loadtxt parses values as float by default,
# so c holds floats even if the saved array was integer.
c = np.loadtxt('./array.txt')
c

## Demo

In [None]:
# T2 data files for the demo (T2 is presumably a temperature field, given the
# plot titles used later — TODO confirm).
t2_dir = os.path.join('data', 'T2_20200405')
t2_names = sorted(os.listdir(t2_dir))                    # every entry in the directory, in name order
t2_00 = np.loadtxt(os.path.join(t2_dir, t2_names[0]))    # parse only the first file for inspection

print(t2_names)
print('\t')   # prints a line containing only a tab; used as a visual separator
print('2020-04-05 00:00:00 data :')
print(t2_00)
print('\t')
print('data shape :')
print(t2_00.shape)

In [None]:
# Draw the last column of the first file as a 2-D image.
# assumes the file has 47*63 rows that map row by row onto a 47x63 grid — TODO confirm
plt.title(t2_names[0])
plt.imshow(t2_00[:, -1].reshape(47, 63)[::-1, :], cmap='jet')   # [::-1, :] reverses the row order (vertical flip)
plt.colorbar()

In [None]:
# Load the ocean mask array from its .npy file and show its shape.
ocean_mask = np.load('./data/ocean_mask.npy')
ocean_mask.shape

In [None]:
# Replace the value with NaN wherever the mask's last column equals 1
# (presumably ocean cells — TODO confirm) and keep it everywhere else,
# then redraw the map with the same 47x63 reshape and vertical flip as before.
t2_00_mask = np.where(ocean_mask[:, -1]==1, np.nan, t2_00[:, -1])
plt.title(t2_names[0])
plt.imshow(t2_00_mask.reshape(47, 63)[::-1, :], cmap='jet')
plt.colorbar()

- 中壢市區 : 272122.103, 2760570.123
- 三峽山區 : 292874.472, 2752963.760
- 石門水庫 : 275952.606, 2745170.842

In [None]:
# For each of the three coordinates listed above, find the row of the T2
# array whose (x, y) columns are closest to it.
loc = ((272122.103, 2760570.123), (292874.472, 2752963.760), (275952.606, 2745170.842))

# Columns 0 and 1 are treated as the x / y coordinates of each row;
# slice them once instead of once per target point.
grid_x = t2_00[:, 0]
grid_y = t2_00[:, 1]

# The squared distance is enough for argmin (no square root needed).
# The comprehension variables px / py do not leak, so the notebook-level
# arrays named x and y are no longer overwritten by this cell.
loc_index = [
    np.argmin((grid_x - px) ** 2 + (grid_y - py) ** 2)
    for px, py in loc
]

In [None]:
# Build one value per data file for four series: the average over all
# non-masked cells, and the value at each of the three located points.
t2_avg = []
t2_city = []
t2_mountain = []
t2_dam = []

# The mask does not depend on the file, so compute it once outside the loop.
masked_cells = ocean_mask[:, -1] == 1

for file_name in t2_names:
    # The last column of each file holds the values being analysed.
    values = np.loadtxt(os.path.join(t2_dir, file_name))[:, -1]
    masked_values = np.where(masked_cells, np.nan, values)

    # nanmean ignores the NaN entries introduced by the mask.
    t2_avg.append(np.nanmean(masked_values))
    # loc_index order follows loc: city, mountain, dam.
    t2_city.append(masked_values[loc_index[0]])
    t2_mountain.append(masked_values[loc_index[1]])
    t2_dam.append(masked_values[loc_index[2]])

In [None]:
# Overlay the four per-file series on one line chart, one legend entry each.
plt.title('2020-04-05 avg Temp')
for series, legend_label in (
    (t2_avg, 'avg temp'),
    (t2_city, 'city temp'),
    (t2_mountain, 'mountain temp'),
    (t2_dam, 'dam temp'),
):
    plt.plot(series, label=legend_label)
plt.legend()

# Pandas

In [None]:
import pandas as pd

## IO

In [None]:
# Load the California housing test CSV into a DataFrame and display it.
data = pd.read_csv("./data/housepricing/california_housing_test.csv") 
data

In [None]:
# Uncomment to write the DataFrame back to CSV; index=False omits the row index column.
# data.to_csv("./data.csv", index=False) 

## Read Data

In [None]:
# Print a summary: index range, column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# First rows of the DataFrame (5 by default).
data.head()

In [None]:
# Last rows of the DataFrame (5 by default).
data.tail()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column labels of the DataFrame.
data.columns

### 檢視特定Column

In [None]:
# Selecting a single column by label returns a Series.
data['median_house_value']

## Draw figure

In [None]:
# Line plot of the column's values against the row index (drawn with matplotlib).
data['total_bedrooms'].plot()

In [None]:
# Histogram of the column's value distribution.
data['median_house_value'].hist()

## Series & DataFrame

In [None]:
# The whole table is a DataFrame; a single column taken from it is a Series.
type(data), type(data['total_rooms'])

## 索引

In [None]:
# Indexing with a list of labels selects several columns and returns a DataFrame.
data[['housing_median_age', 'median_income', 'median_house_value']]

In [None]:
# .loc selects by label: rows labelled 0, 1 and 2999, restricted to three columns.
# NOTE(review): assumes the index contains label 2999 (a 3000-row file with the default index) — confirm.
data.loc[[0, 1, 2999], ['housing_median_age', 'median_income', 'median_house_value']]

In [None]:
# Label slices in .loc include BOTH endpoints, so 0:3 returns rows 0, 1, 2 and 3.
data.loc[0:3, ['housing_median_age', 'median_income', 'median_house_value']]

In [None]:
# .iloc selects by integer position; -1 addresses the last row / last column.
data.iloc[[0, 1, -1], [2, 7, -1]]

In [None]:
# Boolean mask filtering: keep only the rows whose median_house_value exceeds 450000.
data[data['median_house_value'] > 450000]

## 數值分析

In [None]:
# Basic descriptive statistics of one column.
print(f"total_rooms max : {data['total_rooms'].max()}")
print(f"total_rooms min : {data['total_rooms'].min()}")
print(f"total_rooms mean : {data['total_rooms'].mean()}")
print(f"total_rooms std : {data['total_rooms'].std()}")   # sample standard deviation (ddof=1 by default)

In [None]:
# count, mean, std, min, quartiles and max in a single call.
data['total_rooms'].describe()

In [None]:
# Pairwise correlation between columns (Pearson by default).
# NOTE(review): on pandas >= 2.0 a non-numeric column makes this raise unless
# numeric_only=True is passed; assumes every column here is numeric — confirm.
data.corr()

#### 例題:
1. 請找出收入與房屋價值的相關係數?
2. 請求出前兩百筆資料收入與房屋價值的相關係數?

## 常用func

In [None]:
# Distinct values of the column, in order of first appearance.
data['housing_median_age'].unique()

In [None]:
# Returns a new DataFrame sorted ascending by the given column; data itself is not modified.
data.sort_values(by='median_house_value')

## groupby

In [None]:
# Keep three columns and group the rows by housing_median_age.
# as_index=False keeps the group key as an ordinary column in aggregated results.
data_group = data[['housing_median_age', 'median_income', 'median_house_value']].groupby('housing_median_age', as_index=False)

In [None]:
# Mean of the remaining columns within each housing_median_age group.
data_group.mean()

## 資料聚合

In [None]:
# Three frames with identical columns (A-D) and consecutive, non-overlapping
# indexes (0-3, 4-7, 8-11); each cell is '<column><index>' so the origin of
# every value stays visible after combining the frames.
df1 = pd.DataFrame({'A':['A0', 'A1', 'A2', 'A3'], 
                    'B':['B0', 'B1', 'B2', 'B3'], 
                    'C':['C0', 'C1', 'C2', 'C3'], 
                    'D':['D0', 'D1', 'D2', 'D3']}, index=[0,1,2,3])

df2 = pd.DataFrame({'A':['A4', 'A5', 'A6', 'A7'], 
                    'B':['B4', 'B5', 'B6', 'B7'], 
                    'C':['C4', 'C5', 'C6', 'C7'], 
                    'D':['D4', 'D5', 'D6', 'D7']}, index=[4,5,6,7])

df3 = pd.DataFrame({'A':['A8', 'A9', 'A10', 'A11'], 
                    'B':['B8', 'B9', 'B10', 'B11'], 
                    'C':['C8', 'C9', 'C10', 'C11'], 
                    'D':['D8', 'D9', 'D10', 'D11']}, index=[8,9,10,11])

`pd.concat`可以依照你想要的維度做資料合併，預設為axis=0

In [None]:
# Stack the frames on top of each other (axis=0 is the default for pd.concat).
frames = [df1, df2, df3]
result = pd.concat(frames)
result

`axis` 控制合併的維度方向，`join` 設定合併的方式（`outer` 取索引的聯集、`inner` 取索引的交集）

In [None]:
# df4 shares only part of its index (2, 3) and some column names (B, D) with df1.
df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'],
                    'D': ['D2', 'D3', 'D6', 'D7'],
                    'F': ['F2', 'F3', 'F6', 'F7']}, index=[2, 3, 6, 7])

# axis=1 places the frames side by side, aligning rows on the index;
# join='outer' keeps the union of both indexes and fills the gaps with NaN.
result = pd.concat([df1, df4], axis=1, join='outer')
result

舊版 pandas 也可以使用 `DataFrame.append`，但只能在 row（axis=0）方向上做合併；`append` 已在 pandas 1.4 被棄用並於 2.0 移除，新版請改用 `pd.concat`

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat along the default axis=0 stacks the frames row-wise and gives the
# same result on every pandas version.
pd.concat([df1, df2, df3])

`merge` 類似 SQL 的 JOIN，會依照指定欄位 (column) 的值進行合併；`on` 決定以哪些欄位作為合併的 key，`how` 決定合併的方式（`inner`、`outer`、`left`、`right`）

In [None]:
# Two tables sharing the composite key (key1, key2); left carries columns
# A/B and right carries columns C/D.
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})

right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

# Inner join: keep only the key pairs present in both tables. A key pair that
# occurs twice on one side yields one output row per matching combination.
result = left.merge(right, on=['key1', 'key2'], how='inner')

In [None]:
# Display the left input table.
left

In [None]:
# Display the right input table.
right

In [None]:
# Display the merged table.
result