# 什麼是 Numpy?
- 用於資料處理
- 底層以 C 和 Fortran 語言實作
- 支援向量化（vectorized）運算，可一次對整個陣列進行計算；部分線性代數運算可透過底層 BLAS/LAPACK 進行平行處理

In [None]:
import os

import numpy as np
import matplotlib.pyplot as plt

## Array
### ndim, shape, dtype

In [None]:
# Build a 1-element vector, a 3-element vector, and a 2x3 matrix.
np1 = np.array([1])
np2 = np.array([3, 4, 5])
np3 = np.array([[1, 2, 3], [4, 5, 6]])

# Report the rank (ndim), shape, and element dtype of each array.
for label, arr in (("np1", np1), ("np2", np2), ("np3", np3)):
    print(f"{label} 維度 : {arr.ndim}, shape : {arr.shape}, type : {arr.dtype}")

In [None]:
# Broadcasting: np1 has shape (1,), so its single value is added to every element of np2.
np1 + np2

In [None]:
# Broadcasting: np2 (shape (3,)) is added to each row of np3 (shape (2, 3)).
np2 + np3

### reshape

In [None]:
# Rearrange the same six values into 3 rows x 2 columns; reshape returns a new
# array object, which is rebound to the name np3.
np3 = np3.reshape(3, 2)
print(f"np3 維度 : {np3.ndim}, shape : {np3.shape}, type : {np3.dtype}")

In [None]:
np3

## Ones & Zeros

In [None]:
# A 2x3 array filled with 0.0 and a 3x3 array filled with 1.0
# (both default to float64 because no dtype is given).
np1 = np.zeros(shape=(2, 3))
np2 = np.ones(shape=(3, 3))

In [None]:
np1

In [None]:
np2

### 索引 & Mask

In [None]:
# 5x5 array of random integers drawn from [0, 256) -- the upper bound is exclusive.
rand_arr = np.random.randint(0, 256, size=(5, 5))
rand_arr

In [None]:
# First row: a single integer index selects along axis 0.
rand_arr[0]

In [None]:
# Single element at row 0, column 1.
rand_arr[0, 1]

In [None]:
# Top-left 2x2 sub-array: rows 0-1 and columns 0-1.
rand_arr[:2, :2]

In [None]:
# Every second row and every second column (indices 0, 2, 4 on each axis).
rand_arr[::2, ::2]

In [None]:
# Element-wise comparison: yields a boolean array with the same shape as rand_arr.
rand_arr > 128

In [None]:
# Boolean-mask indexing: returns a 1-D array holding only the elements greater than 128.
rand_arr[rand_arr > 128]

## 運算

In [None]:
# Whole-array reductions (no axis given, so each one collapses to a scalar).
stats = (
    ("sum", rand_arr.sum()),
    ("mean", rand_arr.mean()),
    ("std", rand_arr.std()),
    ("max", rand_arr.max()),
    ("min", rand_arr.min()),
)
for stat_name, value in stats:
    print(f"rand_arr {stat_name} : {value}")

In [None]:
# Sum down the rows (axis 0 is collapsed): one total per column.
rand_arr.sum(axis=0)

In [None]:
# Sum across the columns (axis 1 is collapsed): one total per row.
rand_arr.sum(axis=1)

## copy

In [None]:
a = np.arange(5)
a

In [None]:
# Plain assignment only binds another name to the same array object.
b = a
c = b
# .copy() allocates a new, independent array.
d = a.copy()

In [None]:
# b is an alias of a, so this write is also visible through a and c.
b[0] = 99
b

In [None]:
# Identity check: b and c are the same object as a, while d (a copy) is not.
a is b, a is c , a is d

In [None]:
# d is an independent copy, so this write does not affect a.
d[-1] = 99
d

In [None]:
a

## Numpy i/o

In [None]:
a = np.array([1, 2, 3])
# Binary .npy format: keeps the dtype and shape of the array.
np.save('./array.npy', a)
# Plain-text format: values are written as formatted numbers, one per line for a 1-D array.
np.savetxt('array.txt', a)

In [None]:
# Load the array back from the binary .npy file written above.
b = np.load('./array.npy')
b

In [None]:
# Parse the text file; np.loadtxt returns floats by default, so the original
# integer dtype is not restored.
c = np.loadtxt('./array.txt')
c

## Demo

In [None]:
# T2 data set (labelled "多元三" in the original notes -- presumably a project name).
t2_dir = os.path.join('data', 'T2_20200405')
# Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) so that every
# remaining name is a data file that np.loadtxt can parse; sort the names so
# the files are processed in a stable order.
t2_names = sorted(name for name in os.listdir(t2_dir) if not name.startswith('.'))
# Load the first file in sorted order as the sample shown below.
t2_00 = np.loadtxt(os.path.join(t2_dir, t2_names[0]))

print(t2_names)
print('\t')
print('2020-04-05 00:00:00 data :')
print(t2_00)
print('\t')
print('data shape :')
print(t2_00.shape)

In [None]:
# Lay the last column of the first file out as a 47x63 grid and draw it with
# the row order reversed (the grid is flipped vertically for display).
grid = t2_00[:, -1].reshape(47, 63)
plt.title(t2_names[0])
plt.imshow(grid[::-1], cmap='jet')
plt.colorbar()

In [None]:
# Load the ocean mask array from disk and inspect its shape.
# NOTE(review): later cells compare its last column with 1 row-by-row against
# the T2 data, so it presumably has one row per T2 grid point -- confirm.
ocean_mask = np.load('./data/ocean_mask.npy')
ocean_mask.shape

In [None]:
# Replace every value whose mask entry equals 1 (presumably ocean cells) with
# NaN, then draw the result on the same vertically flipped 47x63 grid.
is_ocean = ocean_mask[:, -1] == 1
t2_00_mask = np.where(is_ocean, np.nan, t2_00[:, -1])
masked_grid = t2_00_mask.reshape(47, 63)
plt.title(t2_names[0])
plt.imshow(masked_grid[::-1], cmap='jet')
plt.colorbar()

- 中壢市區 : 272122.103, 2760570.123
- 三峽山區 : 292874.472, 2752963.760
- 石門水庫 : 275952.606, 2745170.842

In [None]:
# For each of the three coordinates listed above, find the row of the T2 array
# whose first two columns (used here as x and y) are closest to it.
loc = (
    (272122.103, 2760570.123),
    (292874.472, 2752963.760),
    (275952.606, 2745170.842),
)
# Squared distances are enough: taking the square root would not change the argmin.
loc_index = [
    np.argmin((t2_00[:, 0] - x) ** 2 + (t2_00[:, 1] - y) ** 2)
    for x, y in loc
]

In [None]:
# Per-file series: the NaN-aware mean over all unmasked cells, plus the value
# at each of the three rows stored in loc_index.
t2_avg = []
t2_city = []
t2_mountain = []
t2_dam = []

# The mask does not depend on the file being read, so build it once instead of
# recomputing the comparison on every iteration.
is_ocean = ocean_mask[:, -1] == 1

for name in t2_names:
    # Only the last column of each file is used.
    values = np.loadtxt(os.path.join(t2_dir, name))[:, -1]
    masked = np.where(is_ocean, np.nan, values)

    # nanmean ignores the NaN entries introduced by the mask.
    t2_avg.append(np.nanmean(masked))
    t2_city.append(masked[loc_index[0]])
    t2_mountain.append(masked[loc_index[1]])
    t2_dam.append(masked[loc_index[2]])

In [None]:
# Draw the four series on one set of axes, one labelled line each.
plt.title('2020-04-05 avg Temp')
for series, label in (
    (t2_avg, 'avg temp'),
    (t2_city, 'city temp'),
    (t2_mountain, 'mountain temp'),
    (t2_dam, 'dam temp'),
):
    plt.plot(series, label=label)
plt.legend()

# Pandas

In [None]:
import pandas as pd

## IO

In [None]:
# Read the CSV into a DataFrame; no index column is specified, so rows get the
# default 0..n-1 integer index.
data = pd.read_csv("./data/housepricing/california_housing_test.csv") 
data

In [None]:
# data.to_csv("./data.csv", index=False) 

## Read Data

In [None]:
data.info()

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.shape

In [None]:
data.columns

### 檢視特定Column

In [None]:
data['median_house_value']

## Draw figure

In [None]:
data['total_bedrooms'].plot()

In [None]:
data['median_house_value'].hist()

## Series & DataFrame

In [None]:
type(data), type(data['total_rooms'])

## 索引

In [None]:
data[['housing_median_age', 'median_income', 'median_house_value']]

In [None]:
# Label-based selection: rows labelled 0, 1 and 2999, restricted to three named columns.
data.loc[[0, 1, 2999], ['housing_median_age', 'median_income', 'median_house_value']]

In [None]:
# .loc slices are label-based and include BOTH endpoints, so 0:3 returns four rows (0, 1, 2, 3).
data.loc[0:3, ['housing_median_age', 'median_income', 'median_house_value']]

In [None]:
# Position-based selection: first, second and last rows; columns at positions 2, 7 and the last one.
data.iloc[[0, 1, -1], [2, 7, -1]]

In [None]:
# Boolean filtering: keep only the rows whose median_house_value exceeds 450000.
data[data['median_house_value'] > 450000]

## 數值分析

In [None]:
# Summary statistics for a single column, selected once and reused.
rooms = data['total_rooms']
print(f"total_rooms max : {rooms.max()}")
print(f"total_rooms min : {rooms.min()}")
print(f"total_rooms mean : {rooms.mean()}")
print(f"total_rooms std : {rooms.std()}")

In [None]:
data['total_rooms'].describe()

In [None]:
# Pairwise correlation between columns (Pearson by default).
# NOTE: since pandas 2.0 non-numeric columns are no longer dropped silently;
# pass numeric_only=True if the frame ever contains one.
data.corr()

#### 例題:
1. 請找出收入與房屋價值的相關係數?
2. 請求出前兩百筆資料收入與房屋價值的相關係數?

## 資料聚合

In [None]:
def _labelled_frame(row_labels):
    """Return a frame with columns A-D whose cells read '<column><row label>'."""
    return pd.DataFrame(
        {col: [f"{col}{row}" for row in row_labels] for col in "ABCD"},
        index=list(row_labels),
    )


# Three frames with identical columns and consecutive, non-overlapping row labels.
df1 = _labelled_frame(range(0, 4))
df2 = _labelled_frame(range(4, 8))
df3 = _labelled_frame(range(8, 12))

`pd.concat`可以依照你想要的維度做資料合併，預設為axis=0

In [None]:
# Stack the three frames vertically; axis=0 is pd.concat's default and is
# spelled out here for clarity.
frames = [df1, df2, df3]
result = pd.concat(frames, axis=0)
result

`axis`控制合併的維度方向，join設定合併的方式

In [None]:
# df4 shares only row labels 2 and 3 with df1 (labels 0-3) and adds a new column F.
df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'],
                    'D': ['D2', 'D3', 'D6', 'D7'],
                    'F': ['F2', 'F3', 'F6', 'F7']}, index=[2, 3, 6, 7])

# axis=1 places the frames side by side, aligned on the index; join='outer'
# keeps the union of row labels and fills the gaps with NaN.
result = pd.concat([df1, df4], axis=1, join='outer')
result

舊版 pandas（< 2.0）也可以使用`append`，但只能在 row(axis=0) 方向上做合併；`DataFrame.append` 已於 pandas 2.0 移除，新版請改用 `pd.concat`

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, where it
# raises AttributeError; pd.concat gives the same row-wise (axis=0) result.
pd.concat([df1, df2, df3])

`merge`類似sql語法，可以指定cols內的值合併, `on`決定以哪個columns合併，`how`可以決定合併的方式

In [None]:
# Two frames that share a composite key made of the columns key1 and key2.
left = pd.DataFrame(dict(
    key1=['K0', 'K0', 'K1', 'K2'],
    key2=['K0', 'K1', 'K0', 'K1'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))

right = pd.DataFrame(dict(
    key1=['K0', 'K1', 'K1', 'K2'],
    key2=['K0', 'K0', 'K0', 'K0'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))

# Inner join: keep only the (key1, key2) pairs that appear in both frames.
result = left.merge(right, how='inner', on=['key1', 'key2'])

In [None]:
left

In [None]:
right

In [None]:
result

# Demo

In [None]:
import datetime

In [None]:
# Round the current local time down to the previous 10-minute mark (seconds and
# microseconds are dropped), then step back 2 h 50 min for the window start.
current = datetime.datetime.now()
now = datetime.datetime(
    year=current.year,
    month=current.month,
    day=current.day,
    hour=current.hour,
    minute=current.minute - current.minute % 10,
)
start_time = now - datetime.timedelta(minutes=170)

print(f"Time from {start_time} to {now}")

In [None]:
# One timestamp every 10 minutes from start_time up to now.
# 'min' is the minute frequency alias; the 'T' alias used previously is
# deprecated since pandas 2.2, while 'min' works in old and new versions.
time_interval = 10
time_list = pd.date_range(start_time, now, freq=f'{time_interval}min')
data_df = pd.DataFrame(time_list, columns=['time'])
data_df

In [None]:
# Simulated water-level readings every 15 minutes, starting one hour before
# start_time. 'min' replaces the 'T' alias, which is deprecated since pandas 2.2.
time_interval = 15
time_list = pd.date_range(start_time-datetime.timedelta(hours=1), now, freq=f'{time_interval}min')
water_level = pd.DataFrame(time_list, columns=['time'])
# One random integer level in [10, 20) per timestamp (upper bound exclusive).
water_level['level'] = np.random.randint(10, 20, water_level.shape[0])

In [None]:
water_level

In [None]:
# Left join keeps every 10-minute timestamp; 'level' is NaN wherever no
# 15-minute reading falls on exactly the same timestamp.
wl_data_df = pd.merge(data_df, water_level, how='left', on='time')
wl_data_df

In [None]:
# Fill the missing levels by interpolation (linear by default);
# limit_direction='both' also fills NaNs at the start and end of the column.
# The result is only displayed -- wl_data_df itself is not modified.
wl_data_df.interpolate(limit_direction='both')

In [None]:
# Plot the interpolated level against time.
plt.plot(wl_data_df['time'], wl_data_df['level'].interpolate(limit_direction='both'))