# Homework 1: Linear Regression
本次实验的目标： 由前9个小时的18个特征预测第10个小时的PM2.5
[参考](https://colab.research.google.com/drive/131sSqmrmWXfjFZ3jWSELl8cm0Ox5ah3C#scrollTo=NzvXP5Jya64j)

In [None]:
import sys
import pandas as pd
import numpy as np
from  tqdm import tqdm

## Preprocessing 数据预处理
取需要的数值部分，并将'RAINFALL'中取值为'NR'的项全部设置为0

In [None]:
# Read the training table from ./train.csv, decoding the file as Big5.
data = pd.read_csv('./train.csv', encoding='big5')

In [None]:
# Show the first rows of the loaded table as a quick sanity check.
data.head()

In [None]:
# Keep only the measurement columns: everything from the fourth column on.
data = data.iloc[:, 3:]
# Cells holding the string 'NR' are overwritten with 0 so that every entry
# can later be stored in a float array.
is_nr = data == 'NR'
data[is_nr] = 0
# Plain NumPy view of the cleaned table, used by the feature extraction.
raw_data = data.to_numpy()

## Extract Features 特征提取 - 1 
将原始的4320\*24的资料（12个月 × 20天 × 18个特征 = 4320行，每行为24个小时的数值）按月份重组为12个18\*480的数组（12\*18\*480），其中18是feature数，480 = 20\*24 是每个月的小时数

In [None]:
# month_data[m] is an (18, 480) array: 18 feature rows by 20 days * 24 hours.
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):  # only 20 days are used per month
        # Each day occupies 18 consecutive rows of raw_data (one per feature)
        # and 24 consecutive columns of the monthly block.
        first_row = 18 * (20 * month + day)
        first_hour = day * 24
        sample[:, first_hour:first_hour + 24] = raw_data[first_row:first_row + 18, :]
    month_data[month] = sample

## Extract Features 特征提取 - 2
每个月有480小时，每9个小时为一个data，每个月会有471个data，故总资料数为471\*12笔，而每笔数据有9\*18的feature(一个小时18个feature)

对应的target则有471 * 12 个 （第10个小时的PM2.5）

In [None]:
# Build the training samples with a sliding window over each month.
#   x: one row per window, holding 18 features * 9 consecutive hours.
#   y: feature row 9 at the hour right after the window (the PM2.5 target,
#      per the notebook text).
x = np.empty([12 * 471, 18 * 9], dtype=float)
y = np.empty([12 * 471, 1], dtype=float)

for month in range(12):
    # A month has 480 hours; a window needs 9 input hours plus 1 target hour,
    # so the valid window starts are 0..470 (471 windows). This is the same
    # set the original day/hour loops produced with their
    # "day == 19 and hour > 14" guard.
    for start in range(471):
        row = month * 471 + start
        x[row, :] = month_data[month][:, start:start + 9].reshape(1, -1)
        # The target must be written to the same row as its window. The
        # previous index (month * 471 + day + hour) sent targets to the wrong
        # rows and left most of y as uninitialised np.empty memory.
        y[row, 0] = month_data[month][9, start + 9]


## Normalize 归一化

In [None]:
# Standardise every feature column of x in place: (value - mean) / std.
# mean_x and std_x are kept because the test data is scaled with the same
# statistics later on.
mean_x = np.mean(x, axis=0)
std_x = np.std(x, axis=0)
# Columns with zero standard deviation are left untouched to avoid dividing
# by zero. The boolean mask replaces the element-by-element Python loops
# with one vectorized expression that performs the same arithmetic.
nonconstant = std_x != 0
x[:, nonconstant] = (x[:, nonconstant] - mean_x[nonconstant]) / std_x[nonconstant]

上面的数据处理部分已懵逼！ 需要再看看咋做的 并想为啥这样操作

## Split Training Data Into "train_set" and "validation_set"
生成训练用的数据集train_set和验证用的数据集validation_set



In [None]:
import math

# The first 80 % of the samples (rounded down) form the training set and the
# remaining 20 % the validation set. The boundary is computed once and shared
# by all four slices so x and y can never be cut at different rows; x and y
# are allocated with the same number of rows.
split_index = math.floor(len(x) * 0.8)
x_train_set = x[:split_index, :]
y_train_set = y[:split_index, :]
x_validation = x[split_index:, :]
y_validation = y[split_index:, :]

## Training
使用Adagrad算法作为优化器


## Testing
导入test data，并以与训练集相同的方式预处理，得到240笔维度为18\*9+1的测试数据

In [None]:
# Linear regression on the full data set, optimised with Adagrad.
# One extra dimension is needed for the constant (bias) term.
dim = 18 * 9 + 1
w = np.zeros([dim, 1])
# Prepend a column of ones to x so that w[0] acts as the bias.
x = np.concatenate((np.ones([12 * 471, 1]), x), axis=1).astype(float)

learning_rate = 200
iter_time = 1_000_000
# Running sum of squared gradients, one entry per weight.
adagrad = np.zeros([dim, 1])
# Small constant that keeps the Adagrad denominator away from zero.
eps = 1e-9
last_loss = 0
for t in tqdm(range(iter_time)):
    residual = np.dot(x, w) - y
    # Root-mean-square error over all samples.
    loss = np.sqrt(np.sum(np.power(residual, 2)) / len(x))
    last_loss = loss
    # x.T @ residual / loss is the RMSE gradient multiplied by len(x).
    gradient = np.dot(x.T, residual) / loss
    adagrad += gradient ** 2
    # Adagrad step: each weight's step is scaled by its gradient history.
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
    if t == 1:
        print("Loss:" + str(loss))

print("\n" + "Loss:" + str(last_loss))
# Persist the learned weights.
np.save('weight.npy', w)

In [None]:
# Linear regression on the training split only, optimised with Adagrad.
# One extra dimension is needed for the bias term.
dim = 18 * 9 + 1
# w has shape (163, 1).
w = np.zeros([dim, 1])
# Prepend a column of ones so that w[0] acts as the bias; x_train_set becomes
# (number of training rows, 163).
x_train_set = np.concatenate((np.ones([len(x_train_set), 1]), x_train_set), axis=1).astype(float)
learning_rate = 10
iter_time = 30000
# Running sum of squared gradients, one entry per weight.
adagrad = np.zeros([dim, 1])
# Small constant that keeps the Adagrad denominator away from zero.
eps = 0.0000000001
for t in range(iter_time):
    residual = np.dot(x_train_set, w) - y_train_set
    # Root-mean-square error on the training split.
    loss = np.sqrt(np.sum(np.power(residual, 2)) / len(x_train_set))
    if t % 100 == 0:
        print("迭代的次数：%i ， 损失值：%f"%(t,loss))
    # The update must run on every iteration. It used to be indented under
    # the logging branch above, so the weights changed only once per 100
    # iterations.
    gradient = np.dot(x_train_set.T, residual) / (loss * len(x_train_set))
    adagrad += gradient ** 2
    w = w - learning_rate * gradient / np.sqrt(adagrad + eps)
# Persist the learned weights.
np.save('weight.npy', w)

In [None]:
# loss = np.sqrt(np.sum(np.power(np.dot(x,w)-y,2))/len(x_train_set))

# gradient = (np.dot(x.T,np.dot(x,w)-y))/(loss*len(x))
# gradient = 2 * np.dot(x.transpose(), np.dot(x, w) - y)

In [None]:
# Load the test table (no header row, decoded as Big5) and shape it like the
# training features: 240 samples of 18 features * 9 hours each.
testdata = pd.read_csv('./test.csv', header=None, encoding='big5')
# Work on an explicit copy: assigning through a mask on an iloc slice of a
# frame that is still referenced can trigger pandas' SettingWithCopyWarning.
test_data = testdata.iloc[:, 2:].copy()
test_data[test_data == 'NR'] = 0
test_data = test_data.to_numpy()
test_x = np.empty([240, 18 * 9], dtype=float)
for i in range(240):
    # Each sample occupies 18 consecutive rows (one per feature).
    test_x[i, :] = test_data[18 * i:18 * (i + 1), :].reshape(1, -1)

# Scale with the training statistics (mean_x, std_x); columns whose training
# standard deviation is zero are left unchanged.
nonconstant = std_x != 0
test_x[:, nonconstant] = (test_x[:, nonconstant] - mean_x[nonconstant]) / std_x[nonconstant]
# Prepend the column of ones that pairs with the bias weight.
test_x = np.concatenate((np.ones([240, 1]), test_x), axis=1).astype(float)

## Prediction 



In [None]:
# Reload the weights written by the training step.
w = np.load('weight.npy')
# Linear prediction: one value per test sample.
ans_y = test_x.dot(w)

## Save Prediction to CSV File

In [None]:
import csv

# Write the 240 predictions to prediction.csv as (id, value) rows.
with open('prediction.csv', mode='w', newline='') as prediction_file:
    csv_writer = csv.writer(prediction_file)
    header = ['id', 'value']
    csv_writer.writerow(header)
    # Ids follow the pattern id_0 ... id_239; ans_y[i][0] is the scalar value.
    csv_writer.writerows(['id_' + str(i), ans_y[i][0]] for i in range(240))

## 总结
结果好像不是很好哎，因为预测的pm2.5的值竟然有负数！！ 这是不可能的啊
如何优化呢？可以选择不同的优化器，或是使用不同的model（如LSTM）

In [None]:
# Number of predictions below zero.
count = int(np.count_nonzero(ans_y < 0))

# Fraction of predictions that are negative.
count / len(ans_y)