In [1]:
import numpy as np
import pandas as pd
import csv


In [31]:
# 加载训练数据
train_data = pd.read_csv('train.csv', encoding='Big5')
test_data = pd.read_csv('test.csv', encoding='Big5')

# 数据处理：我们将提取 PM2.5 列以及前 9 小时作为特征
# 训练数据中每个月20天，每天24小时，共有12个月
def process_train_data(train_data):
    data = train_data.iloc[:, 3:].to_numpy()  # 从第4列开始取观测数据
    data[data == 'NR'] = 0  # 将 'NR' 替换为 0
    data = data.astype(float)  # 将数据类型转为浮点数
    
    month_data = {}
    for month in range(12):
        month_data[month] = np.empty((18, 480))  # 每月 20 天 * 24 小时 = 480 小时
        for day in range(20):
            month_data[month][:, day * 24: (day + 1) * 24] = data[18 * (20 * month + day): 18 * (20 * month + day + 1), :]
    
    x_train = []
    y_train = []
    
    for month in range(12):
        for i in range(471):  # 每个月有 471 个样本，每10小时取一次样本
            x_train.append(month_data[month][:, i:i + 9].reshape(-1))
            y_train.append(month_data[month][9, i + 9])  # 第10小时的PM2.5作为y值
    
    x_train = np.array(x_train)  # (5652, 162)
    y_train = np.array(y_train)  # (5652,)
    
    return x_train, y_train


UnicodeDecodeError: 'big5' codec can't decode byte 0xc8 in position 0: illegal multibyte sequence

In [None]:
# 加载测试数据并处理
def process_test_data(test_data):
    test_x = []
    for i in range(240):  # 总共有240笔测试数据
        test_x.append(test_data.iloc[18 * i: 18 * (i + 1), 2:].to_numpy().reshape(-1))
    test_x = np.array(test_x)
    return test_x



In [None]:
# 特征归一化
def normalize(x):
    mean_x = np.mean(x, axis=0)  # 按列计算均值
    std_x = np.std(x, axis=0)    # 按列计算标准差
    for i in range(len(x)):
        x[i] = (x[i] - mean_x) / std_x  # 标准化
    return x, mean_x, std_x


In [None]:
# Adagrad算法实现线性回归
def train(x_train, y_train, learning_rate=0.01, iterations=10000):
    dim = x_train.shape[1] + 1  # 加1是为了bias
    w = np.zeros(dim)  # 初始化权重
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)  # 加一列bias

    adagrad = np.zeros(dim)  # Adagrad的累积梯度
    eps = 1e-8  # 平滑项
    for t in range(iterations):
        loss = np.dot(x_train, w) - y_train  # 预测误差
        gradient = 2 * np.dot(x_train.T, loss) / x_train.shape[0]  # 计算梯度
        adagrad += gradient**2  # 累加梯度平方
        w -= learning_rate * gradient / np.sqrt(adagrad + eps)  # 更新权重
        
        if t % 1000 == 0:
            cost = np.sqrt(np.mean(loss**2))  # RMSE
            print(f"Iteration {t}: Cost = {cost}")
    
    return w

# 模型预测
def predict(x_test, w, mean_x, std_x):
    x_test = (x_test - mean_x) / std_x  # 标准化
    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)  # 加一列bias
    return np.dot(x_test, w)


In [None]:
# 主函数
x_train, y_train = process_train_data(train_data)
x_train, mean_x, std_x = normalize(x_train)

# 训练模型
w = train(x_train, y_train)

# 处理测试数据并预测
x_test = process_test_data(test_data)
y_test_pred = predict(x_test, w, mean_x, std_x)

# 保存结果
with open('/mnt/data/predict.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'value'])
    for i in range(len(y_test_pred)):
        writer.writerow([f'id_{i}', y_test_pred[i]])

print("预测完成，结果已保存为 predict.csv 文件。")