In [None]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

In [None]:
def create_dataset(csv_path="data/house_prices.csv", test_size=0.2, random_state=42):
    """Load the house-price table and build train/test tensor datasets.

    The "Id" column is dropped, "SalePrice" is used as the target, and the
    remaining columns are preprocessed: non-object columns are mean-imputed
    and standardized, object columns have missing values replaced by the
    string "NaN" and are then one-hot encoded. The preprocessor is fitted on
    the training split only and then applied to the test split.

    Args:
        csv_path: Path of the CSV file to read.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        A tuple ``(train_dataset, test_dataset, feature_count)`` where both
        datasets are ``TensorDataset`` objects of float32 (features, target)
        tensors and ``feature_count`` is the number of columns after
        preprocessing.
    """
    # Read the data and drop the irrelevant identifier column (out of place,
    # so no frame is mutated behind the reader's back).
    data = pd.read_csv(csv_path).drop(columns=["Id"])
    # Split into features and target.
    X = data.drop(columns=["SalePrice"])
    y = data["SalePrice"]
    # Numerical vs. categorical columns are told apart by dtype.
    numerical_features = X.select_dtypes(exclude="object").columns
    categorical_features = X.select_dtypes(include="object").columns
    # Train/test split.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Numerical features: fill missing values with the mean, then standardize.
    numerical_transformer = Pipeline(
        steps=[
            ("fillna", SimpleImputer(strategy="mean")),
            ("std", StandardScaler()),
        ]
    )
    # Categorical features: replace missing values with the string "NaN",
    # then one-hot encode (categories unseen during fit encode as all zeros).
    categorical_transformer = Pipeline(
        steps=[
            ("fillna", SimpleImputer(strategy="constant", fill_value="NaN")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    # sparse_threshold=0 makes the combined output always dense. With the
    # default threshold the result is sparse only when its density is below
    # 0.3, so an unconditional .toarray() breaks as soon as the data is
    # denser than that.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_features),
            ("cat", categorical_transformer, categorical_features),
        ],
        sparse_threshold=0,
    )
    # Fit on the training split only; reuse the fitted statistics for test.
    x_train = preprocessor.fit_transform(x_train)
    x_test = preprocessor.transform(x_test)
    # Build the tensor datasets.
    train_dataset = TensorDataset(
        torch.as_tensor(x_train, dtype=torch.float32),
        torch.as_tensor(y_train.values, dtype=torch.float32),
    )
    test_dataset = TensorDataset(
        torch.as_tensor(x_test, dtype=torch.float32),
        torch.as_tensor(y_test.values, dtype=torch.float32),
    )
    # Return the training set, the test set and the number of features.
    return train_dataset, test_dataset, x_train.shape[1]

In [None]:
class HousePriceModel(nn.Module):
    """House-price regression network.

    Layers, in order: Linear(input_num, 128) -> BatchNorm1d(128) -> ReLU ->
    dropout (p=0.2) -> Linear(128, 1).

    Args:
        input_num: Number of input features per sample.
    """

    def __init__(self, input_num):
        super().__init__()
        self.linear1 = nn.Linear(input_num, 128)
        self.batchnorm1d = nn.BatchNorm1d(128)
        self.linear2 = nn.Linear(128, 1)

    def forward(self, input):
        """Map a (batch, input_num) feature tensor to a (batch, 1) prediction."""
        output = self.linear1(input)
        output = self.batchnorm1d(output)
        output = torch.relu(output)
        # Dropout must follow the module's mode: active under .train(),
        # a no-op under .eval(). Hard-coding it on makes inference random.
        output = nn.functional.dropout(output, p=0.2, training=self.training)
        output = self.linear2(output)
        return output

In [None]:
# For house prices we care about relative error rather than absolute error,
# so the root-mean-square error of the logarithms is used as the loss function.
def log_rmse(pred, target):
    """Root-mean-square error between log(pred) and log(target).

    Args:
        pred: Predicted values; size-1 dimensions are removed (e.g. a
            (batch, 1) model output becomes (batch,)). The caller's tensor
            is not modified.
        target: Ground-truth values. They are passed to ``torch.log``
            unchanged, so they are expected to be positive.

    Returns:
        A 0-dimensional tensor holding the loss.
    """
    # Out-of-place squeeze: the in-place variant would reshape the caller's
    # tensor as a side effect.
    pred = pred.squeeze()
    # A full squeeze can drop too much (a single-sample batch becomes 0-d);
    # realign with the target whenever the element counts agree.
    if pred.shape != target.shape and pred.numel() == target.numel():
        pred = pred.reshape(target.shape)
    # Keep predictions in [1, +inf) so the logarithm is defined and >= 0.
    pred = torch.clamp(pred, min=1)
    return torch.sqrt(nn.functional.mse_loss(torch.log(pred), torch.log(target)))


# Per-epoch training loss history; train() appends one value per epoch.
loss_list = []


def train(model, train_dataset, loss_list, epochs=200, batch_size=64, lr=1e-1, loss_fn=None):
    """Train ``model`` on ``train_dataset`` with Adam.

    Args:
        model: The ``nn.Module`` to optimize; it is moved to CUDA when
            available, otherwise to the CPU.
        train_dataset: Dataset yielding ``(features, target)`` pairs.
        loss_list: List that receives one loss value per epoch (mutated).
        epochs: Number of passes over the dataset.
        batch_size: Mini-batch size of the shuffled loader.
        lr: Adam learning rate.
        loss_fn: Callable ``(pred, target) -> scalar tensor``; defaults to
            ``log_rmse``.

    Returns:
        The trained model (on the device used for training).

    Raises:
        ValueError: If the loader yields no samples.
    """
    if loss_fn is None:
        loss_fn = log_rmse
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Make sure BatchNorm/dropout layers are in training mode even if the
    # caller handed over a model that was previously switched to eval().
    model.train()
    # Prepare the data.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Train the model.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        loss_sum, sample_count = 0.0, 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss_value = loss_fn(model(x), y)
            loss_value.backward()
            optimizer.step()
            # Weight each batch by its size so a short final batch does not
            # dominate (or distort) the epoch's reported loss.
            loss_sum += loss_value.item() * len(y)
            sample_count += len(y)
        if sample_count == 0:
            raise ValueError("train_dataset is empty")
        # Record the sample-weighted mean of the batch losses, not just the
        # loss of whichever batch happened to come last.
        epoch_loss = loss_sum / sample_count
        loss_list.append(epoch_loss)
        if epoch % 20 == 0:
            print("epoch: {}, loss: {}".format(epoch, epoch_loss))
    return model


train_dataset, test_dataset, input_num = create_dataset()
model = HousePriceModel(input_num)
model = train(model, train_dataset, loss_list)
# Find out which device the model ended up on.
device = next(model.parameters()).device

# Training loss curve, one point per epoch.
fig, ax = plt.subplots()
ax.plot(loss_list)
ax.set_xlabel("epoch")
ax.set_ylabel("training log-RMSE")

# Evaluate in inference mode: eval() makes BatchNorm use its running
# statistics instead of the test batch's (and stops it from updating them
# with test data); no_grad() avoids building an autograd graph.
model.eval()
with torch.no_grad():
    test_features, test_targets = (t.to(device) for t in test_dataset.tensors)
    test_loss = log_rmse(model(test_features), test_targets)
print(test_loss)