In [1]:
# 导入必要的库
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [2]:
# 读取数据集
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
# 去掉Id特征
train = train.drop(['Id'], axis=1)
test = test.drop(['Id'], axis=1)

In [4]:
# 对于缺失值很多的特征进行处理
train = train.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=1)
test = test.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=1)

# 处理数值特征的缺失值
numerical_features = train.select_dtypes(include=[np.number])
numerical_test_features = test.select_dtypes(include=[np.number])

train = train.fillna(numerical_features.mean())
test = test.fillna(numerical_test_features.mean())

In [5]:
# 对分类特征进行编码
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# 对训练数据集进行准备
X_train = train.drop(['SalePrice'], axis=1).values
y_train = train['SalePrice'].values.reshape(-1, 1)

# 训练集和验证集的划分
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [6]:
# 将数据集转换为序列
def create_sequences(data, seq_length):
    X = []
    y = []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

In [7]:
seq_length = 50

X_train_seq, y_train_seq = create_sequences(X_train, seq_length)
X_val_seq, y_val_seq = create_sequences(X_val, seq_length)

# 将数据转化为 PyTorch 张量
X_train_seq_tensor = torch.from_numpy(X_train_seq).type(torch.Tensor)
X_val_seq_tensor = torch.from_numpy(X_val_seq).type(torch.Tensor)
y_train_seq_tensor = torch.from_numpy(y_train_seq).type(torch.Tensor)
y_val_seq_tensor = torch.from_numpy(y_val_seq).type(torch.Tensor)

In [8]:
# 构建 LSTM 模型
class LSTMModel(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().to(device)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :]) 
        return out


In [9]:
# 设置超参数
input_dim = X_train_seq_tensor.shape[-1]
hidden_dim = 256
num_layers = 2
output_dim = 1
learning_rate = 0.001
num_epochs = 100

In [10]:
# 初始化 LSTM 模型并设置 CPU or GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers, output_dim=output_dim).to(device)

# 定义损失函数和优化器
criterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
# 训练 LSTM 模型
train_loss_seq = []
val_loss_seq = []

In [12]:
for epoch in range(num_epochs):
    inputs = X_train_seq_tensor.to(device)
    labels = y_train_seq_tensor.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    train_loss_seq.append(loss.item())
    
    val_inputs = X_val_seq_tensor.to(device)
    val_labels = y_val_seq_tensor.to(device)
    val_outputs = model(val_inputs)
    val_loss = criterion(val_outputs, val_labels)
    val_loss_seq.append(val_loss.item())

    if epoch % 10 == 0:
        print("Epoch: %d, Train loss: %1.5f, Validation loss: %1.5f" % (epoch, loss.item(), val_loss.item()))


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0, Train loss: 947345.37500, Validation loss: 634614.06250
Epoch: 10, Train loss: 946070.43750, Validation loss: 633345.62500
Epoch: 20, Train loss: 945334.43750, Validation loss: 632679.50000
Epoch: 30, Train loss: 944855.62500, Validation loss: 632217.62500
Epoch: 40, Train loss: 944423.75000, Validation loss: 631799.06250
Epoch: 50, Train loss: 944022.31250, Validation loss: 631410.00000
Epoch: 60, Train loss: 943648.68750, Validation loss: 631048.31250
Epoch: 70, Train loss: 943300.68750, Validation loss: 630711.81250
Epoch: 80, Train loss: 942976.00000, Validation loss: 630397.68750
Epoch: 90, Train loss: 942672.68750, Validation loss: 630104.93750


In [13]:
# 使用训练好的模型进行预测
test_features = test.values

In [14]:
# 处理测试集的缺失值
test_numerical_features = test.select_dtypes(include=[np.number])
test = test.fillna(test_numerical_features.mean())
test_categorical_features = test.select_dtypes(include=[np.object])
test = test.fillna(test_categorical_features.mode().iloc[0])

# 对测试集进行转换
test_seq_features, test_seq_labels = create_sequences(test, np.zeros((len(test), 1)))
test_seq_features_tensor = torch.from_numpy(test_seq_features).type(torch.Tensor).to(device)


  test_categorical_features = test.select_dtypes(include=[np.object])


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations