导入模块

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

读取并处理数据

In [2]:
# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Preprocess data
def preprocess_data(data):
    data = data.select_dtypes(include=[np.number]).interpolate().dropna()
    return data

train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

X = train_data.drop(['Id', 'SalePrice'], axis=1)
X_test = test_data.drop('Id', axis=1)
y = train_data['SalePrice']

# Standardize data
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)

定义网络

In [3]:
# Define the model
class HousePriceModel(nn.Module):
    def __init__(self, input_dim):
        super(HousePriceModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

input_dim = X.shape[1]

# Define loss function
criterion = nn.MSELoss()

K折交叉验证

In [4]:
# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    model = HousePriceModel(input_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.005)
    
    # Train the model
    epochs = 100
    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
        
    model.eval()
    outputs = model(X_train)
    loss = criterion(outputs, y_train) / len(y_train)
    val_outputs = model(X_val)
    val_loss = criterion(val_outputs, y_val) / len(y_val)
    
    print(f'Fold {fold}, Train loss: {loss.item():.4f}, Val loss: {val_loss.item():.4f}')
    fold += 1

Fold 1, Train loss: 660216.8125, Val loss: 3923631.0000
Fold 2, Train loss: 615813.5000, Val loss: 4717982.0000
Fold 3, Train loss: 381977.7812, Val loss: 15853975.0000
Fold 4, Train loss: 723709.8125, Val loss: 3134548.0000
Fold 5, Train loss: 720875.0000, Val loss: 1882500.6250


使用完整的训练数据集来重新训练模型，并进行预测。

In [5]:
train_dataset = TensorDataset(X, y)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
print(len(train_loader))
# Train the model
model = HousePriceModel(input_dim)
optimizer = optim.Adam(model.parameters(), lr=0.005)
epochs = 100
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        model.eval()
        outputs = model(X)
        loss = criterion(outputs, y) / len(y)
        print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}')

# Make predictions on the test set
model.eval()
predictions = model(X_test).detach().numpy()

# Save predictions
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': predictions.flatten()})
submission.to_csv('submission.csv', index=False)

92
Epoch 10, Loss: 1697958.0000
Epoch 20, Loss: 906945.7500
Epoch 30, Loss: 707725.4375
Epoch 40, Loss: 611941.6875
Epoch 50, Loss: 560294.6875
Epoch 60, Loss: 524649.0625
Epoch 70, Loss: 502509.9375
Epoch 80, Loss: 484545.3438
Epoch 90, Loss: 474134.0938
Epoch 100, Loss: 456421.1250
