## 数据准备

In [18]:
import pandas as pd
import numpy as np
import torch

In [19]:
# Locations of the training and test CSV files (relative to the working directory).
train_file_path = 'data/train.csv'
test_file_path = 'data/test.csv'

# Read both splits into DataFrames, training split first.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

## 数据预处理

In [20]:
# Stack the feature columns of both splits so they are preprocessed together.
# The first column is skipped in both splits and the last column of the training
# split is skipped as well (presumably the Id and the SalePrice target).
train_features = train_data.iloc[:, 1:-1]
test_features = test_data.iloc[:, 1:]
all_features = pd.concat([train_features, test_features])

print(train_data.shape, test_data.shape, all_features.shape)

(1460, 81) (1459, 80) (2919, 79)


In [21]:
# Names of the numeric columns.
# `select_dtypes(include='number')` is used instead of comparing the dtype with
# 'object': text columns are not guaranteed to be stored as `object` (pandas also
# has a dedicated string dtype), and such columns must not be treated as numeric.
numeric_features = all_features.select_dtypes(include='number').columns

print(numeric_features, len(numeric_features))

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object') 36


In [22]:
# Standardize the numeric features: every column is centred on its mean and
# divided by its standard deviation.
def _standardize(column):
    """Return ``column`` minus its mean, divided by its standard deviation."""
    return (column - column.mean()) / (column.std())


# After scaling, missing values are replaced by 0.
all_features[numeric_features] = (
    all_features[numeric_features].apply(_standardize).fillna(0)
)

In [None]:
# One-hot encode the remaining non-numeric columns. dummy_na=True adds an
# indicator column for missing values; the result is cast to float32 so it can
# be turned into float32 tensors afterwards.
all_features = (
    pd.get_dummies(data=all_features, dummy_na=True)
    .astype('float32')
)
print(all_features.shape)


(2919, 330)


In [48]:
# Split the encoded feature matrix back into its training and test rows
# (training rows come first) and convert both parts to float32 tensors.
n_train = len(train_data)
train_rows = all_features.iloc[:n_train].values
test_rows = all_features.iloc[n_train:].values
X_train = torch.tensor(train_rows, dtype=torch.float32)
X_test = torch.tensor(test_rows, dtype=torch.float32)

# The target is the natural log of SalePrice, shaped as a column vector.
sale_prices = torch.tensor(train_data['SalePrice'].values, dtype=torch.float32)
y_train = torch.log(sale_prices.view(-1, 1))

print(X_train[:5], y_train[:5])
print(X_train.shape, y_train.shape, X_test.shape)

tensor([[ 0.0673, -0.1844, -0.2178,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8735,  0.4581, -0.0720,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.0673, -0.0559,  0.1372,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.3025, -0.3986, -0.0784,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0673,  0.6294,  0.5188,  ...,  1.0000,  0.0000,  0.0000]]) tensor([[12.2477],
        [12.1090],
        [12.3172],
        [11.8494],
        [12.4292]])
torch.Size([1460, 330]) torch.Size([1460, 1]) torch.Size([1459, 330])


## 建立 torch 的 Dataset 和 DataLoader

In [49]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [50]:
class TrainDataset(Dataset):
    """Map-style dataset that pairs each feature row with its target."""

    def __init__(self, X, y):
        """Keep references to the features ``X`` and targets ``y``.

        Both must have the same length; this is checked with an ``assert``.
        """
        n_rows, n_targets = len(X), len(y)
        assert n_rows == n_targets
        self.X = X
        self.y = y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

In [51]:
# Wrap the training tensors and serve them in shuffled mini-batches.
batch_size = 64
train_dataset = TrainDataset(X_train, y_train)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Sanity check: draw a single batch and show the shapes it yields.
X, y = next(iter(train_dataloader))
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: torch.Size([64, 330])
Shape of y: torch.Size([64, 1])


## 建立模型

In [62]:
# Pick the compute backend in order of preference: CUDA GPU, Apple MPS, CPU.
# The MPS check goes through `torch.backends.mps.is_available()`, the documented
# availability query; `torch.mps.is_available` is not present in every PyTorch
# release that supports MPS and would raise AttributeError there.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using {device} device")

class NeuralNetwork(nn.Module):
    """Fully connected regression network: in_features -> 512 -> 1024 -> 256 -> 64 -> 1.

    Every hidden linear layer is followed by a ReLU; the final layer outputs a
    single unactivated value per sample.
    """

    def __init__(self, in_features=None):
        """Build the layer stack.

        Args:
            in_features: Width of the input vectors. When omitted, it falls back
                to the number of columns of the notebook-level ``all_features``
                frame, which is what the original zero-argument constructor used.
        """
        super().__init__()
        if in_features is None:
            # Backward-compatible default: read the width from the notebook global.
            in_features = len(all_features.columns)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Map a batch of feature rows to one predicted value per row."""
        price = self.linear_relu_stack(x)
        return price
    
# Build the network, then move its parameters to the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

Using cpu device
NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=330, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=1024, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=64, bias=True)
    (7): ReLU()
    (8): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [63]:
# Mean-squared-error loss, minimised with Adam at a learning rate of 1e-3.
learning_rate = 1e-3
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [64]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        dataloader: Iterable of ``(features, targets)`` batches exposing a
            ``dataset`` attribute whose length is the total sample count.
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable taking ``(predictions, targets)`` and returning the loss.
        optimizer: Optimizer holding the parameters of ``model``.

    Batches are moved to the notebook-level ``device`` before the forward pass.
    The loss is printed for every 100th batch, starting with the first one.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this batch.
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass: clear stale gradients, backpropagate, update parameters.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if batch_idx % 100 != 0:
            continue
        loss = batch_loss.item()
        current = (batch_idx + 1) * len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [65]:
# Train for a fixed number of full passes over the training data.
epochs = 100
for t in range(epochs):
    epoch_banner = f"Epoch {t+1}\n-------------------------------"
    print(epoch_banner)
    train(
        dataloader=train_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
print("Done!")

Epoch 1
-------------------------------
loss: 146.960281  [   64/ 1460]
Epoch 2
-------------------------------
loss: 1.164440  [   64/ 1460]
Epoch 3
-------------------------------
loss: 0.686973  [   64/ 1460]
Epoch 4
-------------------------------
loss: 0.065881  [   64/ 1460]
Epoch 5
-------------------------------
loss: 0.035463  [   64/ 1460]
Epoch 6
-------------------------------
loss: 0.041017  [   64/ 1460]
Epoch 7
-------------------------------
loss: 0.027945  [   64/ 1460]
Epoch 8
-------------------------------
loss: 0.030017  [   64/ 1460]
Epoch 9
-------------------------------
loss: 0.022869  [   64/ 1460]
Epoch 10
-------------------------------
loss: 0.021360  [   64/ 1460]
Epoch 11
-------------------------------
loss: 0.016145  [   64/ 1460]
Epoch 12
-------------------------------
loss: 0.018803  [   64/ 1460]
Epoch 13
-------------------------------
loss: 0.009690  [   64/ 1460]
Epoch 14
-------------------------------
loss: 0.013684  [   64/ 1460]
Epoch 15
----

In [66]:
# Spot-check the fit on the first training rows (up to ten). Both printed values
# are on the log scale, because y_train holds log(SalePrice).
model.eval()
with torch.no_grad():
    # The model lives on `device`, so the inputs have to be moved there too;
    # otherwise the forward pass fails on cuda/mps with a device mismatch.
    # The predictions are brought back to the CPU for printing.
    train_pred = model(X_train[:10].to(device)).cpu()
    for i in range(len(train_pred)):
        print(f"Predicted: {train_pred[i].item()}, Actual: {y_train[i].item()}")


Predicted: 12.289767265319824, Actual: 12.24769401550293
Predicted: 12.160289764404297, Actual: 12.109010696411133
Predicted: 12.259357452392578, Actual: 12.317166328430176
Predicted: 11.930134773254395, Actual: 11.849397659301758
Predicted: 12.460906982421875, Actual: 12.429216384887695
Predicted: 11.758111953735352, Actual: 11.870599746704102
Predicted: 12.620908737182617, Actual: 12.634603500366211
Predicted: 12.223823547363281, Actual: 12.206072807312012
Predicted: 11.797599792480469, Actual: 11.774519920349121
Predicted: 11.706730842590332, Actual: 11.67844009399414


In [67]:
# Predict the test set and write the submission file.
model.eval()
with torch.no_grad():
    # Inputs must be on the same device as the model (fails on cuda/mps otherwise).
    # The network was fitted to log(SalePrice), so exp() maps its output back
    # to prices.
    test_pred = torch.exp(model(X_test.to(device)))

# Move predictions to the CPU and flatten the (n, 1) column into a 1-D array.
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_pred.cpu().numpy().flatten()})
submission.to_csv('submission.csv', index=False)