## 数据准备

In [18]:
import pandas as pd
import numpy as np
import torch

In [19]:
# Locations of the training and test CSV files, relative to the notebook.
train_file_path, test_file_path = 'data/train.csv', 'data/test.csv'

# Read both splits into DataFrames in one pass.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

## 数据预处理

In [20]:
# Stack the feature columns of both splits so they are preprocessed together.
# The first column is dropped from each frame and the last column is also
# dropped from the training frame (presumably the Id and the SalePrice
# target -- confirm against the CSV header).
train_features = train_data.iloc[:, 1:-1]
test_features = test_data.iloc[:, 1:]
all_features = pd.concat([train_features, test_features])

print(train_data.shape, test_data.shape, all_features.shape)

(1460, 81) (1459, 80) (2919, 79)


In [21]:
# Select numeric columns by a positive dtype match. The previous
# `dtypes != 'object'` test treated every non-object dtype as numeric, so
# string-dtype, categorical, boolean or datetime columns would have been
# sent to the mean/std standardisation step.
numeric_features = all_features.select_dtypes(include='number').columns

print(numeric_features, len(numeric_features))

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object') 36


In [22]:
# Standardise the numeric features: subtract each column's mean and divide by
# its standard deviation (statistics are taken over the combined train + test
# rows). Values that are still missing afterwards are replaced with 0.
standardized = all_features[numeric_features].apply(
    lambda column: (column - column.mean()) / column.std()
)
all_features[numeric_features] = standardized.fillna(0)

In [None]:
# One-hot encode the remaining columns; dummy_na=True adds an indicator
# column for missing values. Everything is then cast to float32.
one_hot = pd.get_dummies(all_features, dummy_na=True)
all_features = one_hot.astype('float32')
print(all_features.shape)


(2919, 330)


In [48]:
# The first n_train rows of all_features come from the training CSV, the
# remainder from the test CSV (that is the order they were concatenated in).
n_train = len(train_data)
feature_matrix = all_features.to_numpy()
X_train = torch.tensor(feature_matrix[:n_train], dtype=torch.float32)
X_test = torch.tensor(feature_matrix[n_train:], dtype=torch.float32)

# Targets become a column vector of log sale prices.
sale_price = torch.tensor(train_data['SalePrice'].to_numpy(), dtype=torch.float32)
y_train = torch.log(sale_price.reshape(-1, 1))

print(X_train[:5], y_train[:5])
print(X_train.shape, y_train.shape, X_test.shape)

tensor([[ 0.0673, -0.1844, -0.2178,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8735,  0.4581, -0.0720,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.0673, -0.0559,  0.1372,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.3025, -0.3986, -0.0784,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0673,  0.6294,  0.5188,  ...,  1.0000,  0.0000,  0.0000]]) tensor([[12.2477],
        [12.1090],
        [12.3172],
        [11.8494],
        [12.4292]])
torch.Size([1460, 330]) torch.Size([1460, 1]) torch.Size([1459, 330])


## 建立 torch 的 Dataset 和 DataLoader

In [49]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [50]:
class TrainDataset(Dataset):
    """Map-style dataset that pairs each feature row with its target."""

    def __init__(self, X, y):
        """Keep references to the parallel containers ``X`` and ``y``.

        An assertion fails if the two containers differ in length.
        """
        assert len(X) == len(y)
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples, i.e. the length of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [51]:
# Wrap the training tensors in a shuffled mini-batch loader.
batch_size = 64
train_dataset = TrainDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Sanity check: report the shapes of the first batch only.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    X, y = first_batch
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")

Shape of X: torch.Size([64, 330])
Shape of y: torch.Size([64, 1])


## 建立模型

In [93]:
# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
# The MPS check goes through torch.backends.mps, the long-standing documented
# availability API; torch.mps.is_available() only exists in recent PyTorch
# releases and raises AttributeError on older ones.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using {device} device")

class MLP(nn.Module):
    """Fully connected regressor: input -> 512 -> 64 -> 1 with ReLU in between.

    Args:
        in_features: Width of the input vectors. When omitted, the width is
            taken from the notebook-level ``all_features`` frame (its column
            count), which is what the class always did before this parameter
            existed.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the first layer from the
            # preprocessed feature frame defined earlier in the notebook.
            in_features = len(all_features.columns)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Return one prediction per row of ``x`` (last dimension of size 1)."""
        price = self.linear_relu_stack(x)
        return price

class LinearRegression(nn.Module):
    """Single linear layer mapping the input features to one output value.

    Args:
        in_features: Width of the input vectors. When omitted, the width is
            taken from the notebook-level ``all_features`` frame (its column
            count), which is what the class always did before this parameter
            existed.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the layer from the
            # preprocessed feature frame defined earlier in the notebook.
            in_features = len(all_features.columns)
        self.linear = nn.Linear(in_features, 1)

    def forward(self, x):
        """Return one prediction per row of ``x`` (last dimension of size 1)."""
        price = self.linear(x)
        return price
    
# model = MLP().to(device)
# print(model)

Using cpu device


In [None]:
# loss_fn = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [64]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one pass of gradient updates over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(X, y)`` tensor batches; its
            ``dataset`` attribute must support ``len()`` (used only for the
            progress message).
        model: ``nn.Module`` to optimise. It is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimiser holding ``model``'s parameters.

    Prints the loss and the number of samples processed so far on every
    100th batch, starting with the first one. Returns ``None``.
    """
    model.train()
    size = len(dataloader.dataset)

    # Send each batch to wherever the model's parameters live instead of
    # relying on a notebook-level `device` variable that may not match it.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    seen = 0
    for batch, (X, y) in enumerate(dataloader):
        if target_device is not None:
            X, y = X.to(target_device), y.to(target_device)

        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running count stays correct even when the last batch is short.
        seen += len(X)
        if batch % 100 == 0:
            loss_value = loss.item()
            print(f"loss: {loss_value:>7f}  [{seen:>5d}/{size:>5d}]")

In [None]:
# model.eval()
# with torch.no_grad():
#     test_pred = torch.exp(model(X_test))
#     submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_pred.cpu().numpy().flatten()})
#     submission.to_csv('submission.csv', index=False)

In [81]:
def k_fold_validation(k, X_train, y_train, model, loss_fn, optimizer, epochs):
    """Estimate validation loss with k-fold cross-validation.

    The samples are split into ``k`` contiguous folds. For each fold the model
    is trained for ``epochs`` epochs on the other folds and scored with
    ``loss_fn`` on the held-out fold.

    Before every fold, ``model`` and ``optimizer`` are restored to the state
    they had when this function was called. Without that reset the model kept
    the weights learned in earlier folds, which had been trained on the rows
    that later folds hold out, so the reported validation losses were biased.

    Args:
        k: Number of folds; must satisfy ``2 <= k <= len(X_train)``.
        X_train: Feature tensor, indexed by sample along dimension 0.
        y_train: Target tensor aligned with ``X_train``.
        model: ``nn.Module`` to train and evaluate.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimiser holding ``model``'s parameters.
        epochs: Number of training epochs per fold.

    Returns:
        ``model``, left in eval mode with the weights from the last fold.

    Raises:
        ValueError: If ``k`` is outside ``[2, len(X_train)]``.

    Note:
        The mini-batch size is read from the notebook-level ``batch_size``.
    """
    import copy  # local import: not part of the notebook's import cells

    n = len(X_train)
    if k < 2 or k > n:
        raise ValueError(f"k must be between 2 and the number of samples ({n}), got {k}")
    fold_size = n // k

    # Snapshot the starting point so every fold trains from the same state.
    initial_model_state = copy.deepcopy(model.state_dict())
    initial_optimizer_state = copy.deepcopy(optimizer.state_dict())

    # Validation tensors must be on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    average_loss = 0
    for i in range(k):
        print(f"Fold {i + 1}")
        model.load_state_dict(initial_model_state)
        # Hand the optimiser a fresh copy so it can never alias the snapshot.
        optimizer.load_state_dict(copy.deepcopy(initial_optimizer_state))

        start = i * fold_size
        # The last fold absorbs the n % k leftover rows so each row is
        # validated exactly once.
        stop = n if i == k - 1 else start + fold_size
        X_val, y_val = X_train[start:stop], y_train[start:stop]
        X_train_fold = torch.cat((X_train[:start], X_train[stop:]))
        y_train_fold = torch.cat((y_train[:start], y_train[stop:]))

        train_dataset = TrainDataset(X_train_fold, y_train_fold)
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for t in range(epochs):
            print(f"Epoch {t + 1}\n-------------------------------")
            train(train_dataloader, model, loss_fn, optimizer)
        print("Done!")

        model.eval()
        with torch.no_grad():
            if model_device is not None:
                X_val, y_val = X_val.to(model_device), y_val.to(model_device)
            val_pred = model(X_val)
            loss = loss_fn(val_pred, y_val)
        average_loss += loss.item()
        print(f"Validation loss of fold {i + 1}: {loss.item()}")
    average_loss /= k
    print(f"Average validation loss: {average_loss}")
    return model

In [97]:
def run_and_save(X_train, y_train, model, batch_size, loss_fn, optimizer, epochs, submission_file = 'submission.csv'):
    """Train ``model`` on the full training set and write test predictions to CSV.

    Args:
        X_train: Feature tensor used for training.
        y_train: Target tensor aligned with ``X_train``.
        model: ``nn.Module`` to train.
        batch_size: Mini-batch size for the training loader.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimiser holding ``model``'s parameters.
        epochs: Number of passes over the training data.
        submission_file: Path of the CSV to write.

    The test inputs and ids come from the notebook-level ``X_test`` and
    ``test_data``. Model outputs are passed through ``exp`` before being
    written to the ``SalePrice`` column. Returns ``None``.
    """
    train_dataset = TrainDataset(X_train, y_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
    print("Done!")

    model.eval()
    # X_test is created on the CPU; move it to the model's device so that
    # inference also works when the model lives on cuda/mps.
    first_param = next(model.parameters(), None)
    test_inputs = X_test if first_param is None else X_test.to(first_param.device)
    with torch.no_grad():
        test_pred = torch.exp(model(test_inputs))
    submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_pred.cpu().numpy().flatten()})
    submission.to_csv(submission_file, index=False)

In [96]:
# 2-fold cross-validation of the MLP: MSE loss, Adam (lr=1e-3), 30 epochs per fold.
myMLP = MLP().to(device)
mlp_loss_fn = nn.MSELoss()
mlp_optimizer = torch.optim.Adam(myMLP.parameters(), lr=1e-3)
k_fold_validation(2, X_train, y_train, myMLP, mlp_loss_fn, mlp_optimizer, 30)

Fold 1
Epoch 1
-------------------------------
loss: 144.310913  [   64/  730]
Epoch 2
-------------------------------
loss: 49.860306  [   64/  730]
Epoch 3
-------------------------------
loss: 14.263617  [   64/  730]
Epoch 4
-------------------------------
loss: 4.052580  [   64/  730]
Epoch 5
-------------------------------
loss: 0.641578  [   64/  730]
Epoch 6
-------------------------------
loss: 0.194674  [   64/  730]
Epoch 7
-------------------------------
loss: 0.055496  [   64/  730]
Epoch 8
-------------------------------
loss: 0.040431  [   64/  730]
Epoch 9
-------------------------------
loss: 0.083757  [   64/  730]
Epoch 10
-------------------------------
loss: 0.023521  [   64/  730]
Epoch 11
-------------------------------
loss: 0.021919  [   64/  730]
Epoch 12
-------------------------------
loss: 0.049409  [   64/  730]
Epoch 13
-------------------------------
loss: 0.014869  [   64/  730]
Epoch 14
-------------------------------
loss: 0.021600  [   64/  730]
Epoc

MLP(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=330, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [91]:
# 2-fold cross-validation of the linear baseline: MSE loss, Adam (lr=1e-3), 100 epochs per fold.
myLinear = LinearRegression().to(device)
linear_loss_fn = nn.MSELoss()
linear_optimizer = torch.optim.Adam(myLinear.parameters(), lr=1e-3)
k_fold_validation(2, X_train, y_train, myLinear, linear_loss_fn, linear_optimizer, 100)

Fold 1
Epoch 1
-------------------------------
loss: 145.260834  [   64/  730]
Epoch 2
-------------------------------
loss: 132.336273  [   64/  730]
Epoch 3
-------------------------------
loss: 123.003494  [   64/  730]
Epoch 4
-------------------------------
loss: 110.017754  [   64/  730]
Epoch 5
-------------------------------
loss: 101.775375  [   64/  730]
Epoch 6
-------------------------------
loss: 92.291176  [   64/  730]
Epoch 7
-------------------------------
loss: 82.063210  [   64/  730]
Epoch 8
-------------------------------
loss: 75.232544  [   64/  730]
Epoch 9
-------------------------------
loss: 65.829102  [   64/  730]
Epoch 10
-------------------------------
loss: 59.407452  [   64/  730]
Epoch 11
-------------------------------
loss: 53.584278  [   64/  730]
Epoch 12
-------------------------------
loss: 47.778667  [   64/  730]
Epoch 13
-------------------------------
loss: 42.749374  [   64/  730]
Epoch 14
-------------------------------
loss: 38.334534  [  

LinearRegression(
  (linear): Linear(in_features=330, out_features=1, bias=True)
)