## 数据准备

In [18]:
import pandas as pd
import numpy as np
import torch

In [19]:
# Relative locations of the training and test CSV files.
train_file_path = 'data/train.csv'
test_file_path = 'data/test.csv'

# Read both files into DataFrames; the unpacking order follows the path tuple.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

## 数据预处理

In [20]:
# Build one combined feature table: training rows first, test rows after.
# The first column is dropped from both frames, and the last column is also
# dropped from the training frame (presumably the Id and SalePrice columns --
# confirm against the CSV headers).
all_features = pd.concat(
    [
        train_data.iloc[:, 1:-1],
        test_data.iloc[:, 1:],
    ],
    axis=0,
)

print(train_data.shape, test_data.shape, all_features.shape)

(1460, 81) (1459, 80) (2919, 79)


In [21]:
# Select the numeric columns positively by dtype family instead of testing for
# "not object". The negative test also matches any non-object dtype that is not
# numeric (bool, datetime, category, or a dedicated string dtype), and such a
# column would then break the standardization step that uses this index.
numeric_features = all_features.select_dtypes(include='number').columns

print(numeric_features, len(numeric_features))

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object') 36


In [22]:
# Standardize the numeric features.
def _standardize(column):
    """Return ``column`` shifted by its mean and divided by its standard deviation."""
    return (column - column.mean()) / column.std()


# After standardization, remaining missing values are replaced with 0.
all_features[numeric_features] = (
    all_features[numeric_features].apply(_standardize).fillna(0)
)

In [None]:
# One-hot encode the table; dummy_na=True requests an extra indicator column
# for missing values. The result is then cast to float32 in a second step.
all_features = pd.get_dummies(data=all_features, dummy_na=True)
all_features = all_features.astype('float32')
print(all_features.shape)


(2919, 330)


In [48]:
# Split the combined feature table back into train/test parts by row position:
# the first n_train rows came from the training frame.
n_train = len(train_data)
X_train = torch.tensor(all_features.iloc[:n_train].values, dtype=torch.float32)
X_test = torch.tensor(all_features.iloc[n_train:].values, dtype=torch.float32)

# Targets are the natural log of SalePrice, shaped as a single column.
y_train = torch.log(
    torch.tensor(train_data['SalePrice'].values, dtype=torch.float32).view(-1, 1)
)

print(X_train[:5], y_train[:5])
print(X_train.shape, y_train.shape, X_test.shape)

tensor([[ 0.0673, -0.1844, -0.2178,  ...,  1.0000,  0.0000,  0.0000],
        [-0.8735,  0.4581, -0.0720,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.0673, -0.0559,  0.1372,  ...,  1.0000,  0.0000,  0.0000],
        [ 0.3025, -0.3986, -0.0784,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0673,  0.6294,  0.5188,  ...,  1.0000,  0.0000,  0.0000]]) tensor([[12.2477],
        [12.1090],
        [12.3172],
        [11.8494],
        [12.4292]])
torch.Size([1460, 330]) torch.Size([1460, 1]) torch.Size([1459, 330])


## 建立 torch 的 Dataset 和 DataLoader

In [49]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [50]:
class TrainDataset(Dataset):
    """Dataset that pairs each feature entry with the target at the same index.

    Args:
        X: Indexable, sized collection of features.
        y: Indexable, sized collection of targets; must be as long as ``X``.
    """

    def __init__(self, X, y):
        # Both collections are indexed with the same idx, so they must line up.
        assert len(X) == len(y)
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples (the length of the feature collection)."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [51]:
# Number of samples per mini-batch.
batch_size = 64

# Wrap the training tensors and serve them in shuffled mini-batches.
train_dataset = TrainDataset(X_train, y_train)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Sanity check: print the shapes of the first batch only, then stop.
for X, y in train_dataloader:
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
    break

Shape of X: torch.Size([64, 330])
Shape of y: torch.Size([64, 1])


## 建立模型

In [52]:
# Pick the compute device: CUDA first, then Apple's MPS backend, else the CPU.
# MPS availability is queried through torch.backends.mps, the documented check,
# because torch.mps.is_available() only exists in recent PyTorch releases and
# raises AttributeError on older ones.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using {device} device")

class NeuralNetwork(nn.Module):
    """Single linear layer that maps each feature vector to one output value.

    Args:
        in_features: Number of input features per sample. When omitted, it is
            read from the second dimension of the notebook-level ``X_train``
            tensor, which is what the layer was always sized from before this
            parameter existed.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the layer from the global
            # training matrix, so NeuralNetwork() keeps working unchanged.
            in_features = X_train.shape[1]
        # A deeper alternative is kept here, disabled, for reference:
        # self.linear_relu_stack = nn.Sequential(
        #     nn.Linear(in_features, 256),
        #     nn.ReLU(),
        #     nn.Linear(256, 256),
        #     nn.ReLU(),
        #     nn.Linear(256, 1)
        # )
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 1),
        )

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result (last dim is 1)."""
        return self.linear_relu_stack(x)
    
# Create the network, then place its parameters on the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

Using cpu device
NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=330, out_features=1, bias=True)
  )
)


In [53]:
# Mean-squared error between predictions and targets, minimized with Adam.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [55]:
def train(dataloader, model, loss_fn, optimizer, log_every=100):
    """Run one training pass over ``dataloader`` and print periodic progress.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that exposes a sized
            ``.dataset`` attribute (used only for the progress message).
        model: The ``nn.Module`` being optimized; it is put into training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer that updates the model's parameters.
        log_every: Print the loss for every ``log_every``-th batch, starting
            with the first one. Defaults to 100.

    Returns:
        None. The model is updated in place.
    """
    model.train()
    size = len(dataloader.dataset)

    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # Running count of samples processed so far. Multiplying the batch index by
    # the current batch length miscounts when the logged batch is a short final
    # batch, so the lengths are accumulated instead.
    seen = 0
    for batch, (X, y) in enumerate(dataloader):
        if target_device is not None:
            X, y = X.to(target_device), y.to(target_device)

        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        seen += len(X)
        if batch % log_every == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")

In [58]:
# Number of full passes over the training data.
epochs = 100

# Each pass prints a header and then delegates to train() for one epoch.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)

print("Done!")

Epoch 1
-------------------------------
loss: 3.977460  [   64/ 1460]
Epoch 2
-------------------------------
loss: 2.707125  [   64/ 1460]
Epoch 3
-------------------------------
loss: 2.028079  [   64/ 1460]
Epoch 4
-------------------------------
loss: 1.411597  [   64/ 1460]
Epoch 5
-------------------------------
loss: 1.077563  [   64/ 1460]
Epoch 6
-------------------------------
loss: 0.618348  [   64/ 1460]
Epoch 7
-------------------------------
loss: 0.531889  [   64/ 1460]
Epoch 8
-------------------------------
loss: 0.313209  [   64/ 1460]
Epoch 9
-------------------------------
loss: 0.296743  [   64/ 1460]
Epoch 10
-------------------------------
loss: 0.160741  [   64/ 1460]
Epoch 11
-------------------------------
loss: 0.124895  [   64/ 1460]
Epoch 12
-------------------------------
loss: 0.138639  [   64/ 1460]
Epoch 13
-------------------------------
loss: 0.086676  [   64/ 1460]
Epoch 14
-------------------------------
loss: 0.083841  [   64/ 1460]
Epoch 15
------

In [59]:
# Spot-check the fit on the first 10 training rows. Both printed values are on
# the log scale, because y_train holds log(SalePrice).
model.eval()
with torch.no_grad():
    # The inputs must sit on the same device as the model's parameters;
    # without the .to(device) this raises a device-mismatch error whenever the
    # model was moved to CUDA or MPS.
    train_pred = model(X_train[:10].to(device))

# zip stops at the shorter sequence, so this also works with fewer than 10 rows.
for predicted, actual in zip(train_pred, y_train[:10]):
    print(f"Predicted: {predicted.item()}, Actual: {actual.item()}")


Predicted: 12.276256561279297, Actual: 12.24769401550293
Predicted: 12.017802238464355, Actual: 12.109010696411133
Predicted: 12.324498176574707, Actual: 12.317166328430176
Predicted: 12.128534317016602, Actual: 11.849397659301758
Predicted: 12.594012260437012, Actual: 12.429216384887695
Predicted: 11.935307502746582, Actual: 11.870599746704102
Predicted: 12.52228832244873, Actual: 12.634603500366211
Predicted: 12.100232124328613, Actual: 12.206072807312012
Predicted: 11.830005645751953, Actual: 11.774519920349121
Predicted: 11.4766845703125, Actual: 11.67844009399414


In [60]:
# Predict on the test features and write the submission file.
model.eval()
with torch.no_grad():
    # The inputs must sit on the same device as the model's parameters;
    # without the .to(device) this fails when the model lives on CUDA or MPS.
    # exp() undoes the log that was applied to the training targets.
    test_pred = torch.exp(model(X_test.to(device)))

# Building the frame and writing the CSV need no autograd context, so they sit
# outside the no_grad block. Predictions are brought back to the CPU first.
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': test_pred.cpu().numpy().flatten(),
})
submission.to_csv('submission.csv', index=False)

In [16]:
# `test_df` is not defined in any cell of this notebook, so the original line
# raises NameError on a fresh kernel. Inspect `test_data`, the test DataFrame
# loaded from CSV, instead.
# NOTE(review): assumes `test_df` was an earlier name for the same frame -- confirm.
print(type(test_data))

<class 'pandas.core.frame.DataFrame'>


In [17]:
# Show the `house_features` columns of the test row whose Id equals 2121.
# NOTE(review): `house_features` is not defined in any cell visible in this
# notebook, and this cell's execution count (17) is lower than those of the
# cells above it, so it looks like a leftover from an earlier session. Confirm,
# then either define the column list before this cell or remove the cell.
print(test_data[test_data['Id'] == 2121][house_features])

     OverallCond  FullBath  HalfBath  KitchenAbvGr  YearRemodAdd  3SsnPorch  \
660            7         1         0             1          1950          0   

     TotRmsAbvGrd  WoodDeckSF  BedroomAbvGr  MiscVal  ...  OverallQual  \
660             4           0             2        0  ...            4   

     LowQualFinSF  2ndFlrSF  EnclosedPorch  YearBuilt  Fireplaces  YrSold  \
660             0         0              0       1946           0    2008   

     ScreenPorch  OpenPorchSF  MoSold  
660            0            0       4  

[1 rows x 25 columns]
