## 数据准备

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_file_path = 'data/train.csv'
test_file_path = 'data/test.csv'

train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

In [3]:
print(train_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
train_data_clean = train_data.dropna(axis=1)
test_data_clean = test_data.dropna(axis=1)

print(train_data_clean.columns)
print(test_data_clean.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')
Index(['Id', 'MSSubClass', 'LotArea', 'Street', 

In [5]:
house_features = list(train_data_clean.select_dtypes(include=[np.number]).columns.drop(['Id', 'SalePrice']))
house_features = list(set(house_features).intersection(set(test_data_clean.columns)))

print(house_features)
print(len(house_features))

['OverallCond', 'FullBath', 'HalfBath', 'KitchenAbvGr', 'YearRemodAdd', '3SsnPorch', 'TotRmsAbvGrd', 'WoodDeckSF', 'BedroomAbvGr', 'MiscVal', 'PoolArea', 'LotArea', 'MSSubClass', '1stFlrSF', 'GrLivArea', 'OverallQual', 'LowQualFinSF', '2ndFlrSF', 'EnclosedPorch', 'YearBuilt', 'Fireplaces', 'YrSold', 'ScreenPorch', 'OpenPorchSF', 'MoSold']
25


In [6]:
train_df = train_data_clean[house_features + ['SalePrice']]

test_df = test_data[house_features]

In [7]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [8]:
class DataFrameDataset(Dataset):
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        features = torch.tensor(self.df.iloc[idx].values[:-1], dtype=torch.float32)
        label = torch.tensor(self.df.iloc[idx].values[-1], dtype=torch.float32)
        return features, label

In [9]:
train_dataset = DataFrameDataset(train_df)

batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for X, y in train_dataloader:
  print(f"Shape of X: {X.shape}")
  print(f"Shape of y: {y.shape}")
  break

Shape of X: torch.Size([64, 25])
Shape of y: torch.Size([64])


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.mps.is_available() else 'cpu')
print(f"Using {device} device")

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(len(house_features), 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )
        
    def forward(self, x):
        price = self.linear_relu_stack(x)
        return price
    
model = NeuralNetwork().to(device)
print(model)

Using cpu device
NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=25, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)


In [11]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [12]:
def train(dataloader, model, loss_fn, optimizer):
    model.train()
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        y = y.view(-1, 1)
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [13]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 40836907008.000000  [   64/ 1460]
Epoch 2
-------------------------------
loss: 27962978304.000000  [   64/ 1460]
Epoch 3
-------------------------------
loss: 15049316352.000000  [   64/ 1460]
Epoch 4
-------------------------------
loss: 3660291584.000000  [   64/ 1460]
Epoch 5
-------------------------------
loss: 3788603136.000000  [   64/ 1460]
Done!


In [14]:
model.eval()
with torch.no_grad():

    train_X = torch.tensor(train_df.values[:, :-1], dtype=torch.float32).to(device)
    train_y = torch.tensor(train_df.values[:, -1], dtype=torch.float32).to(device)

    model.eval()
    with torch.no_grad():
        train_pred = model(train_X[:10])
        for i in range(10):
            print(f"Predicted: {train_pred[i].item()}, Actual: {train_y[i].item()}")


Predicted: 172667.03125, Actual: 208500.0
Predicted: 168282.1875, Actual: 181500.0
Predicted: 189554.125, Actual: 223500.0
Predicted: 177976.796875, Actual: 140000.0
Predicted: 218853.28125, Actual: 250000.0
Predicted: 195536.4375, Actual: 143000.0
Predicted: 183045.703125, Actual: 307000.0
Predicted: 196992.75, Actual: 200000.0
Predicted: 161078.71875, Actual: 129900.0
Predicted: 149520.875, Actual: 118000.0


In [15]:
model.eval()
with torch.no_grad():
    test_X = torch.tensor(test_df.values, dtype=torch.float32).to(device)
    test_pred = model(test_X)
    submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_pred.cpu().numpy().flatten()})
    submission.to_csv('submission.csv', index=False)

In [16]:
print(type(test_df))

<class 'pandas.core.frame.DataFrame'>


In [17]:
print(test_data[test_data['Id'] == 2121][house_features])

     OverallCond  FullBath  HalfBath  KitchenAbvGr  YearRemodAdd  3SsnPorch  \
660            7         1         0             1          1950          0   

     TotRmsAbvGrd  WoodDeckSF  BedroomAbvGr  MiscVal  ...  OverallQual  \
660             4           0             2        0  ...            4   

     LowQualFinSF  2ndFlrSF  EnclosedPorch  YearBuilt  Fireplaces  YrSold  \
660             0         0              0       1946           0    2008   

     ScreenPorch  OpenPorchSF  MoSold  
660            0            0       4  

[1 rows x 25 columns]
