In [1]:
import json
from typing import Optional

import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning import LightningDataModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch import nn
from torch.nn import functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split
from torchmetrics import MeanSquaredError, R2Score

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
sparse_column = ["MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope",
                 "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
                 "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation", "BsmtQual",
                 "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir",
                 "Electrical", "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
                 "GarageCond", "PavedDrive", "PoolQC", "Fence", "MiscFeature", "SaleType", "SaleCondition"]

consistent_column = ["MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
                     "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
                     "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
                     "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
                     "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea",
                     "MiscVal", "MoSold", "YrSold"]

In [4]:
class MyDataset(Dataset):
    def __init__(self):
        self.csv = pd.read_csv("./data/train.csv")
        numeric = list(self.csv._get_numeric_data().columns)
        for col in numeric:
            if col == 'Id' or col == "SalePrice":
                continue
            self.csv[col] = (self.csv[col] - self.csv[col].mean()) / self.csv[col].std() if (
                    self.csv[col].std() > 0) else 1
        with open("./data/sparse_label.json", "r", encoding="utf-8") as f:
            self.sparse_label = json.loads(f.read())
        with open("./data/sparse_column.json", "r", encoding="utf-8") as f:
            self.sparse_column = json.loads(f.read())

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        item = self.csv.iloc[index]
        feat = []
        for head in self.csv:
            if head == "Id" or head == "SalePrice":
                continue
            if head in self.sparse_label:
                idx = self.sparse_label[head]["label2idx"][item[head]]
                feat.append(idx)
            else:
                feat.append(item[head])
        return {"feat": torch.tensor(feat), "price": item["SalePrice"]}


def collate_function(data):
    feat = [datum["feat"] for datum in data]
    feat = torch.stack(feat).float()
    price = [datum["price"] for datum in data]
    price = torch.tensor(price).float()
    return {"feat": feat, "price": price}

In [5]:
class MyDataModule(LightningDataModule):
    def __init__(self):
        super().__init__()

    def setup(self, stage: Optional[str] = None) -> None:
        dataset = MyDataset()
        self.train_dataset, self.val_dataset = random_split(dataset, [1260, 200])

    def train_dataloader(self):
        return DataLoader(self.train_dataset, 1024, True, collate_fn=collate_function)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, 1024, False, collate_fn=collate_function)

In [16]:
class LinearModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.lr = 1e-4
        self.conv5 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=8, kernel_size=5, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=8, out_channels=16, kernel_size=5, stride=2),
            nn.ReLU()
        )
        self.conv7 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=8, out_channels=16, kernel_size=7, stride=2),
            nn.ReLU()
        )
        self.conv9 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=8, kernel_size=9, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=8, out_channels=16, kernel_size=9, stride=2),
            nn.ReLU()
        )
        self.fc1 = nn.Linear(1632, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.fc4 = nn.Linear(1024, 1024)
        self.fc5 = nn.Linear(1024, 140)
        self.fc6 = nn.Linear(140, 1)

        self.mse = MeanSquaredError()
        self.r2 = R2Score()

    def forward(self, feat):
        batch_size, _ = feat.size()
        x = feat.unsqueeze(dim=1)
        x5 = F.relu(self.conv5(x))
        x5 = x5.view(batch_size, -1)
        x7 = F.relu(self.conv7(x))
        x7 = x7.view(batch_size, -1)
        x9 = F.relu(self.conv9(x))
        x9 = x9.view(batch_size, -1)
        x = torch.cat((x5, x7, x9), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x)) + x
        x = F.relu(self.fc3(x)) + x
        x = F.relu(self.fc4(x)) + x
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        x = x.squeeze(dim=1)
        return x

    def training_step(self, batch, batch_idx):
        predict = self(batch["feat"])
        loss = F.mse_loss(predict.abs().log10(), batch["price"].log10())
        return {"loss": loss}

    def training_epoch_end(self, outputs):
        total_loss = 0
        for output in outputs:
            total_loss += output["loss"].item()

        self.logger.log_metrics({"train loss": total_loss}, self.current_epoch + 1)

    def validation_step(self, batch, batch_idx):
        predict = self(batch["feat"])
        self.mse(predict, batch["price"])
        self.r2(predict, batch["price"])

    def validation_epoch_end(self, outputs):
        mse = self.mse.compute()
        r2 = self.r2.compute()
        self.logger.log_metrics({"mse": mse.item(), "r2": r2.item()}, self.current_epoch + 1)
        self.mse.reset()
        self.r2.reset()

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr)

In [17]:
model = LinearModel()

In [20]:
logger = TensorBoardLogger('logs', name='cnn')
checkpoint = ModelCheckpoint(dirpath="./checkpoints/cnn", save_top_k=0)
trainer = pl.Trainer(max_epochs=400,
                     logger=logger,
                     callbacks=[checkpoint],
                     reload_dataloaders_every_n_epochs=5)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(


In [21]:
trainer.fit(model, datamodule=MyDataModule())


   | Name  | Type             | Params
--------------------------------------------
0  | conv5 | Sequential       | 704   
1  | conv7 | Sequential       | 976   
2  | conv9 | Sequential       | 1.2 K 
3  | fc1   | Linear           | 1.7 M 
4  | fc2   | Linear           | 1.0 M 
5  | fc3   | Linear           | 1.0 M 
6  | fc4   | Linear           | 1.0 M 
7  | fc5   | Linear           | 143 K 
8  | fc6   | Linear           | 141   
9  | mse   | MeanSquaredError | 0     
10 | r2    | R2Score          | 0     
--------------------------------------------
5.0 M     Trainable params
0         Non-trainable params
5.0 M     Total params
19.870    Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]

  rank_zero_warn(


Epoch 0:   0%|          | 0/3 [00:00<00:00, 333.33it/s]               

  rank_zero_warn(
  rank_zero_warn(


Epoch 0:  67%|██████▋   | 2/3 [00:02<00:00,  1.31it/s, loss=0.0276, v_num=7]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 3/3 [00:02<00:00,  1.57it/s, loss=0.0276, v_num=7]
Epoch 1:  67%|██████▋   | 2/3 [00:01<00:00,  1.84it/s, loss=0.0271, v_num=7] 
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 1: 100%|██████████| 3/3 [00:01<00:00,  2.10it/s, loss=0.0271, v_num=7]
Epoch 2:  67%|██████▋   | 2/3 [00:01<00:00,  1.91it/s, loss=0.0272, v_num=7]  
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 2: 100%|██████████| 3/3 [00:01<00:00,  2.21it/s, loss=0.0272, v_num=7]
Epoch 3:  67%|██████▋   | 2/3 [00:01<00:00,  1.95it/s, loss=0.0269, v_num=7] 
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/1 [00:00<?, ?it/s][A
Epoch 3: 100%|██████████| 3/3 [00:01<00:00,  2.28it/s, loss=0.0269, v_num=7]
Epoch 4:  67%|██████▋   | 2/3 [0

# 测试

In [22]:
class PredictSet(Dataset):
    def __init__(self):
        self.csv = pd.read_csv("./data/test.csv")
        numeric = list(self.csv._get_numeric_data().columns)
        for col in numeric:
            if col == 'Id' or col == "SalePrice":
                continue
            self.csv[col] = (self.csv[col] - self.csv[col].mean()) / self.csv[col].std() if (
                    self.csv[col].std() > 0) else 1
        with open("./data/sparse_label.json", "r", encoding="utf-8") as f:
            self.sparse_label = json.loads(f.read())
        with open("./data/sparse_column.json", "r", encoding="utf-8") as f:
            self.sparse_column = json.loads(f.read())

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        item = self.csv.iloc[index]
        feat = []
        for head in self.csv:
            if head == "Id":
                continue
            if head in self.sparse_label:
                idx = self.sparse_label[head]["label2idx"][item[head]]
                feat.append(idx)
            else:
                feat.append(item[head])
        return {"feat": torch.tensor(feat), "idx": item["Id"]}


def collate_fn(data):
    feat = [datum["feat"] for datum in data]
    feat = torch.stack(feat).float()
    idx = [datum["idx"] for datum in data]
    return {"feat": feat, "id": idx}

In [23]:
predict_set = PredictSet()
predict_loader = DataLoader(predict_set, 256, False, collate_fn=collate_fn)

In [24]:
model.to(device)

LinearModel(
  (conv5): Sequential(
    (0): Conv1d(1, 8, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(8, 16, kernel_size=(5,), stride=(2,))
    (3): ReLU()
  )
  (conv7): Sequential(
    (0): Conv1d(1, 8, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(8, 16, kernel_size=(7,), stride=(2,))
    (3): ReLU()
  )
  (conv9): Sequential(
    (0): Conv1d(1, 8, kernel_size=(9,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(8, 16, kernel_size=(9,), stride=(2,))
    (3): ReLU()
  )
  (fc1): Linear(in_features=1632, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1024, bias=True)
  (fc5): Linear(in_features=1024, out_features=140, bias=True)
  (fc6): Linear(in_features=140, out_features=1, bias=True)
  (mse): MeanSquaredError()
  (r2): R2Score()
)

In [25]:
test_result = []
with torch.no_grad():
    for batch in predict_loader:
        predict = model(batch["feat"].to(device))
        for price, id in zip(predict, batch["id"]):
            test_result.append({"Id": id, "SalePrice": price.item()})

In [26]:
result_df = pd.DataFrame(test_result)

In [27]:
result_df.to_csv("./data/result/result24.csv")