In [None]:
from transformers import BertConfig, BertModel, AdamW
import json
from typing import Optional

import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning import LightningDataModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchmetrics import MeanSquaredError, R2Score

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Names of the 43 columns handled as "sparse" features.
# NOTE(review): presumably the categorical columns of the table — confirm against the data.
sparse_column = [
    "MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities",
    "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType",
    "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir", "Electrical",
    "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
    "GarageCond", "PavedDrive", "PoolQC", "Fence", "MiscFeature", "SaleType",
    "SaleCondition",
]

# Names of the 36 columns that are not in the sparse list.
# NOTE(review): presumably the numeric columns of the table — confirm against the data.
consistent_column = [
    "MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "YearBuilt",
    "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath",
    "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "MoSold", "YrSold",
]

In [None]:
class MyDataset(Dataset):
    """Training rows as feature tensors paired with their sale price.

    Every column except ``Id`` and ``SalePrice`` contributes one feature. A column
    that has an entry in the sparse-label mapping is replaced by the integer stored
    under that entry's ``"label2idx"`` table; all other columns are passed through
    unchanged.

    Args:
        csv_path: CSV file holding the training table.
        sparse_label_path: JSON file mapping column name -> ``{"label2idx": {...}}``.
        sparse_column_path: JSON file loaded into ``self.sparse_column``.

    Raises:
        KeyError: from ``__getitem__`` when a sparse column holds a value that is
            missing from its ``label2idx`` table.
    """

    def __init__(self,
                 csv_path="./data/train.csv",
                 sparse_label_path="./data/sparse_label.json",
                 sparse_column_path="./data/sparse_column.json"):
        self.csv = pd.read_csv(csv_path)
        with open(sparse_label_path, "r", encoding="utf-8") as f:
            self.sparse_label = json.load(f)
        with open(sparse_column_path, "r", encoding="utf-8") as f:
            # Exposed as an attribute only; item construction does not read it.
            self.sparse_column = json.load(f)
        # The set of feature columns is fixed, so resolve it once rather than per item.
        self.feature_columns = [head for head in self.csv if head not in ("Id", "SalePrice")]

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.csv)

    def __getitem__(self, index):
        """Return ``{"feat": 1-D tensor, "price": SalePrice}`` for the row at ``index``."""
        item = self.csv.iloc[index]
        feat = []
        for head in self.feature_columns:
            if head in self.sparse_label:
                # Encode the raw label as its integer index.
                feat.append(self.sparse_label[head]["label2idx"][item[head]])
            else:
                feat.append(item[head])
        return {"feat": torch.tensor(feat), "price": item["SalePrice"]}


def collate_function(data):
    """Merge training samples into one batch.

    Args:
        data: sequence of dicts with a ``"feat"`` tensor and a ``"price"`` scalar.

    Returns:
        Dict with ``"feat"`` (samples stacked along a new first axis, float32) and
        ``"price"`` (1-D float32 tensor, one entry per sample).
    """
    feature_rows = []
    prices = []
    for sample in data:
        feature_rows.append(sample["feat"])
        prices.append(sample["price"])
    return {
        "feat": torch.stack(feature_rows).float(),
        "price": torch.tensor(prices).float(),
    }

In [None]:
class MyDataModule(LightningDataModule):
    """Random train/validation split of ``MyDataset`` with matching loaders.

    Args:
        batch_size: number of rows per batch for both loaders.
        val_size: number of rows held out for validation; the remaining rows
            form the training split.
    """

    def __init__(self, batch_size: int = 256, val_size: int = 200):
        super().__init__()
        self.batch_size = batch_size
        self.val_size = val_size

    def setup(self, stage: Optional[str] = None) -> None:
        """Load the dataset and split it into training and validation subsets."""
        self.dataset = MyDataset()
        # Derive the training size from the data so the split also works when the
        # table does not have exactly 1460 rows.
        train_size = len(self.dataset) - self.val_size
        self.train_dataset, self.val_dataset = random_split(self.dataset, [train_size, self.val_size])

    def train_dataloader(self):
        """Shuffled loader over the training split only."""
        # Serve the training subset, not the full dataset: the validation rows must
        # stay unseen during training for the validation metrics to mean anything.
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True,
                          collate_fn=collate_function)

    def val_dataloader(self):
        """Unshuffled loader over the validation split."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False,
                          collate_fn=collate_function)

In [None]:
class AttentionModel(pl.LightningModule):
    """Price regressor: three 1-D conv branches -> BERT encoder -> residual MLP head.

    The flat feature vector is treated as a one-channel sequence. Each conv branch
    (kernel sizes 5, 7 and 9) produces 16 channels; the branch outputs are
    concatenated along the length axis and handed to the BERT encoder as 16
    embeddings. The encoder output is flattened and regressed to a single value.

    NOTE(review): the layer sizes only line up when the concatenated branch length
    equals ``hidden_size`` (102). With unpadded convolutions, 79 input features
    give 36 + 34 + 32 = 102 — confirm the dataset yields 79 features per row.
    """

    def __init__(self):
        super().__init__()
        # Three parallel branches that differ only in kernel size; the second
        # convolution of each branch halves the length with stride 2.
        self.conv5 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=8, kernel_size=5, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=8, out_channels=16, kernel_size=5, stride=2),
            nn.ReLU()
        )
        self.conv7 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=8, out_channels=16, kernel_size=7, stride=2),
            nn.ReLU()
        )
        self.conv9 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=8, kernel_size=9, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=8, out_channels=16, kernel_size=9, stride=2),
            nn.ReLU()
        )
        config = BertConfig(
            hidden_size=102,
            intermediate_size=256,
            num_attention_heads=2,
            num_hidden_layers=4
        )
        self.bert = BertModel(config)
        # 1632 = 16 conv channels * 102 hidden units, i.e. the flattened encoder output.
        self.fc1 = nn.Linear(1632, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, 1)

        # Validation metrics; accumulated per batch and reset after every epoch.
        self.mse = MeanSquaredError()
        self.r2 = R2Score()

    def forward(self, feat):
        """Predict one price per row.

        Args:
            feat: float tensor of shape (batch, num_features).

        Returns:
            Tensor of shape (batch,) holding the predicted prices.
        """
        x = feat.unsqueeze(dim=1)  # (batch, 1, num_features): single Conv1d input channel
        # Every branch already ends in nn.ReLU, so no extra activation is applied here.
        x5 = self.conv5(x)
        x7 = self.conv7(x)
        x9 = self.conv9(x)
        x = torch.cat((x5, x7, x9), dim=2)  # (batch, 16, len5 + len7 + len9)
        # Call the module itself rather than .forward() so registered hooks still run.
        x = self.bert(inputs_embeds=x)["last_hidden_state"]
        batch_size, _, _ = x.size()
        x = x.view(batch_size, -1)
        x = F.relu(self.fc1(x))
        # Two residual blocks on the 512-wide representation.
        x = F.relu(self.fc2(x)) + x
        x = F.relu(self.fc3(x)) + x
        x = self.fc4(x)
        x = x.squeeze(dim=1)  # (batch, 1) -> (batch,)
        return x

    def training_step(self, batch, batch_idx):
        """Return the MSE between predictions and ``batch["price"]`` as the loss."""
        predict = self(batch["feat"])
        loss = F.mse_loss(predict, batch["price"])
        return {"loss": loss}

    def training_epoch_end(self, outputs):
        """Log the sum of this epoch's batch losses under ``"train loss"``."""
        total_loss = 0
        for output in outputs:
            total_loss += output["loss"].item()

        self.logger.log_metrics({"train loss": total_loss}, self.current_epoch + 1)

    def validation_step(self, batch, batch_idx):
        """Feed the batch predictions into the running MSE and R2 metrics."""
        predict = self(batch["feat"])
        self.mse(predict, batch["price"])
        self.r2(predict, batch["price"])

    def validation_epoch_end(self, outputs):
        """Log the accumulated validation MSE and R2, then reset both metrics."""
        mse = self.mse.compute()
        r2 = self.r2.compute()
        self.logger.log_metrics({"mse": mse.item(), "r2": r2.item()}, self.current_epoch + 1)
        self.mse.reset()
        self.r2.reset()

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at a learning rate of 1e-4."""
        return AdamW(self.parameters(), lr=1e-4)

In [None]:
# Instantiate the network; the same object is trained by `trainer.fit` and later
# reused for test-set prediction.
model = AttentionModel()

In [None]:
# TensorBoard logger created with 'logs' as its first argument and the run name 'bert'.
logger = TensorBoardLogger('logs', name='bert')
# NOTE(review): per the ModelCheckpoint documentation, save_top_k=0 disables checkpoint
# saving, so nothing would be written to dirpath — confirm this is intended.
checkpoint = ModelCheckpoint(dirpath="./checkpoints/bert", save_top_k=0)
# NOTE(review): no accelerator/device argument is passed, so the `device` computed earlier
# in this notebook does not influence where training runs — confirm the Trainer default
# is what is wanted.
trainer = pl.Trainer(max_epochs=400,
                     logger=logger,
                     callbacks=[checkpoint],
                     reload_dataloaders_every_n_epochs=5)

In [None]:
# Train `model` with the loaders supplied by a new MyDataModule instance.
trainer.fit(model, datamodule=MyDataModule())

# Test-set prediction

In [None]:
class PredictSet(Dataset):
    """Test rows as feature tensors paired with their ``Id``.

    Every column except ``Id`` contributes one feature. A column that has an entry
    in the sparse-label mapping is replaced by the integer stored under that
    entry's ``"label2idx"`` table; all other columns are passed through unchanged.

    Args:
        csv_path: CSV file holding the rows to predict.
        sparse_label_path: JSON file mapping column name -> ``{"label2idx": {...}}``.
        sparse_column_path: JSON file loaded into ``self.sparse_column``.

    Raises:
        KeyError: from ``__getitem__`` when a sparse column holds a value that is
            missing from its ``label2idx`` table.
    """

    def __init__(self,
                 csv_path="./data/test.csv",
                 sparse_label_path="./data/sparse_label.json",
                 sparse_column_path="./data/sparse_column.json"):
        self.csv = pd.read_csv(csv_path)
        with open(sparse_label_path, "r", encoding="utf-8") as f:
            self.sparse_label = json.load(f)
        with open(sparse_column_path, "r", encoding="utf-8") as f:
            # Exposed as an attribute only; item construction does not read it.
            self.sparse_column = json.load(f)
        # The set of feature columns is fixed, so resolve it once rather than per item.
        self.feature_columns = [head for head in self.csv if head != "Id"]

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.csv)

    def __getitem__(self, index):
        """Return ``{"feat": 1-D tensor, "idx": Id}`` for the row at ``index``."""
        item = self.csv.iloc[index]
        feat = []
        for head in self.feature_columns:
            if head in self.sparse_label:
                # Encode the raw label as its integer index.
                feat.append(self.sparse_label[head]["label2idx"][item[head]])
            else:
                feat.append(item[head])
        return {"feat": torch.tensor(feat), "idx": item["Id"]}


def collate_fn(data):
    """Merge prediction samples into one batch.

    Args:
        data: sequence of dicts with a ``"feat"`` tensor and an ``"idx"`` value.

    Returns:
        Dict with ``"feat"`` (samples stacked along a new first axis, float32) and
        ``"id"`` (list of the samples' ``"idx"`` values, in input order).
    """
    stacked = torch.stack([sample["feat"] for sample in data])
    return {
        "feat": stacked.to(torch.float32),
        "id": [sample["idx"] for sample in data],
    }

In [None]:
# Wrap the rows to predict in a loader: batches of 256, no shuffling.
predict_set = PredictSet()
predict_loader = DataLoader(
    dataset=predict_set,
    batch_size=256,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# Move the model to `device` before prediction. As the last expression of the cell,
# the returned module is also displayed by the notebook.
model.to(device)

In [None]:
# Switch to evaluation mode so dropout layers (BertConfig enables dropout by default)
# are inactive during prediction; torch.no_grad() alone does not do this.
model.eval()

# One {"Id", "SalePrice"} record per row of the prediction set.
test_result = []
with torch.no_grad():
    for batch in predict_loader:
        predict = model(batch["feat"].to(device))
        # tolist() yields plain Python floats, one per row of the batch.
        for price, house_id in zip(predict.tolist(), batch["id"]):
            test_result.append({"Id": house_id, "SalePrice": price})

In [None]:
# Tabulate the prediction records: one row per record, columns taken from the dict keys.
result_df = pd.DataFrame(test_result)

In [None]:
# Write only the Id and SalePrice columns; without index=False pandas would also emit
# the positional row index as an extra unnamed first column.
result_df.to_csv("./data/result/result20.csv", index=False)