In [None]:
import torch
from torch.utils.data import Dataset


class HousepriceDataset(Dataset):
    """Sliding-window dataset over house records.

    Item ``i`` is the block of ``train_num`` consecutive rows that starts at
    row ``i`` (stride 1), so neighbouring items overlap by ``train_num - 1``
    rows.

    :param data:       row-sliceable numeric features, one row per house
    :param city:       row-sliceable integer city ids, aligned with ``data``
    :param label:      row-sliceable integer class labels, aligned with ``data``
    :param zipcodes:   row-sliceable integer zip-code ids, aligned with ``data``
    :param train_num:  number of rows in every window (must be positive)
    :raises ValueError: if ``train_num`` is not positive
    """

    def __init__(self, data, city, label, zipcodes, train_num):
        super(HousepriceDataset, self).__init__()
        if train_num <= 0:
            raise ValueError("train_num must be a positive window size, got {}".format(train_num))
        self.data = data
        self.city = city
        self.zipcodes = zipcodes
        self.label = label
        self.num = train_num

    def __len__(self):
        """Number of full windows: N rows hold ``N - num + 1`` of them."""
        # Clamp at zero: len() must never be negative, which would happen
        # whenever there are fewer rows than one window.
        return max(0, len(self.label) - self.num + 1)

    def __getitem__(self, index):
        """Return ``(data, city, zipcodes, label)`` tensors for window ``index``.

        Negative indices count from the end. An out-of-range index raises
        ``IndexError``; without it a slice would silently come back short or
        empty and plain iteration over the dataset would never terminate.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError("window index out of range for {} windows".format(size))

        stop = index + self.num
        data = torch.tensor(self.data[index:stop], dtype=torch.float32)
        city = torch.tensor(self.city[index:stop], dtype=torch.int)
        zipcodes = torch.tensor(self.zipcodes[index:stop], dtype=torch.int)
        label = torch.tensor(self.label[index:stop], dtype=torch.long)

        return data, city, zipcodes, label

class ValidationDataset(Dataset):
    """Dataset of consecutive, non-overlapping chunks of house records.

    Item ``i`` covers rows ``i * train_num`` up to (but excluding)
    ``(i + 1) * train_num``. Rows left over after the last full chunk are
    never returned.

    :param data:       row-sliceable numeric features, one row per house
    :param city:       row-sliceable integer city ids, aligned with ``data``
    :param label:      row-sliceable integer class labels, aligned with ``data``
    :param train_num:  number of rows in every chunk (must be positive)
    :raises ValueError: if ``train_num`` is not positive
    """

    def __init__(self, data, city, label, train_num):
        super(ValidationDataset, self).__init__()
        # Fail early: a zero chunk size would otherwise surface later as a
        # ZeroDivisionError inside __len__.
        if train_num <= 0:
            raise ValueError("train_num must be a positive chunk size, got {}".format(train_num))
        self.data = data
        self.city = city
        self.label = label
        self.num = train_num

    def __len__(self):
        """Number of full chunks that fit into the labels."""
        return len(self.label) // self.num

    def __getitem__(self, index):
        """Return ``(data, city, label)`` tensors for chunk ``index``.

        Negative indices count from the end. An out-of-range index raises
        ``IndexError``; without it a slice would silently come back short or
        empty and plain iteration over the dataset would never terminate.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError("chunk index out of range for {} chunks".format(size))

        start = index * self.num
        stop = start + self.num
        data = torch.tensor(self.data[start:stop], dtype=torch.float32)
        city = torch.tensor(self.city[start:stop], dtype=torch.int)
        label = torch.tensor(self.label[start:stop], dtype=torch.long)

        return data, city, label


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    """Single-head self-attention with a residual connection.

    The attended features are projected back to ``model_dim`` and added to the
    input. Note that ``forward`` returns the result channels-first
    (``[batch, model_dim, seq_len]``) so it can be fed directly to
    ``nn.BatchNorm1d``.

    :param model_dim:   width of the input (and output) features
    :param hidden_dim:  width of the query, key and value projections
    :param dropout:     dropout probability applied to the attention weights
    """

    def __init__(self, model_dim, hidden_dim, dropout=0.1):
        super(Attention, self).__init__()

        # A single projection produces Q, K and V; for simplicity the value
        # width equals hidden_dim as well.
        self.linear = nn.Linear(model_dim, hidden_dim * 3)
        # Scaled dot-product attention divides the logits by sqrt(d_k), where
        # d_k is the query/key width. Q and K are hidden_dim wide here, so the
        # scale is derived from hidden_dim rather than model_dim.
        self.temperature = hidden_dim ** 0.5
        self.dropout = nn.Dropout(dropout)
        # Project back to model_dim so the residual addition is shape-compatible.
        self.fc = nn.Linear(hidden_dim, model_dim)

    def forward(self, x):
        """
        :param x: input features, [batch, seq_len, model_dim]
        :return: attended features plus residual, permuted to [batch, model_dim, seq_len]
        """
        q, k, v = torch.chunk(self.linear(x), chunks=3, dim=-1)  # each [batch, seq_len, hidden_dim]
        residual = x
        attn = torch.matmul(q, k.transpose(-2, -1)) / self.temperature  # [batch, seq_len, seq_len]
        # TODO: masking is not implemented; every position attends to every other one.
        attn = self.dropout(F.softmax(attn, dim=-1))
        output = torch.matmul(attn, v)

        output = self.fc(output) + residual
        # [batch, features(channels), seq_len] is the layout BatchNorm1d expects
        output = torch.permute(output, [0, 2, 1])
        return output


class DenseNet(nn.Module):
    """Per-house price-class classifier built from Linear + Attention stages.

    Pipeline of ``forward``:
      1. embed the city ids and concatenate them with the numeric features;
      2. two Linear -> Attention -> BatchNorm -> ReLU stages;
      3. one Linear -> BatchNorm -> ReLU stage;
      4. an MLP head followed by a softmax over the classes.
    """

    def __init__(self, d_model, embed_dim, hidden_dim, city_num, num_classes=4):
        """
        :param d_model:        number of numeric input features (before the city embedding is appended)
        :param embed_dim:      size of the city embedding
        :param hidden_dim:     base width; the stages use 2x and 3x this value
        :param city_num:       number of distinct cities in the dataset
        :param num_classes:    number of output classes
        """
        super().__init__()

        self.hidden = hidden_dim
        self.d_model = d_model
        wide = 2 * hidden_dim
        wider = 3 * hidden_dim

        self.city_embed = nn.Embedding(city_num, embed_dim)

        # stage 1: (features + embedding) -> 2h
        self.fc1 = nn.Linear(d_model + embed_dim, wide)
        self.attn1 = Attention(wide, wide)
        self.bn1 = nn.BatchNorm1d(wide)

        # stage 2: 2h -> 3h
        self.fc2 = nn.Linear(wide, wider)
        self.attn2 = Attention(wider, wide)
        self.bn2 = nn.BatchNorm1d(wider)

        self.activ = nn.ReLU()

        # stage 3: 3h -> 2h, no attention
        self.fc4 = nn.Linear(wider, wide)
        self.bn4 = nn.BatchNorm1d(wide)

        self.mlp_head = nn.Sequential(
            nn.Linear(wide, hidden_dim),
            nn.Linear(hidden_dim, num_classes),
        )

    def _norm_relu(self, norm, channels_first):
        """Batch-normalise a [Batch, channels, num] tensor, then return ReLU of it as [Batch, num, channels]."""
        return self.activ(norm(channels_first).permute(0, 2, 1))

    def forward(self, x, cities):
        """
        :param x:            numeric house features, [Batch, num, d_model]
        :param cities:       integer city ids, [Batch, num]
        :return:             class probabilities (softmax over the last axis), [Batch, num, num_classes]
        """
        embedded = self.city_embed(cities)
        features = torch.cat([x, embedded], dim=-1)  # [Batch, num, d_model + embed_dim]

        # The attention layers hand back channels-first tensors, ready for BatchNorm1d.
        stage1 = self._norm_relu(self.bn1, self.attn1(self.fc1(features)))
        stage2 = self._norm_relu(self.bn2, self.attn2(self.fc2(stage1)))

        # fc4 keeps [Batch, num, channels], so permute explicitly before normalising.
        stage3 = self._norm_relu(self.bn4, self.fc4(stage2).permute(0, 2, 1))

        return F.softmax(self.mlp_head(stage3), dim=-1)



"""
test = DataProcess("./train_data_final", unused_attrs=['district', 'city', 'zip code', 'region',
                                                       'unit price of residence space',
                                                       'exchange rate',
                                                       'unit price of building space', 'total cost',

                                                       ])

l1, l2, l3, l4 = test.getdata(normalize=True)
train_num = int(len(l1) * 0.9)
city_num = len(np.unique(l2))
zip_num = len(np.unique(l3))
train_x, valid_x = l1[:train_num], l1[train_num:]
train_city, valid_city = l2[:train_num], l2[train_num:]
train_zips, valid_zips = l3[:train_num], l3[train_num:]
train_label, valid_label = l4[:train_num], l4[train_num:]

train_dataset = HousepriceDataset_V2(train_x, train_city, train_label, train_zips, 40)
test_dataset = HousepriceDataset_V2(valid_x, valid_city, valid_label, valid_zips, 20)
print(len(train_dataset))
print(len(test_dataset))
train_dataloader = DataLoader(train_dataset, batch_size=20, shuffle=True,
                              )

test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=True,
                             )
model = DenseNet(l1.shape[1], 8, 32, city_num, zip_num)

train(train_dataloader, model, test_dataloader)
"""

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing


class DataProcess:
    """Loads the house-price CSV and turns it into model-ready arrays.

    Typical use is a single call to :meth:`getdata`, which reads the file,
    sets aside the ``city`` and ``total cost`` columns, drops the unused
    columns and encodes what is left.

    :param filepath:      path handed to ``pandas.read_csv``
    :param unused_attrs:  column names removed from the feature matrix
    """

    # Exclusive upper bounds of the first three price classes. A total cost at
    # or above the last edge belongs to the top class.
    PRICE_BIN_EDGES = (300000, 500000, 700000)

    def __init__(self, filepath, unused_attrs):
        self.data = None
        self.df = None
        self.cities = None
        self.labels = None
        self.total_cost = None
        self.fp = filepath
        self.unused = unused_attrs

    def read_data(self):
        """Read the CSV at ``self.fp`` into ``self.df``."""
        self.df = pd.read_csv(self.fp)

    def filter(self):
        """
        Set aside the 'city' and 'total cost' columns, then drop every column listed in ``self.unused``.

        We don't need attributes like 'District' & 'ZipCode' since they're most likely redundant;
        in addition to this, they're hard to deal with.
        :return:
        """
        self.cities = self.df["city"]
        self.total_cost = self.df['total cost']
        self.df = self.df.drop(columns=self.unused)

    def encode(self, normalize:bool):
        """
        1. for date attribute : extract month as feature
        2. for city attribute : encoded through nn.Embedding, but first we need to convert them to numeric values
        3. for label attribute : bucket 'total cost' with PRICE_BIN_EDGES into classes 0..3
        4. Feature Normalization (only when ``normalize`` is true)

        A missing (NaN) total cost compares false against every threshold and therefore
        lands in the top class; a RuntimeWarning is emitted so this does not go unnoticed.
        :return:
        """
        import warnings  # local import: the notebook's import cells do not bring in `warnings`

        # for each row in 'date' attribute, split the string and get the second value(as integer) which is month
        if 'date' in self.df.columns:
            self.df['date'] = self.df['date'].apply(lambda x: int(x.split('/')[1]))

        le = preprocessing.LabelEncoder()
        self.cities = le.fit_transform(self.cities)

        costs = self.total_cost.to_numpy(dtype=np.float64)
        missing = int(np.isnan(costs).sum())
        if missing:
            warnings.warn(
                "{} row(s) have a missing 'total cost'; they are assigned to the highest price class".format(missing),
                RuntimeWarning,
                stacklevel=2,
            )
        # np.digitize returns, for each cost, how many edges are <= it: 0 below
        # the first edge up to 3 at or above the last (NaN sorts last, so 3).
        self.labels = np.digitize(costs, self.PRICE_BIN_EDGES).astype(np.int32)

        self.data = self.df.to_numpy()
        # NOTE: column-wise normalisation makes every value very small; it may
        # be worth normalising only the columns that need it.
        if normalize:
            self.data = preprocessing.normalize(self.data, axis=0)

    def getdata(self, normalize:bool):
        """
        Run read_data, filter and encode in order.

        :param normalize: whether to normalise the feature matrix column-wise
        :return:  feature matrix, encoded city ids (to be embedded), class labels
        """
        self.read_data()
        self.filter()
        self.encode(normalize)
        return self.data, self.cities, self.labels


In [None]:
import torch.nn as nn
import torch
import torch.optim as optim

# Compute device: CUDA when available, otherwise the CPU. train() and eval()
# move the model and every batch onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Floating-point precision constant; not referenced by the other visible cells.
dtype = torch.float32

def train(dataloader, model, testloader, train_loss_list, train_acc_list, test_loss_list, test_acc_list,
          epochs=1000, lr=0.001, log_every=10, checkpoint_path="./7.pth"):
    """Train ``model`` with Adam and record per-epoch metrics.

    :param dataloader:       yields ``(features, cities, labels)`` training batches
    :param model:            called as ``model(features, cities)``; must return class
                             probabilities shaped [Batch, num, classes]
    :param testloader:       validation batches in the same format
    :param train_loss_list:  appended with the training loss after every epoch
    :param train_acc_list:   appended with the training accuracy after every epoch
    :param test_loss_list:   appended with the validation loss after every epoch
    :param test_acc_list:    appended with the validation accuracy after every epoch
    :param epochs:           number of passes over ``dataloader``
    :param lr:               Adam learning rate
    :param log_every:        evaluate and log every this many steps (0 or None disables it)
    :param checkpoint_path:  where the whole model is saved whenever validation accuracy improves

    Metrics come from this notebook's ``eval(model, loader, train)`` function
    (which shadows the Python builtin of the same name).
    """
    model.train()
    model.to(device)
    total_step = len(dataloader)
    # The model output is already softmax-normalised, so the matching loss is
    # NLLLoss on log-probabilities. nn.CrossEntropyLoss expects raw logits and
    # would apply a second softmax on top.
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
    best_acc = 0
    for epoch in range(epochs):
        for i, (x, y, z) in enumerate(dataloader):
            # eval() switches the model to evaluation mode; switch back before
            # every step so dropout and batch-norm statistics stay active.
            model.train()
            x = x.to(device)
            y = y.to(device)
            z = z.to(device)
            predict = model(x, y)  # output shape is [Batch, num, classes]
            predict = torch.permute(predict, [0, 2, 1])  # the loss wants classes on dim 1
            optimizer.zero_grad()
            # clamp guards against log(0) for classes with vanishing probability
            loss = criterion(torch.log(predict.clamp_min(1e-12)), z)
            loss.backward()
            optimizer.step()
            if log_every and i % log_every == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, epochs, i + 1, total_step,
                                                                         loss.item()))

                eval(model, dataloader, True)
                _, accuracy = eval(model, testloader, False)
                if accuracy > best_acc:
                    best_acc = accuracy
                    torch.save(model, checkpoint_path)
                print("Best Valid Accuracy: {}".format(best_acc))
        train_loss, train_acc = eval(model, dataloader, True)
        test_loss, test_acc = eval(model, testloader, False)
        train_loss_list.append(train_loss)
        train_acc_list.append(train_acc)
        test_loss_list.append(test_loss)
        test_acc_list.append(test_acc)
        # scheduler.step()

def eval(model, testloader, train:bool):
    """Measure mean loss and accuracy of ``model`` over ``testloader``.

    Since the test data has no labels at all, labelled training/validation
    data is used to measure accuracy.

    :param model:       called as ``model(features, cities)``; must return class
                        probabilities shaped [Batch, sequence_len, classes]
    :param testloader:  yields ``(features, cities, labels)`` batches
    :param train:       only selects the printed caption ("Train" or "Valid")
    :return:            ``(mean loss per batch, accuracy)``

    The model is put into evaluation mode for the measurement and is returned
    to whatever mode it was in before the call, so calling this from inside a
    training loop does not silently disable dropout / batch-norm updates.
    """
    was_training = model.training
    model.eval()
    totalNum = 0
    correctNum = 0
    # The model output is already softmax-normalised, so use NLLLoss on
    # log-probabilities; nn.CrossEntropyLoss would apply a second softmax.
    criterion = nn.NLLLoss()
    losses = 0.0
    try:
        with torch.no_grad():
            for x, y, z in testloader:
                x = x.to(device)
                y = y.to(device)
                z = z.to(device)
                predict = model(x, y)  # [Batch, sequence_len, classes]
                # the loss wants classes on dim 1; clamp guards against log(0)
                log_probs = torch.permute(torch.log(predict.clamp_min(1e-12)), [0, 2, 1])
                losses += criterion(log_probs, z).item()
                totalNum += x.shape[0] * x.shape[1]
                correctNum += torch.sum(torch.argmax(predict, dim=-1) == z)
    finally:
        model.train(was_training)
    acc = correctNum / totalNum
    losses /= len(testloader)
    if train:
        print("Train Accuracy: {}".format(acc))
    else:
        print("Valid Accuracy: {}".format(acc))
    return losses, acc
    

In [None]:
from torch.utils.data import DataLoader
import pandas as pd

# Driver cell: load and split the data, build the loaders and the model, then train.
# `np` and `preprocessing` are used here without being imported in this cell, so
# they must already be in scope from an earlier cell.
test = DataProcess("./train_data_final", unused_attrs=['district', 'city', 'zip code', 'region',
                                                                 'unit price of residence space',
                                                                 'unit price of building space', 'total cost',
                                                                 
                                                                 ])

# getdata() returns (feature matrix, encoded city ids, class labels).
l1, l2, l3 = test.getdata(normalize=True)
# The first 90% of the rows are used for training, the remainder for validation.
train_num = int(len(l1) * 0.9)
city_num = len(np.unique(l2))
# The feature matrix from getdata() is discarded here and replaced by the contents
# of a second CSV with its last four columns removed, normalised column-wise.
# NOTE(review): train_num, l2 and l3 still come from "./train_data_final"; this
# assumes both files hold the same rows in the same order — TODO confirm.
l1 = pd.read_csv("./train_data18.csv").to_numpy()
l1 = l1[:, :-4]
l1 = preprocessing.normalize(l1, axis=0)
train_x, valid_x = l1[:train_num], l1[train_num:]
train_city, valid_city = l2[:train_num], l2[train_num:]
train_label, valid_label = l3[:train_num], l3[train_num:]


# Translation of the note below: "Because 'total cost' has not been filled in in
# the current dataset file, its values are NaN, so every label is 4!"
"""
因为现在的数据集文件 total cost没补充, 值是nan，所以label均为4！
"""


# Disabled shape smoke test for DenseNet, kept as a bare string (never executed).
"""
net = DenseNet(100, 64, 64, 10)

cities = torch.randint(9, size=(10, 200))
inputx = torch.randn((10, 200, 100))
output = net(inputx, cities)
print(output.shape)
"""


# ValidationDataset cuts the rows into non-overlapping chunks: 40 rows per
# training item and 20 rows per validation item.
train_dataset = ValidationDataset(train_x, train_city, train_label, 40)
test_dataset = ValidationDataset(valid_x, valid_city, valid_label, 20)
train_dataloader = DataLoader(train_dataset, batch_size=20, shuffle=True,
                        )

test_dataloader = DataLoader(test_dataset, batch_size=10, shuffle=True,
                        )
# d_model = number of feature columns, embed_dim = 8, hidden_dim = 32.
model = DenseNet(l1.shape[1], 8, 32, city_num)


# Per-epoch histories filled in by train(): train loss, train accuracy,
# validation loss, validation accuracy.
trl, tra, tel, tea = list(), list(), list(), list()
train(train_dataloader, model, test_dataloader, trl, tra, tel, tea)



In [None]:
# Cast every recorded per-epoch accuracy to a plain Python float, in place.
for i, test_accuracy in enumerate(tea):
  tea[i] = float(test_accuracy)
  tra[i] = float(tra[i])
# Show the validation accuracies as the cell output.
tea

In [None]:
import csv

# Export the per-epoch training history, one CSV row per recorded epoch.
with open('attention_mlp_augmentation.csv', 'w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(["Train Loss", "Test Loss", "Train Accuracy", "Test Accuracy"])

  # Iterate over what was actually recorded rather than a hard-coded 1000, so
  # a shorter (or interrupted) run still exports cleanly. float() makes the
  # cell independent of whether the accuracies were already cast from tensors.
  for train_loss, test_loss, train_acc, test_acc in zip(trl, tel, tra, tea):
    writer.writerow([float(train_loss), float(test_loss), float(train_acc), float(test_acc)])