In [None]:
import os

import pandas as pd
import torch
from torch import nn

from common import load_data_arrays

# Directory that holds the house-prices `train.csv` / `test.csv` files.
# Set the HOUSE_PRICES_DATA_DIR environment variable to point somewhere else;
# the fallback is the original hard-coded local path. Joining with "" guarantees
# a trailing separator, so `path + "train.csv"` style concatenation keeps working.
path = os.path.join(
    os.environ.get(
        "HOUSE_PRICES_DATA_DIR",
        "/Users/sunny/Desktop/D2L/data.nosync/house-prices-advanced-regression-techniques/",
    ),
    "",
)

# Seed torch's global RNG so weight initialisation is repeatable across
# "Restart Kernel -> Run All" executions.
SEED = 42
torch.manual_seed(SEED)

# Training objective used by the cells below: mean squared error.
loss = nn.MSELoss()


In [None]:
def data_preprocess(parent_path: str):
    """Load `train.csv` / `test.csv` and turn them into float32 tensors.

    The first column of both files is treated as a row id and dropped; the last
    column of `train.csv` is treated as the label.

    Args:
        parent_path: Directory containing `train.csv` and `test.csv`. A trailing
            path separator is optional.

    Returns:
        A tuple `(train_features, train_labels, test_features)`:
        `train_labels` has shape `(num_train, 1)`; both feature tensors share
        the same column layout (standardized numeric columns followed by
        one-hot indicator columns).
    """
    train_data = pd.read_csv(os.path.join(parent_path, "train.csv"))
    # Drop the leading id column of the test set.
    test_features = pd.read_csv(os.path.join(parent_path, "test.csv")).iloc[:, 1:]

    train_label_tensor = torch.tensor(
        train_data.iloc[:, -1].to_numpy().reshape(-1, 1), dtype=torch.float32
    )
    # Drop the id column (first) and the label column (last).
    train_features = train_data.iloc[:, 1:-1]
    num_train = len(train_features)

    # Concatenation is necessary so that get_dummies() produces the same
    # indicator columns for both the training and the test rows.
    # NOTE(review): the standardization statistics below are therefore computed
    # over train + test together (the original author's open question) - confirm
    # this is acceptable for the intended evaluation.
    all_features = pd.concat((train_features, test_features), ignore_index=True)

    # Standardize numeric columns to zero mean / unit variance; missing values
    # can then be filled with 0, i.e. with the column mean.
    numeric_cols = all_features.select_dtypes(include="number").columns
    numeric = all_features[numeric_cols]
    all_features[numeric_cols] = ((numeric - numeric.mean()) / numeric.std()).fillna(0)

    # One-hot encode the remaining (categorical) columns, with an extra
    # indicator for missing values. Requesting a float dtype matters: recent
    # pandas versions default to bool indicators, which would turn the mixed
    # frame into an object array that torch.tensor() cannot convert.
    all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
    feature_array = all_features.to_numpy(dtype="float32")

    return (
        torch.tensor(feature_array[:num_train], dtype=torch.float32),
        train_label_tensor,
        torch.tensor(feature_array[num_train:], dtype=torch.float32),
    )


In [None]:
def train(
    net: nn.Sequential,
    optimizer: torch.optim.Optimizer,
    num_epochs: int,
    X_train: torch.Tensor,
    y_train: torch.Tensor,
    batch_size: int,
):
    """Fit `net` in place by minimizing the module-level `loss`.

    Args:
        net: Model to optimize; it is switched to training mode first.
        optimizer: Optimizer already bound to the parameters of `net`.
        num_epochs: Number of full passes over the training data.
        X_train: Training features.
        y_train: Training labels.
        batch_size: Batch size forwarded to `load_data_arrays`.

    Returns:
        None. The parameters of `net` are updated as a side effect.
    """
    net.train()
    for _epoch in range(num_epochs):
        # `load_data_arrays` comes from the project's `common` module; it is
        # iterated here as a source of (features, labels) pairs.
        batches = load_data_arrays((X_train, y_train), batch_size)
        for features, targets in batches:
            optimizer.zero_grad()
            batch_loss = loss(net(features), targets)
            batch_loss.backward()
            optimizer.step()


In [None]:
def get_k_fold_data(k: int, i: int, X: torch.Tensor, y: torch.Tensor):
    """Split `(X, y)` into the training and validation parts of fold `i`.

    The rows are cut into `k` contiguous folds. When the number of rows is not
    divisible by `k`, the first `rows % k` folds receive one extra row, so every
    row belongs to exactly one fold and no data is dropped. For row counts that
    are divisible by `k` the folds are `rows // k` rows each.

    Args:
        k: Number of folds; must be at least 2 so the training part is non-empty.
        i: Zero-based index of the fold used for validation, `0 <= i < k`.
        X: Features, indexed by row along dimension 0.
        y: Labels, indexed by row along dimension 0.

    Returns:
        `(X_train, y_train, X_valid, y_valid)`.

    Raises:
        ValueError: If `k < 2` or `i` is outside `[0, k)`.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")
    if not 0 <= i < k:
        raise ValueError(f"fold index i must satisfy 0 <= i < {k}, got {i}")

    base_size, remainder = divmod(X.shape[0], k)
    # Folds 0..remainder-1 are one row longer than the rest.
    start = i * base_size + min(i, remainder)
    stop = start + base_size + (1 if i < remainder else 0)

    X_valid, y_valid = X[start:stop], y[start:stop]
    # Everything before and after the validation slice forms the training set.
    X_train = torch.cat([X[:start], X[stop:]], 0)
    y_train = torch.cat([y[:start], y[stop:]], 0)

    return X_train, y_train, X_valid, y_valid


In [None]:
def log_rmse(y_hat: torch.Tensor, y: torch.Tensor):
    """Return the root-mean-squared error between `log(y_hat)` and `log(y)`.

    The mean squared error is computed directly instead of through the
    module-level training loss, so the metric stays a log-RMSE regardless of
    which objective is used for training.

    Args:
        y_hat: Predictions. Values below 1 are clamped to 1 before the
            logarithm for numerical stability.
        y: Targets. They are not clamped, so they must be positive for a
            finite result.

    Returns:
        The metric as a Python float.
    """
    # An (n, 1) prediction against an (n,) target would silently broadcast to
    # (n, n) and give a wrong value; align the shapes when the sizes match.
    if y_hat.shape != y.shape and y_hat.numel() == y.numel():
        y_hat = y_hat.reshape(y.shape)

    log_pred = torch.log(torch.clamp(y_hat, min=1))  # lower bound for stability
    log_true = torch.log(y)
    return torch.sqrt(torch.mean((log_pred - log_true) ** 2)).item()


In [None]:
def k_fold(
    k: int,
    num_epochs: int,
    X: torch.Tensor,
    y: torch.Tensor,
    batch_size: int,
    learning_rate: float,
    wd: float,
):
    """Run k-fold cross-validation and report the log-RMSE of every fold.

    For each fold a fresh network from `get_net` is trained with Adam on the
    training part returned by `get_k_fold_data`, then scored with `log_rmse`
    on both the training and the validation part.

    Args:
        k: Number of folds.
        num_epochs: Training epochs per fold.
        X: Features; `X.shape[1]` is used as the network input size.
        y: Labels.
        batch_size: Mini-batch size passed to `train`.
        learning_rate: Adam learning rate.
        wd: Adam weight decay.

    Returns:
        `(avg_train_log_rmse, avg_valid_log_rmse)` averaged over the `k` folds.
        The per-fold and average values are also printed.
    """
    train_loss, valid_loss = [], []
    for i in range(k):
        X_train, y_train, X_valid, y_valid = get_k_fold_data(k, i, X, y)
        # A new model and optimizer per fold, so folds do not share parameters.
        net = get_net(X.shape[1])
        optimizer = torch.optim.Adam(
            net.parameters(), lr=learning_rate, weight_decay=wd
        )
        train(net, optimizer, num_epochs, X_train, y_train, batch_size)

        # Score in eval mode and without building autograd graphs.
        net.eval()
        with torch.no_grad():
            train_loss.append(log_rmse(net(X_train), y_train))
            valid_loss.append(log_rmse(net(X_valid), y_valid))

        print(
            f"fold {i + 1}: train log rmse {train_loss[-1]:.8f}, "
            f"valid log rmse {valid_loss[-1]:.8f}"
        )

    avg_train_loss = sum(train_loss) / k
    avg_valid_loss = sum(valid_loss) / k
    # Same ", " separator as the per-fold line above.
    print(
        f"avg: train log rmse {avg_train_loss:.8f}, "
        f"valid log rmse {avg_valid_loss:.8f}"
    )
    return avg_train_loss, avg_valid_loss


In [None]:
def get_net(num_in: int):
    """Create the regression model: one linear layer with a single output.

    Args:
        num_in: Number of input features.

    Returns:
        An `nn.Sequential` wrapping `nn.Linear(num_in, 1)`.
    """
    linear_layer = nn.Linear(num_in, 1)
    return nn.Sequential(linear_layer)


In [None]:
# Build the model inputs once: training features, training labels and test
# features, as returned by data_preprocess() for the configured data directory.
X_train, y_train, X_test = data_preprocess(parent_path=path)


In [None]:
# 5-fold cross-validation of the linear model on the training data.
# NOTE(review): learning_rate=100 is unusually large for Adam - confirm it is
# intentional.
# The trailing semicolon keeps the cell from echoing any return value.
k_fold(
    k=5,
    num_epochs=100,
    X=X_train,
    y=y_train,
    batch_size=256,
    learning_rate=100,
    wd=0,
);
