## Download data
Competition: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

Data description: https://www.fit.vutbr.cz/~ikiss/bissit22/nn/data_description.txt

In [None]:
import os

# Download the dataset only when "data.csv" is not already present in the
# working directory, so re-running this cell does not fetch the file again.
if not os.path.isfile("data.csv"):
    # `!` is the IPython shell escape: the rest of the line runs as a shell command.
    !wget "https://www.fit.vutbr.cz/~ikiss/bissit22/nn/data.csv"

--2022-07-14 14:42:39--  https://www.fit.vutbr.cz/~ikiss/bissit22/nn/data.csv
Resolving www.fit.vutbr.cz (www.fit.vutbr.cz)... 147.229.9.23, 2001:67c:1220:809::93e5:917
Connecting to www.fit.vutbr.cz (www.fit.vutbr.cz)|147.229.9.23|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460676 (450K) [text/csv]
Saving to: ‘data.csv’


2022-07-14 14:42:44 (159 KB/s) - ‘data.csv’ saved [460676/460676]



## Parse the data

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MinMaxScaler, KBinsDiscretizer, FunctionTransformer

# Read the data from CSV file
data = pd.read_csv("data.csv")
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrap looks
# redundant -- confirm whether `data` and `df` are both needed elsewhere.
df = pd.DataFrame(data)

def quality_to_number(quality, max_quality=10):
  """Scale a quality rating to a fraction of the highest possible rating.

  Args:
    quality: Rating to convert; a scalar or anything supporting division
      (e.g. a NumPy array). With the default scale the ratings are
      10, 9, 8, ..., 1.
    max_quality: Highest rating on the scale. Defaults to 10, which keeps
      the original behaviour of dividing by 10.

  Returns:
    quality / max_quality, i.e. a value in (0, 1] for ratings between 1 and
    max_quality.
  """
  return quality / max_quality

# Keep the sale price scaler in a variable: the same fitted scaler is needed
# later to map scaled values back to prices with inverse_transform
sale_price_scaler = MinMaxScaler(feature_range=(-1,1))

# TODO: Year?

# Build one 2-D feature block per source column
bedrooms = df["BedroomAbvGr"].to_numpy().reshape(-1, 1)  # used as-is, as a single column

house_style = LabelBinarizer().fit_transform(df["HouseStyle"])  # label-binarized categories

central_air = LabelBinarizer().fit_transform(df["CentralAir"])  # binary data

# Discretize the lot area into 10 equal-width bins, one-hot encoded as a dense array
area_binner = KBinsDiscretizer(n_bins=10, encode="onehot-dense", strategy="uniform")
area = area_binner.fit_transform(df["LotArea"].to_numpy().reshape(-1, 1))

# Apply quality_to_number to the overall quality column
quality_transformer = FunctionTransformer(func=quality_to_number)
overall_quality = quality_transformer.fit_transform(df["OverallQual"].to_numpy().reshape(-1, 1))

# Scale the sale price to range [-1;1]
sale_price = sale_price_scaler.fit_transform(df["SalePrice"].to_numpy().reshape(-1, 1))

# Place all feature blocks side by side in a single input matrix;
# the scaled sale price is the regression target
feature_blocks = (bedrooms, house_style, central_air, area, overall_quality)
inputs = np.concatenate(feature_blocks, axis=1)
targets = sale_price

## Create datasets

In [None]:
import torch

# Initialize the seeds with a constant to obtain the same result for every run.
# NumPy's generator drives the shuffle of the indices below; PyTorch's
# generator drives layer weight initialization and DataLoader shuffling,
# so it has to be seeded as well for runs to be repeatable.
np.random.seed(42)
torch.manual_seed(42)

# Randomly shuffle indices
indices = np.arange(inputs.shape[0])
np.random.shuffle(indices)

# Select train and test indices: the first 1200 shuffled rows are used for
# training, whatever remains is used for testing
train_indices = indices[:1200]
test_indices = indices[1200:]

# Split inputs and targets into train and test subsets (converted to float32 tensors)
train_inputs = torch.from_numpy(inputs[train_indices]).float()
train_targets = torch.from_numpy(targets[train_indices]).float()
test_inputs = torch.from_numpy(inputs[test_indices]).float()
test_targets = torch.from_numpy(targets[test_indices]).float()

# Create datasets pairing each input row with its target
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_targets)
test_dataset = torch.utils.data.TensorDataset(test_inputs, test_targets)

## Define network

In [None]:
# Define network as a sequence of layers
import torch
def create_network(input_size=21, hidden_size=128):
  """Build the regression network: one hidden ReLU layer and one linear output.

  Args:
    input_size: Number of input features. Defaults to 21; presumably the
      number of columns of the preprocessed input matrix -- verify against
      the preprocessing cell if the features change.
    hidden_size: Number of units in the hidden layer. Defaults to 128.

  Returns:
    A newly initialized torch.nn.Sequential producing one output value per
    input row.
  """
  network = torch.nn.Sequential(
      torch.nn.Linear(input_size, hidden_size),
      torch.nn.ReLU(),
      torch.nn.Linear(hidden_size, 1),
  )

  return network

# Create new model and print its structure.
# `model` is the instance whose parameters the optimizer is built on and
# which the training loop updates.
model = create_network()
print(model)

Sequential(
  (0): Linear(in_features=21, out_features=128, bias=True)
  (1): ReLU()
  (2): Linear(in_features=128, out_features=1, bias=True)
)


## Save and load model

In [None]:
import datetime


def make_dirs_if_not_exist(path):
  """Create the directory `path`, including missing parents, if it is absent.

  Calling it for a directory that already exists is a no-op. exist_ok=True
  replaces a separate existence check, so there is no window in which the
  directory can appear between the check and the creation.

  Args:
    path: Directory path to create.

  Raises:
    FileExistsError: If `path` exists but is not a directory.
  """
  os.makedirs(path, exist_ok=True)


def save_model(model, path):
  """Write the model's state_dict (its parameters, not the module object) to `path`."""
  weights = model.state_dict()
  torch.save(weights, path)


def save_model_timestamp(model, folder="checkpoints"):
  """Save the model into `folder` as "model_<timestamp>.pt".

  The folder is created when missing. The timestamp is the current local
  time formatted as year-month-day_hour-minute-second.
  """
  make_dirs_if_not_exist(folder)

  now = datetime.datetime.now()
  stamp = now.strftime("%Y-%m-%d_%H-%M-%S")
  target_path = os.path.join(folder, f"model_{stamp}.pt")

  save_model(model, target_path)


def load_model(path):
  """Create a fresh network and fill it with the weights stored at `path`.

  Args:
    path: File holding a state_dict, as written by save_model.

  Returns:
    A new network from create_network() with the stored parameters loaded.
  """
  model = create_network()
  # map_location="cpu" lets a checkpoint that was saved from GPU tensors be
  # read on a machine without CUDA; create_network() builds the model on CPU.
  # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
  state_dict = torch.load(path, map_location="cpu")
  model.load_state_dict(state_dict)
  return model


def load_last_model(folder="checkpoints"):
  """Load the most recent "model_*.pt" checkpoint found in `folder`.

  Args:
    folder: Directory to search for checkpoint files.

  Returns:
    The loaded model, or None when the folder does not exist or contains
    no file named "model_*.pt".
  """
  if not os.path.isdir(folder):
    return None

  models = [file for file in os.listdir(folder) if file.startswith("model_") and file.endswith(".pt")]
  # An existing but empty folder must behave like a missing one instead of
  # failing with IndexError on the lookup of the last element.
  if not models:
    return None

  # The largest name in lexicographic order is taken as the last model; for
  # names carrying a zero-padded, year-first timestamp that is the newest one.
  last_model = max(models)
  return load_model(os.path.join(folder, last_model))

## Training setup

In [None]:
# Training hyperparameters
batch_size = 64     # number of samples in one training batch
view_step = 1000    # statistics are reported every `view_step` iterations
iterations = 10000  # training stops once this many iterations have run

# Options shared by the training and the testing DataLoader
loader_options = dict(shuffle=True, pin_memory=True, num_workers=0)

# Training batches use `batch_size`; testing batches hold 4 samples each
training_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    **loader_options,
)
testing_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=4,
    **loader_options,
)

# Loss function: reduction="none" returns unreduced losses, so the code that
# calls the criterion decides how to reduce them
criterion = torch.nn.MSELoss(reduction="none")

# Optimizer: Adam over all parameters of the model
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

## Training step

In [None]:
def training_step(model, input_data, target_labels, criterion, optimizer):
  """Run one optimization step on a single batch.

  Args:
    model: Network to train.
    input_data: Batch of inputs fed to the model.
    target_labels: Targets the outputs are compared against.
    criterion: Loss function; its result is averaged into a scalar.
    optimizer: Optimizer holding the model parameters.

  Returns:
    A tuple (loss, outputs): the loss as a plain Python number and the model
    outputs detached from the computational graph.
  """
  # Drop gradients left over from the previous step
  optimizer.zero_grad()

  # Forward pass - compute network output and store all activations
  predictions = model(input_data)

  # Reduce whatever the criterion returns to one scalar by averaging
  batch_loss = criterion(predictions, target_labels).mean()

  # Backward pass - compute gradients, then update the parameters
  batch_loss.backward()
  optimizer.step()

  # .item() and .detach() disconnect the results from the computational graph
  return batch_loss.item(), predictions.detach()

## Testing the model

In [None]:
def test(model, data_loader, criterion):
  """Evaluate the model on every batch of `data_loader`.

  The model is put into evaluation mode for the duration of the call and is
  switched back to the mode it was in before. No gradients are tracked.

  Args:
    model: Network to evaluate.
    data_loader: Iterable yielding (batch_data, batch_labels) pairs.
    criterion: Loss function; its result is summed per batch, so it is
      assumed to return unreduced (per-element) losses -- TODO confirm if a
      different criterion is passed.

  Returns:
    A tuple (loss, price_diff): the accumulated loss divided by the number
    of samples, and the accumulated absolute difference between predicted
    and target prices (after sale_price_scaler.inverse_transform) divided by
    the number of samples.

  Raises:
    ZeroDivisionError: If `data_loader` yields no samples.
  """
  # Remember the current mode so it can be restored afterwards
  was_training = model.training
  model.eval()

  # Accumulators
  loss_acc = 0
  price_diff_acc = 0
  counter = 0

  try:
    # Evaluation needs no gradients; skipping the graph saves memory and time
    with torch.no_grad():
      # Loop through dataset
      for batch_data, batch_labels in data_loader:
        # Calculate output
        output = model(batch_data)

        # Accumulate loss value
        loss_acc += criterion(output, batch_labels).sum().item()

        # Accumulate diff between target and predicted prices
        target_price = sale_price_scaler.inverse_transform(batch_labels.detach().numpy())
        predicted_price = sale_price_scaler.inverse_transform(output.detach().numpy())
        price_diff_acc += np.abs(predicted_price - target_price).sum()

        # Accumulate batch size, the last batch might not be the same size as
        # the others therefore we accumulate real size of the data
        counter += batch_data.shape[0]
  finally:
    # Restore the previous mode even if evaluation failed
    model.train(was_training)

  return loss_acc / counter, price_diff_acc / counter

## Training

In [None]:
# Main training loop: runs training steps until `iterations` steps are done,
# reports statistics every `view_step` steps and saves the model at the end.

# Accumulators (reset after every report)
train_loss_acc = 0
train_price_diff_acc = 0
iteration = 0
stop_training = False

# When we reach the end of the dataset, but do not want to end the training, we loop through it again
while not stop_training:
  # Obtain batch from the training dataset in a loop
  for batch_data, batch_labels in training_loader:
    iteration += 1

    # Do a training step; `loss` is a plain number, `outputs` are detached predictions
    loss, outputs = training_step(model, batch_data, batch_labels, criterion, optimizer)
    
    # Accumulate loss for statistics
    train_loss_acc += loss

    # Calculate diff between target and predicted prices.
    # inverse_transform undoes the sale price scaling, so the difference is
    # expressed on the original price scale; one mean value is added per batch.
    target_price = sale_price_scaler.inverse_transform(batch_labels.detach().numpy())
    predicted_price = sale_price_scaler.inverse_transform(outputs.detach().numpy())
    train_price_diff_acc += np.abs(predicted_price - target_price).mean()

    # Test model every `view_step` iterations
    if iteration % view_step == 0:
      # Calculate loss and accuracy on the testing dataset
      test_loss_acc, test_price_avg_diff = test(model, testing_loader, criterion)

      # Training statistics are averages over the last `view_step` iterations;
      # testing statistics are printed as returned by test()
      print(f"iteration:{iteration} "
            f"train_loss:{train_loss_acc/view_step:.3f} "
            f"test_loss:{test_loss_acc:.3f} "
            f"train_price_avg_diff:{train_price_diff_acc/view_step:.3f} "
            f"test_price_avg_diff:{test_price_avg_diff:.3f}")
      
      # Reset the accumulators
      train_loss_acc = 0
      train_price_diff_acc = 0

    # Stop training when amount of iterations is reached:
    # the flag ends the outer while loop, the break ends the current pass over the data
    if iteration >= iterations:
      stop_training = True
      break

print("Training finished.")
# Store the trained weights via save_model_timestamp (default folder)
save_model_timestamp(model)

iteration:1000 train_loss:0.022 test_loss:0.015 train_price_avg_diff:35371.872 test_price_avg_diff:32950.924
iteration:2000 train_loss:0.014 test_loss:0.014 train_price_avg_diff:29319.927 test_price_avg_diff:30586.674
iteration:3000 train_loss:0.013 test_loss:0.014 train_price_avg_diff:28044.886 test_price_avg_diff:29877.316
iteration:4000 train_loss:0.012 test_loss:0.014 train_price_avg_diff:27499.168 test_price_avg_diff:29324.996
iteration:5000 train_loss:0.012 test_loss:0.014 train_price_avg_diff:26893.395 test_price_avg_diff:30534.263
iteration:6000 train_loss:0.012 test_loss:0.014 train_price_avg_diff:26631.117 test_price_avg_diff:29851.211
iteration:7000 train_loss:0.011 test_loss:0.016 train_price_avg_diff:26462.967 test_price_avg_diff:33430.770
iteration:8000 train_loss:0.011 test_loss:0.015 train_price_avg_diff:26307.376 test_price_avg_diff:31603.045
iteration:9000 train_loss:0.011 test_loss:0.015 train_price_avg_diff:26353.930 test_price_avg_diff:30495.985
iteration:10000 tra