In [27]:
import pathlib
import shutil
import tempfile

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import regularizers

import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

# TensorBoard event files go into a throw-away temp directory created for this
# session; the "tensorboard_logs" sub-folder is wiped so every run starts clean.
logdir = pathlib.Path(tempfile.mkdtemp()).joinpath("tensorboard_logs")
shutil.rmtree(logdir, ignore_errors=True)  # no-op when the folder does not exist


In [20]:
# Load the preprocessed, ML-ready splits. Every CSV is indexed by the "Id" column.
def _read_split(csv_path):
  """Read one preprocessed CSV, using its "Id" column as the index."""
  return pd.read_csv(csv_path, index_col="Id")

# Features for the train / validation / test splits.
X_train = _read_split("data/X_train.csv")
X_valid = _read_split("data/X_valid.csv")
X_test = _read_split("data/X_test.csv")

# Targets exist only for the train and validation splits.
y_train = _read_split("data/y_train.csv")
y_valid = _read_split("data/y_valid.csv")


In [21]:
# Replace "SalePrice" with log(SalePrice + 1), since the log price is what
# matters in the competition.
# NOTE: this overwrites the column in place, so re-running the cell applies the
# log a second time -- reload the CSVs first if it has to be re-run.
for target_frame in (y_train, y_valid):
  target_frame["SalePrice"] = np.log(target_frame["SalePrice"] + 1.)

y_valid.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
493,12.059809
66,12.66666
890,11.915058
1174,12.208575
1207,11.580593


In [22]:
# Wrap the (features, target) arrays in tf.data datasets; each dataset element
# is one row of X paired with the matching row of y.
train_arrays = (X_train.values, y_train.values)
valid_arrays = (X_valid.values, y_valid.values)

train_dataset = tf.data.Dataset.from_tensor_slices(train_arrays)
valid_dataset = tf.data.Dataset.from_tensor_slices(valid_arrays)

In [13]:
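# Debug peek at the first few (features, target) pairs; left commented out.
# NOTE: the output shown below is stale -- its targets (e.g. 72500) are raw sale
# prices, not the log values that the log-transform cell above now produces.
# for feat, targ in train_dataset.take(5):
#   print ('Features: {}, Target: {}'.format(feat, targ))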

Features: [ 3.          3.          0.          0.          0.          3.
  0.          1.          2.          0.         77.          6.
  2.         -0.8149287  -1.04839648 -1.04746864 -0.9433882  -1.03410302
 -0.58765809 -0.79879899 -5.60605783 -1.31986297 -0.87635588 -0.10290161
 -0.96438507 -1.07476831 -1.77713056  0.          0.          0.
  1.          0.        ], Target: [72500]
Features: [ 4.00000000e+00  3.00000000e+00  3.00000000e+00  2.00000000e+00
  1.00000000e+00  3.00000000e+00  4.00000000e+00  1.00000000e+00
  2.00000000e+00  0.00000000e+00  6.60000000e+01  4.00000000e+00
  3.00000000e+00  1.13156766e+00 -1.04839648e+00 -1.04746864e+00
  6.26237749e-01 -1.03410302e+00 -1.57844063e-02 -7.98798988e-01
 -1.96675345e-02 -9.31739465e-01 -8.76355884e-01 -1.00505618e-01
 -9.64385071e-01 -1.07476831e+00 -2.81848342e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00], Target: [100000]
Features: [ 10.           4.           5.           3.   

In [23]:
# Dataset dimensions are read from the loaded training frame instead of being
# hard-coded, so they cannot drift out of sync with the CSV files.
N_TRAIN, FEATURES = X_train.shape  # (number of rows, number of feature columns)

BUFFER_SIZE = int(1e4)  # shuffle buffer size used when batching the training set
BATCH_SIZE = 200

# Whole batches per pass over the training data. Clamped to at least 1 so a
# training set smaller than one batch does not yield 0 steps (which would also
# make the learning-rate schedule's decay_steps 0).
STEPS_PER_EPOCH = max(1, N_TRAIN // BATCH_SIZE)

# enable caching
# Dataset.cache() returns a new dataset, hence the re-assignment.
train_dataset = train_dataset.cache()
valid_dataset = valid_dataset.cache()

In [24]:
# set Batch size
# tf.data transformations never modify a dataset in place: batch(), shuffle()
# and repeat() each return a NEW dataset. The results therefore have to be
# assigned back, otherwise both datasets silently stay unbatched (and the
# training set unshuffled and non-repeating).
# NOTE: re-running this cell batches the already-batched datasets again;
# rebuild them from the dataset-creation cell before re-running.
valid_dataset = valid_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.shuffle(BUFFER_SIZE).repeat().batch(BATCH_SIZE)

# Display the final training pipeline as the cell output.
train_dataset

<BatchDataset shapes: ((None, 32), (None, 1)), types: (tf.float64, tf.float64)>

In [25]:
# training procedure

# Learning rate that decays hyperbolically with the optimizer step, starting
# from 0.001. With decay_rate=1 the rate is
#   0.001 / (1 + step / decay_steps),
# i.e. it has halved after STEPS_PER_EPOCH*1000 steps (1000 epochs).
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
  initial_learning_rate=0.001,
  decay_steps=STEPS_PER_EPOCH*1000,
  decay_rate=1,
  staircase=False,  # decay continuously rather than in discrete jumps
)

def get_optimizer():
  """Return a new Adam optimizer whose learning rate follows `lr_schedule`."""
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
  return optimizer

In [28]:
def get_callbacks(name, monitor='val_loss'):
  """Build the list of Keras callbacks for one training run.

  Args:
    name: Run name; TensorBoard logs are written to `logdir/name`.
    monitor: Quantity watched by early stopping. Defaults to 'val_loss',
      which Keras reports whenever validation data is passed to `fit`.

  Returns:
    A list holding the tfdocs `EpochDots` progress callback, an
    `EarlyStopping` callback with a patience of 200 epochs, and a
    `TensorBoard` logging callback.
  """
  return [
    tfdocs.modeling.EpochDots(),
    # The target here is a continuous log price, so the previously monitored
    # 'val_binary_crossentropy' (a classification metric) is not a meaningful
    # stopping signal; if it is not among the compiled metrics, EarlyStopping
    # only warns and never stops. The validation loss is used instead.
    tf.keras.callbacks.EarlyStopping(monitor=monitor, patience=200),
    tf.keras.callbacks.TensorBoard(logdir/name),
  ]