### Training set preprocessing:
* take only numerical features
* replace all NaN values with 0

In [None]:
import pandas as pd

# Load the training table (first CSV column becomes the index) and replace
# every missing value with 0 in a single expression.
trainfull = pd.read_csv('train.csv', index_col=0).fillna(0)
trainfull

In [None]:
# Keep every column whose dtype is not `object` (i.e. drop the string columns).
data = trainfull.select_dtypes(exclude='object')

In [None]:
# Feature table: everything in `data` except the target column.
data1 = data.drop(columns='SalePrice')

### Data transformation:
* standardization of the inputs (zero mean, unit variance)
* log-transform of the target, followed by standardization

In [None]:
# Float arrays for the model. The target is selected with a list of columns so
# it stays two-dimensional (one column), matching the feature matrix layout.
X_train_full = data1.to_numpy(dtype=float)
y_train_full = data[['SalePrice']].to_numpy(dtype=float)

In [None]:
import numpy.random as np_random

# Single seed shared by the notebook; seeds NumPy's legacy global generator.
SEED = 42
np_random.seed(SEED)

In [None]:
from sklearn.model_selection import train_test_split

# Pass the seed explicitly so the split is identical every time this cell runs,
# instead of depending on how much of NumPy's global random state has already
# been consumed by earlier (or re-run) cells.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, random_state=SEED
)

Normalize inputs:

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the input scaler on the training split only; `fit` returns the scaler
# itself, so construction and fitting can be chained.
scaler_X = StandardScaler().fit(X_train)
scaler_X

In [None]:
# Apply the training-fitted scaler to both splits.
X_train_scaled, X_val_scaled = (
    scaler_X.transform(split) for split in (X_train, X_val)
)

Log-transform outputs:

In [None]:
from numpy import log

# Natural log of the targets for both splits.
y_train_log, y_val_log = log(y_train), log(y_val)

In [None]:
# Standardize the log-targets: statistics come from the training split
# (fit + transform in one call), then the same scaler is applied to validation.
scaler_Y_log = StandardScaler()
y_train_log_scaled = scaler_Y_log.fit_transform(y_train_log)
y_val_log_scaled = scaler_Y_log.transform(y_val_log)

In [None]:
# Display the standardized validation inputs as a quick visual check.
X_val_scaled

## Model the relationship between `x` and `y`

In [None]:
from tensorflow import random as tf_random
# Seed TensorFlow's global random generator with the notebook-wide SEED.
tf_random.set_seed(SEED)

### Define model:
* Sequential
* One hidden layer with n = n_input
* Regularizers l2

In [None]:
pip install keras

In [None]:
# Take every Keras symbol from the same package (tensorflow.keras) so layers,
# regularizers and models are guaranteed to come from one Keras implementation.
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

# Strength of the L2 weight penalty applied to the hidden layer.
L2_PENALTY = 0.01

# One input unit per feature column.
n_input = X_train.shape[1]

# Single hidden layer as wide as the input, followed by a linear output unit.
model = Sequential([
    # `shape` must be a tuple of per-sample dimensions, not a bare int.
    Input(shape=(n_input,)),
    Dense(
        n_input,
        kernel_initializer='normal',
        kernel_regularizer=l2(L2_PENALTY),
        activation='relu',
    ),
    Dense(1, kernel_initializer='normal'),
])

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

## Set up optimizer

Using Adam instead of SGD

In [None]:
from tensorflow.keras.optimizers import Adam

LEARNING_RATE = 0.001
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by current Keras optimizers.
model.compile(
    loss='mean_absolute_error',
    optimizer=Adam(learning_rate=LEARNING_RATE),
)

## Fit model

In [None]:
BATCH_SIZE = 10
EPOCHS = 200

# Train on the standardized inputs / standardized log-targets and track the
# same loss on the validation split after every epoch.
history = model.fit(
    X_train_scaled,
    y_train_log_scaled,
    validation_data=(X_val_scaled, y_val_log_scaled),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=2,
)

## Review learning curve

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
import pandas as pd

# One line per entry recorded in the training history, drawn on a fresh Axes.
ax = pd.DataFrame(history.history).plot()
ax.grid(True)
ax.set_title('Model performance throughout training')
ax.set_ylabel('Loss')
ax.set_xlabel('epoch')
# Replace the default column-name legend with shorter labels.
ax.legend(['train', 'val'], loc='upper right')
plt.show()

### Problem: overfitting

## Evaluate model

Predict on val set:

In [None]:
# Predict on the standardized validation inputs. The model was fit on
# standardized log-targets, so its outputs are on that same scale.
pred = model.predict(X_val_scaled)

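Wait... what are we predicting again? The model was trained on standardized log-prices, so its raw output is on that scale, not in dollars: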

In [None]:
# Alias that makes the scale of the raw predictions explicit in the name.
y_pred_log_scaled = pred

De-normalize predictions:

In [None]:
# Undo the target standardization using the scaler fitted on the training log-targets.
y_pred_log = scaler_Y_log.inverse_transform(y_pred_log_scaled)

Go back from log-dollars to dollars:

In [None]:
from numpy import exp
# Invert the log transform that was applied to the targets.
y_pred = exp(y_pred_log)

Compute performance metrics:

In [None]:
from sklearn import metrics
# R^2 on the original (untransformed) target scale.
print(metrics.r2_score(y_val, y_pred))

In [None]:
from sklearn import metrics
# R^2 on the log-transformed target scale.
print(metrics.r2_score(y_val_log, y_pred_log))

In [None]:
from math import sqrt
# Root mean squared error between the log targets and the log predictions.
sqrt(metrics.mean_squared_error(y_val_log, y_pred_log))