# <center> Keras Model on a Regression Problem: House Prices (Log-Transformed Target) </center>

## Load the Data

In [None]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler # Used for scaling of data
import matplotlib.pyplot as plt

In [None]:
# Load the training data, using the first CSV column as the row index.
trainfull = pd.read_csv(
    'trainfull.csv',
    index_col=0,
)

Choose only the numerical features

In [None]:
# Drop every object-dtype column so that only non-object features remain.
trainfull = trainfull.select_dtypes(exclude='object')

In [None]:
# Display the filtered DataFrame as the cell output.
trainfull

## Split Data

Use an 80/20 train/validation split.

In [None]:
import numpy.random as random

# Seed NumPy's global random generator so that later draws from it are repeatable.
SEED = 42
random.seed(SEED)

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for validation.
VAL_SIZE = 0.2
# Pass the seed explicitly: relying only on the global NumPy seed makes the
# split depend on how many random draws happened before this cell ran, so
# re-running the cell on its own would produce a different partition.
train, val = train_test_split(trainfull, test_size=VAL_SIZE, random_state=SEED)

Fill the missing values

In [None]:
# Replace every missing value with 0 in both the training and validation split.
train, val = (frame.fillna(0) for frame in (train, val))

Remove the outliers (currently disabled: the IsolationForest code below is commented out)

In [None]:
#from sklearn.ensemble import IsolationForest

#clf = IsolationForest(max_samples = 100, random_state = 42)
#clf.fit(train)
#y_noano = clf.predict(train)
#y_noano = pd.DataFrame(y_noano, columns = ['Top'])
#y_noano[y_noano['Top'] == 1].index.values

#train = train.iloc[y_noano[y_noano['Top'] == 1].index.values]
#train.reset_index(drop = True, inplace = True)
#print("Number of Outliers:", y_noano[y_noano['Top'] == -1].shape[0])
#print("Number of rows without outliers:", train.shape[0])

Define the inputs and the output

In [None]:
# Start from every column of the training frame and drop the ones that must
# not be fed to the model.
col_train = train.columns.tolist()
col_train.remove('SalePrice')   # the regression target is not an input
col_train.remove('MSSubClass')  # NOTE(review): presumably a categorical code stored as a number - confirm
Features = col_train

# Both splits use exactly the same input columns, in the same order.
X_train, X_val = train[Features], val[Features]

In [None]:
# Targets as float arrays; the double brackets select a one-column frame, so
# each array is 2-D with a single column.
y_train, y_val = (
    split[['SalePrice']].values.astype(float) for split in (train, val)
)

Log and Standardise the data

In [None]:
from numpy import log

# Natural logarithm of the sale prices for both splits.
y_train_log, y_val_log = log(y_train), log(y_val)

In [None]:
# Learn the per-feature mean and standard deviation from the training inputs,
# then standardise the training inputs with them.
scale = StandardScaler().fit(X_train)
X_train = scale.transform(X_train)

In [None]:
# Standardise the validation inputs with the statistics learned from the
# training inputs. Fitting a separate scaler on the validation set would put
# its features on a different scale from the one the model is trained on and
# would leak validation statistics into the evaluation.
X_val = scale.transform(X_val)

In [None]:
# Standardise the log-targets; the scaler is fitted on the training targets
# only and then applied unchanged to both splits.
scale_y_log = StandardScaler().fit(y_train_log)
y_train_log_scaled, y_val_log_scaled = (
    scale_y_log.transform(target) for target in (y_train_log, y_val_log)
)

# Create, compile and Fit the Model

Add a regularizer to avoid overfitting.  
Best parameters according to the deepnet on BigML.

In [None]:
#pip install keras

In [None]:
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.optimizers import Adam
from keras.layers.advanced_activations import LeakyReLU
from keras import regularizers
from keras.regularizers import l2

seed = 7
# Seeds NumPy's global generator.
# NOTE(review): confirm whether the Keras backend's weight initialisation is
# also covered by this seed; if it is not, runs will not be exactly repeatable.
np.random.seed(seed)

# Model: one wide hidden ReLU layer followed by a single linear output unit,
# each with an L2 penalty (0.01) on its kernel weights.
model = Sequential()
model.add(
    Dense(
        1168,
        input_dim=X_train.shape[1],
        kernel_initializer='normal',
        activation='relu',
        kernel_regularizer=regularizers.l2(0.01),
    )
)
# Alternative hidden layers, currently disabled:
#model.add(Dense(128, kernel_initializer='normal', activation='relu', activity_regularizer=l1(0.01)))
#model.add(Dense(64, kernel_initializer='normal', activation='relu', activity_regularizer=l1(0.01)))
model.add(
    Dense(
        1,
        kernel_initializer='normal',
        kernel_regularizer=regularizers.l2(0.01),
    )
)

# Compile model. `learning_rate` is the supported keyword; the old `lr` alias
# is deprecated and is rejected by newer Keras optimizers.
model.compile(loss='mean_absolute_error', optimizer=Adam(learning_rate=0.005))

# Train on the standardised log-targets and track the validation loss each epoch.
# NOTE(review): batch_size is set to the number of input features
# (X_train.shape[1]) - confirm this is intentional.
history = model.fit(
    X_train,
    y_train_log_scaled,
    validation_data=(X_val, y_val_log_scaled),
    epochs=200,
    batch_size=X_train.shape[1],
)

In [None]:
# Loss of the fitted model on the training set (inputs and targets in the
# same scaled space used for fitting).
model.evaluate(x=X_train, y=y_train_log_scaled)

In [None]:
# Print the model's layers and parameter counts.
model.summary()

## Learning Curve

In [None]:
# One curve per quantity recorded in the training history, plotted against
# the epoch index.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.gca().set(
    title='Model performance throughout training',
    xlabel='epoch',
    ylabel='Loss',
)
plt.grid(True)
plt.show()

## Predictions on val

In [None]:
# Model output for the validation inputs; still in the standardised
# log-target space at this point.
y_val_predict = model.predict(x=X_val)
y_val_predict

In [None]:
# Undo the target standardisation: the result is the predicted log-price.
y_val_predict_log = scale_y_log.inverse_transform(X=y_val_predict)

## Compute MAE, RMSE, R²

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# RMSE between the true and predicted log-prices on the validation set.
root_mean_squared_error_log = sqrt(
    mean_squared_error(y_true=y_val_log, y_pred=y_val_predict_log)
)
print(root_mean_squared_error_log)

In [None]:
from numpy import exp
# Exponentiate the predicted log-prices to get predictions in price units.
# This rebinds `y_val_predict`, which previously held the scaled model output.
y_val_predict = exp(y_val_predict_log)

In [None]:
from sklearn.metrics import r2_score

# R² of the validation predictions, computed on the original price scale.
r2_score(y_true=y_val, y_pred=y_val_predict)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the validation predictions, in price units.
mean_absolute_error(y_true=y_val, y_pred=y_val_predict)

In [None]:
# RMSE of the validation predictions, in price units.
root_mean_squared_error = sqrt(
    mean_squared_error(y_true=y_val, y_pred=y_val_predict)
)
print(root_mean_squared_error)

## Preparation and Prediction on the Test Set for Kaggle

In [None]:
# Load the Kaggle test set (first row is the header) and replace missing
# values with 0.
test = pd.read_csv('../HousePrices/test.csv', header=0).fillna(0)

In [None]:
# Keep the row identifiers for the submission file.
id_col = test['Id'].values.tolist()

# Standardise the test inputs with statistics learned from the TRAINING
# inputs. Fitting a scaler on the test set itself would place its features on
# a different scale from the one the model was trained on. The scaler is
# refitted here on `train[Features]` so this cell does not depend on which
# data the `scale` variable was last fitted on.
scale = StandardScaler().fit(train[Features])
X_test = test[Features]
X_test = scale.transform(X_test)

In [None]:
# Model output for the test inputs; still in the standardised log-target space.
prediction = model.predict(x=X_test)
prediction

In [None]:
# Undo the target standardisation to obtain predicted log-prices.
prediction_log = scale_y_log.inverse_transform(X=prediction)
prediction_log

In [None]:
# Exponentiate the predicted log-prices to get predictions in price units.
# This rebinds `prediction`, which previously held the scaled model output.
prediction = exp(prediction_log)
prediction

In [None]:
# Assemble the two-column submission table (Id, SalePrice) and write it to
# disk without the DataFrame index.
submission = pd.DataFrame({
    'Id': id_col,
    'SalePrice': prediction.ravel(),  # flatten the single-column prediction array
})
submission.to_csv('prediction_keras_allnum_bigml.csv', index=False)

In [None]:
# Display the submission table as the cell output.
submission

# Score Kaggle
<center> Optimizer Adam (lr=0.002): </center>  
Dense 64, 128, 64, 1 : 0.15348  
<center> Optimizer Adam (lr=0.005): </center>  
Dense 1168 : 0.14985
