# <center> Keras model on a regression problem: House Prices _ 10 features </center>

## Import Data

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler # Used for scaling of data
from keras import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from keras import backend as K

In [None]:
# Read the training and validation CSV files.
# index_col=0 makes the first CSV column the row index of each frame.
train = pd.read_csv('../HousePrices/train.csv', index_col=0)
val = pd.read_csv('../HousePrices/val.csv', index_col=0)

## Remove outliers
Outliers
In this short section we flag the outliers of the training set with an IsolationForest (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html) and remove the flagged rows.

In [None]:
from sklearn.ensemble import IsolationForest

# Fit an isolation forest on the whole training frame and label every row:
# predict() gives +1 for inliers and -1 for outliers.
clf = IsolationForest(max_samples=100, random_state=42)
y_noano = pd.DataFrame(clf.fit(train).predict(train), columns=['Top'])

# Boolean masks over the row positions of `train`.
is_inlier = y_noano['Top'].eq(1)
n_outliers = int(y_noano['Top'].eq(-1).sum())

# Keep only the inlier rows and renumber them from 0.
train = train.iloc[is_inlier.to_numpy()].reset_index(drop=True)
print("Number of Outliers:", n_outliers)
print("Number of rows without outliers:", len(train))

## Verify correlation between features

In [None]:
# Pairwise correlation between the columns of the training frame.
corrmat = train.corr()

# Draw the matrix on an explicit 12x9 figure; the colour scale is capped at 0.8.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax)

Same heatmap, restricted to the 10 variables most correlated with `SalePrice`

In [None]:
import numpy as np

# Correlation matrix restricted to the k columns with the largest correlation
# to SalePrice (taken from the full matrix `corrmat`).
k = 10  # number of variables shown in the heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
# rowvar=False: each column of the array is one variable.
cm = np.corrcoef(train[cols].values, rowvar=False)

sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

- GarageCars and GarageArea are two important features, but we drop GarageArea since it carries more or less the same information as GarageCars
- TotalBsmtSF and 1stFlrSF are also more or less the same, so we drop 1stFlrSF
- TotRmsAbvGrd and GrLivArea are also strongly correlated, so let's drop TotRmsAbvGrd

Add a `Surface` feature: the sum of the first-floor, second-floor and basement areas

In [None]:
# Surface = first floor + second floor + basement area, computed row by row.
train['Surface'] = train['1stFlrSF'].add(train['2ndFlrSF']).add(train['TotalBsmtSF'])

In [None]:
# Same Surface feature for the validation frame: first floor + second floor + basement.
val['Surface'] = val['1stFlrSF'].add(val['2ndFlrSF']).add(val['TotalBsmtSF'])

## Standardise the data

In [None]:
# Select the model inputs, learn their per-column mean/std on the training
# data, and standardise them. `scale` keeps the fitted statistics.
train_features = train[['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'LotArea','Surface', 'BsmtUnfSF']]
scale = StandardScaler().fit(train_features)
X_train = scale.transform(train_features)

In [None]:
# Standardise the validation features with the scaler that was fitted on the
# training data (`scale`). Fitting a fresh scaler on the validation set would
# centre and scale it with different statistics than the ones the model is
# trained on, so the same raw value would map to different inputs in the two
# sets and the validation loss would not reflect the real model quality.
X_val = val[['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'LotArea','Surface', 'BsmtUnfSF']]
X_val = scale.transform(X_val)

Define the output

In [None]:
# The regression target is the 'SalePrice' column of each frame, as an array.
y_train, y_val = (frame['SalePrice'].values for frame in (train, val))

## Define, compile and fit the Model

In [None]:
import keras
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.optimizers import SGD, Adadelta, Adam

# Seed both NumPy and TensorFlow. Keras draws the initial weights and the
# per-epoch shuffling from TensorFlow's generator, so seeding NumPy alone does
# not make the training run repeatable.
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

# Model: fully connected regressor, four ReLU hidden layers narrowing from 200
# to 25 units, followed by a single linear output unit for the price.
model = Sequential()
model.add(Dense(200, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
for units in (100, 50, 25):
    model.add(Dense(units, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))

# Compile model: mean absolute error loss, Adadelta with its default settings.
model.compile(loss='mean_absolute_error', optimizer=Adadelta())
# Alternative optimisers that were tried:
# model.compile(loss='mean_absolute_error', optimizer=SGD(learning_rate=0.001))
# model.compile(loss='mean_absolute_error', optimizer=Adam(learning_rate=0.001))

# Train for 150 epochs in mini-batches of 10; the validation loss is computed
# on (X_val, y_val) after every epoch and stored in `history`.
history = model.fit(X_train, y_train, validation_data=(X_val,y_val), epochs=150, batch_size=10)

In [None]:
# Evaluate the fitted model on the training data (X_train, y_train).
# NOTE: this is the training loss, not a score on a held-out test set.
model.evaluate(X_train, y_train)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

## Learning Curve

In [None]:
import pandas as pd

# Plot every series stored in history.history against the epoch number.
history_ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
history_ax.grid(True)
# history_ax.set_ylim(0, 1)
history_ax.set_title('Model performance throughout training')
history_ax.set_ylabel('Loss')
history_ax.set_xlabel('epoch')
plt.show()

## Prediction on Val

In [None]:
# Predict the sale price for every row of the standardised validation features
# and display the resulting array as the cell output.
y_val_predict = model.predict(X_val)
y_val_predict

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true and predicted validation prices.
mean_absolute_error(y_true=y_val, y_pred=y_val_predict)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# RMSE on the validation set: square root of the mean squared error.
val_mse = mean_squared_error(y_true=y_val, y_pred=y_val_predict)
root_mean_squared_error = sqrt(val_mse)
print(root_mean_squared_error)

## Preparation of test set and Predictions for kaggle

In [None]:
# Load the Kaggle test set (first row is the header) and replace every
# missing value with 0.
test = pd.read_csv('../HousePrices/test.csv', header=0).fillna(0)

In [None]:
# Surface feature for the test frame: first floor + second floor + basement.
test['Surface'] = test['1stFlrSF'].add(test['2ndFlrSF']).add(test['TotalBsmtSF'])

In [None]:
# Keep the house ids for the submission file.
id_col = test['Id'].values.tolist()

# Standardise the test features with statistics learned from the TRAINING
# data. Fitting the scaler on the test set would centre and scale the inputs
# differently from what the model saw during training, which distorts the
# predictions. The scaler is refitted here on the training feature columns so
# that this cell does not depend on which frame `scale` was last fitted on.
feature_cols = ['OverallQual', 'GrLivArea', 'GarageCars', 'FullBath', 'YearBuilt', 'LotArea','Surface', 'BsmtUnfSF']
scale = StandardScaler().fit(train[feature_cols])
X_test = test[feature_cols]
X_test = scale.transform(X_test)

In [None]:
# Predict the sale price for every row of the standardised test features.
prediction = model.predict(X_test)

In [None]:
# Build the two-column submission table (Id, SalePrice) and write it to CSV
# without the row index.
submission = pd.DataFrame({'Id': id_col}).assign(SalePrice=prediction)
submission.to_csv('prediction_keras_10features.csv', index=False)

# Score Kaggle
<center> Optimizer Adam: </center>  
Dense 200, 100, 50, 25, 1 : 0.17758  
<center> Optimizer Adadelta: </center>  
Dense 200, 100, 50, 25, 1 : 0.17197,  
Dense 8, 8, 8, 8, 1 : 0.18180,
Dense 8, 16, 32, 8, 1 : 0.18116