## Outline
* Fit a neural net to the sberbank housing data
* Clean and process the data
* Attempted fits: shallow network, deep network, PCA
* Hyperparameters: Number of nodes, number of layers, activation functions, batch normalization, weight initializers, dropout

### Load and clean data

In [75]:
# Parameters
prediction_stderr = 0.0073  #  assumed standard error of predictions
                          #  (smaller values make output closer to input)
train_test_logmean_diff = 0.1  # assumed shift used to adjust frequencies for time trend
probthresh = 90  # minimum probability*frequency to use new price instead of just rounding
rounder = 2  # number of places left of decimal point to zero

import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
import xgboost as xgb

import datetime
from scipy.stats import norm
    
#load files
train = pd.read_csv('train/train.csv', parse_dates=['timestamp'])
test = pd.read_csv('test/test.csv', parse_dates=['timestamp'])
id_test = test.id
#df_test = pd.read_csv('test/test.csv')

#clean data
print('Data Clean...')
bad_index = train[train.life_sq > train.full_sq].index
train.loc[bad_index, "life_sq"] = np.NaN
equal_index = [601,1896,2791]
test.loc[equal_index, "life_sq"] = test.loc[equal_index, "full_sq"]
bad_index = test[test.life_sq > test.full_sq].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.life_sq < 5].index
train.loc[bad_index, "life_sq"] = np.NaN
bad_index = test[test.life_sq < 5].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.full_sq < 5].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[test.full_sq < 5].index
test.loc[bad_index, "full_sq"] = np.NaN
kitch_is_build_year = [13117]
train.loc[kitch_is_build_year, "build_year"] = train.loc[kitch_is_build_year, "kitch_sq"]
bad_index = train[train.kitch_sq >= train.life_sq].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[test.kitch_sq >= test.life_sq].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.kitch_sq == 0).values + (train.kitch_sq == 1).values].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq == 0).values + (test.kitch_sq == 1).values].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3)].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3)].index
test.loc[bad_index, "full_sq"] = np.NaN
bad_index = train[train.life_sq > 300].index
train.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
bad_index = test[test.life_sq > 200].index
test.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
train.product_type.value_counts(normalize= True)
test.product_type.value_counts(normalize= True)
bad_index = train[train.build_year < 1500].index
train.loc[bad_index, "build_year"] = np.NaN
bad_index = test[test.build_year < 1500].index
test.loc[bad_index, "build_year"] = np.NaN
bad_index = train[train.num_room == 0].index
train.loc[bad_index, "num_room"] = np.NaN
bad_index = test[test.num_room == 0].index
test.loc[bad_index, "num_room"] = np.NaN
bad_index = [10076, 11621, 17764, 19390, 24007, 26713, 29172]
train.loc[bad_index, "num_room"] = np.NaN
bad_index = [3174, 7313]
test.loc[bad_index, "num_room"] = np.NaN
bad_index = train[(train.floor == 0).values * (train.max_floor == 0).values].index
train.loc[bad_index, ["max_floor", "floor"]] = np.NaN
bad_index = train[train.floor == 0].index
train.loc[bad_index, "floor"] = np.NaN
bad_index = train[train.max_floor == 0].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.max_floor == 0].index
test.loc[bad_index, "max_floor"] = np.NaN
bad_index = train[train.floor > train.max_floor].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.floor > test.max_floor].index
test.loc[bad_index, "max_floor"] = np.NaN
train.floor.describe(percentiles= [0.9999])
bad_index = [23584]
train.loc[bad_index, "floor"] = np.NaN
train.material.value_counts()
test.material.value_counts()
train.state.value_counts()
bad_index = train[train.state == 33].index
train.loc[bad_index, "state"] = np.NaN
test.state.value_counts()

# brings error down a lot by removing extreme price per sqm
train.loc[train.full_sq == 0, 'full_sq'] = 50
train = train[train.price_doc/train.full_sq <= 600000]
train = train[train.price_doc/train.full_sq >= 10000]

print('Feature Engineering...')
# Add month-year
month_year = (train.timestamp.dt.month*30 + train.timestamp.dt.year * 365)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)

month_year = (test.timestamp.dt.month*30 + test.timestamp.dt.year * 365)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (train.timestamp.dt.weekofyear*7 + train.timestamp.dt.year * 365)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)

week_year = (test.timestamp.dt.weekofyear*7 + test.timestamp.dt.year * 365)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek

test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek

# Other feature engineering
train['rel_floor'] = 0.05+train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = 0.05+train['kitch_sq'] / train['full_sq'].astype(float)

test['rel_floor'] = 0.05+test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = 0.05+test['kitch_sq'] / test['full_sq'].astype(float)

train.apartment_name=train.sub_area + train['metro_km_avto'].astype(str)
test.apartment_name=test.sub_area + train['metro_km_avto'].astype(str)

train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

train['area_per_room'] = train['life_sq'] / train['num_room'].astype(float) #rough area per room
train['livArea_ratio'] = train['life_sq'] / train['full_sq'].astype(float) #rough living area
train['yrs_old'] = 2017 - train['build_year'].astype(float) #years old from 2017
train['avgfloor_sq'] = train['life_sq']/train['max_floor'].astype(float) #living area per floor
train['pts_floor_ratio'] = train['public_transport_station_km']/train['max_floor'].astype(float)
# looking for significance of apartment buildings near public t 
train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
# doubled a var by accident
# when removing one score did not improve...
train['gender_ratio'] = train['male_f']/train['female_f'].astype(float)
train['kg_park_ratio'] = train['kindergarten_km']/train['park_km'].astype(float) #significance of children?
train['high_ed_extent'] = train['school_km'] / train['kindergarten_km'] #schooling
train['pts_x_state'] = train['public_transport_station_km'] * train['state'].astype(float) #public trans * state of listing
train['lifesq_x_state'] = train['life_sq'] * train['state'].astype(float) #life_sq times the state of the place
train['floor_x_state'] = train['floor'] * train['state'].astype(float) #relative floor * the state of the place

test['area_per_room'] = test['life_sq'] / test['num_room'].astype(float)
test['livArea_ratio'] = test['life_sq'] / test['full_sq'].astype(float)
test['yrs_old'] = 2017 - test['build_year'].astype(float)
test['avgfloor_sq'] = test['life_sq']/test['max_floor'].astype(float) #living area per floor
test['pts_floor_ratio'] = test['public_transport_station_km']/test['max_floor'].astype(float) #apartments near public t?
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)
test['gender_ratio'] = test['male_f']/test['female_f'].astype(float)
test['kg_park_ratio'] = test['kindergarten_km']/test['park_km'].astype(float)
test['high_ed_extent'] = test['school_km'] / test['kindergarten_km']
test['pts_x_state'] = test['public_transport_station_km'] * test['state'].astype(float) #public trans * state of listing
test['lifesq_x_state'] = test['life_sq'] * test['state'].astype(float)
test['floor_x_state'] = test['floor'] * test['state'].astype(float)

#########################################################################
print('Rate Mults...')
# Aggreagte house price data derived from 
# http://www.globalpropertyguide.com/real-estate-house-prices/R#russia
# by luckyzhou
# See https://www.kaggle.com/luckyzhou/lzhou-test/comments

rate_2015_q2 = 1
rate_2015_q1 = rate_2015_q2 / 0.9932
rate_2014_q4 = rate_2015_q1 / 1.0112
rate_2014_q3 = rate_2014_q4 / 1.0169
rate_2014_q2 = rate_2014_q3 / 1.0086
rate_2014_q1 = rate_2014_q2 / 1.0126
rate_2013_q4 = rate_2014_q1 / 0.9902
rate_2013_q3 = rate_2013_q4 / 1.0041
rate_2013_q2 = rate_2013_q3 / 1.0044
rate_2013_q1 = rate_2013_q2 / 1.0104  # This is 1.002 (relative to mult), close to 1:
rate_2012_q4 = rate_2013_q1 / 0.9832  #     maybe use 2013q1 as a base quarter and get rid of mult?
rate_2012_q3 = rate_2012_q4 / 1.0277
rate_2012_q2 = rate_2012_q3 / 1.0279
rate_2012_q1 = rate_2012_q2 / 1.0279
rate_2011_q4 = rate_2012_q1 / 1.076
rate_2011_q3 = rate_2011_q4 / 1.0236
rate_2011_q2 = rate_2011_q3 / 1
rate_2011_q1 = rate_2011_q2 / 1.011


# train 2015
train['average_q_price'] = 1

train_2015_q2_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2015_q2_index, 'average_q_price'] = rate_2015_q2

train_2015_q1_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2015_q1_index, 'average_q_price'] = rate_2015_q1


# train 2014
train_2014_q4_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2014_q4_index, 'average_q_price'] = rate_2014_q4

train_2014_q3_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2014_q3_index, 'average_q_price'] = rate_2014_q3

train_2014_q2_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2014_q2_index, 'average_q_price'] = rate_2014_q2

train_2014_q1_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2014_q1_index, 'average_q_price'] = rate_2014_q1


# train 2013
train_2013_q4_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2013_q4_index, 'average_q_price'] = rate_2013_q4

train_2013_q3_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2013_q3_index, 'average_q_price'] = rate_2013_q3

train_2013_q2_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2013_q2_index, 'average_q_price'] = rate_2013_q2

train_2013_q1_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2013_q1_index, 'average_q_price'] = rate_2013_q1


# train 2012
train_2012_q4_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2012_q4_index, 'average_q_price'] = rate_2012_q4

train_2012_q3_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2012_q3_index, 'average_q_price'] = rate_2012_q3

train_2012_q2_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2012_q2_index, 'average_q_price'] = rate_2012_q2

train_2012_q1_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2012_q1_index, 'average_q_price'] = rate_2012_q1


# train 2011
train_2011_q4_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2011_q4_index, 'average_q_price'] = rate_2011_q4

train_2011_q3_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2011_q3_index, 'average_q_price'] = rate_2011_q3

train_2011_q2_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2011_q2_index, 'average_q_price'] = rate_2011_q2

train_2011_q1_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2011_q1_index, 'average_q_price'] = rate_2011_q1

train['price_doc'] = train['price_doc'] * train['average_q_price']


#########################################################################################################

mult = 1.054880504
train['price_doc'] = train['price_doc'] * mult
y_train = train["price_doc"]

Data Clean...
Feature Engineering...
Rate Mults...


### Impute missing values, one hot encode categorical values, normalize the data

In [76]:
x_train = train.drop(["id", "timestamp", "price_doc", "average_q_price"], axis=1)
#x_test = test.drop(["id", "timestamp", "average_q_price"], axis=1)
x_test = test.drop(["id", "timestamp"], axis=1)

num_train = len(x_train)
x_all = pd.concat([x_train, x_test])

for c in x_all.columns:
    if x_all[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_all[c].values))
        x_all[c] = lbl.transform(list(x_all[c].values))

#indicator column for missing values and imputing the median for missing columns
from sklearn.preprocessing import Imputer, StandardScaler

imp = Imputer(missing_values='NaN', strategy='median', axis=0)

def add_na_indicator_columns(df):
    col_length = len(df)
    for i in df.columns:
        if df[i].isnull().values.any():
            temp_series = pd.Series(np.zeros(col_length) )
            temp_series[df[i].isnull().values] = 1
            new_col_name = 'nan_bool_'+i
            df[new_col_name] = pd.Series(temp_series,index=df.index)        

add_na_indicator_columns(x_all)
x_all = imp.fit_transform(x_all)

#normalize the data
X_scaler = StandardScaler()
x_all = X_scaler.fit_transform(x_all)

x_train = x_all[:num_train]
x_test = x_all[num_train:]

In [77]:
def describe_data_frame(df):
    print("Number of rows: " + str(len(df)))
    print("Number of columns: " + str(len(df.columns)))
    
    numeric_count = 0
    na_count = 0
    for i in df.columns:
        if np.issubdtype(df[i].dtype, np.number):
            numeric_count += 1
        if df[i].isnull().values.any():
            na_count += 1
    print("Number of numeric columns: {}".format(numeric_count))
    print("Number of columns with a NA value: {}".format(na_count))

def describe_features(df,target_column,is_target_categorical):
    for i in df.columns:
        #print('\n' + i + ' -- Feature ')
        details = ''
        
        if df[i].isnull().values.any():
            zFloat = len(df[df[i].isnull().values])/len(df[i]) 
            details += " | NaN " + str(np.round(zFloat,2))
            print(i,details,len(df[df[i].isnull().values]))

df_temp = pd.DataFrame(x_all)
describe_data_frame(df_temp)

Number of rows: 38058
Number of columns: 371
Number of numeric columns: 371
Number of columns with a NA value: 0


In [78]:
from sklearn.model_selection import train_test_split

x_train_2, x_val, y_train_2, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=31)

### Model 1 Simple Fit

In [82]:
import keras

from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import optimizers

epsilon = 10e-9

def rmsle(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(K.clip(y_pred,epsilon,np.inf)+1.0) - 
                                  K.log(K.clip(y_true,epsilon,np.inf)+1.0)), axis=-1)) 

# def root_mean_squared_error(y_true, y_pred):
#         return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 
#adam_2 = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0, clipnorm=1.)

model = Sequential()
model.add(Dense(450,input_dim=(371),activation='relu'))
#model.add(BatchNormalization())
model.add(Dense(225,activation='relu'))
model.add(Dense(1,activation='relu'))
#model.compile(loss=rmsle,optimizer='adam')
model.compile(loss='mean_squared_logarithmic_error',optimizer='adam')

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=0),
    #callbacks.
    ModelCheckpoint('keras_checkpoints/simple', monitor='val_loss', verbose=0, 
                    save_best_only=True, save_weights_only=True, mode='auto', period=1)
]

model.fit(x_train_2,y_train_2,epochs=200,batch_size=128,validation_data=(x_val,y_val),callbacks=callbacks)


Train on 24316 samples, validate on 6080 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/20

<keras.callbacks.History at 0x1bbd1a20>

In [83]:
model_pred_1 = model.predict(x_test)

def output_predictions(name,id_col,predictions):
    df_out = pd.DataFrame({"id":id_col,"price_doc":predictions}) 
    print(df_out.head())
    df_out.to_csv(name+"_sberbank_submission.csv",index=False)
    
output_predictions("keras_2_simple",id_test,model_pred_1[:,0])

      id  price_doc
0  30474  4298447.5
1  30475  7388918.5
2  30476  4074422.5
3  30477  6374046.0
4  30478  4172210.0


In [84]:
#keras model 2
#using 64 component pca

from keras.layers import Dropout

from sklearn.decomposition import PCA

pca = PCA(n_components = 64,random_state=0)
pca_all = pca.fit_transform(x_all)
pca_x_train = pca_all[:num_train]
pca_x_test = pca_all[num_train:]

pca_x_train_2, pca_x_val, pca_y_train_2, pca_y_val = train_test_split(pca_x_train, y_train, test_size=0.2, random_state=31)

print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_))

[ 0.24999216  0.12657392  0.0600045   0.05633772  0.03547774  0.02812155
  0.02344738  0.02032039  0.01883119  0.0178446   0.01572641  0.01355604
  0.01346917  0.01111229  0.01076332  0.01057343  0.00991622  0.00895608
  0.00832012  0.0082516   0.00729814  0.00673189  0.00651013  0.00630501
  0.00605212  0.00594904  0.00554553  0.00540598  0.00540158  0.00513978
  0.00481644  0.00460194  0.00447586  0.00430218  0.00421243  0.00406572
  0.00400286  0.00394177  0.0039007   0.00357575  0.00351459  0.00344585
  0.00336151  0.00329017  0.00316343  0.00304065  0.00292467  0.00283409
  0.00272798  0.00270844  0.00270373  0.00264536  0.00263292  0.00259292
  0.00255301  0.00251074  0.00246334  0.00235041  0.00231534  0.00220009
  0.00216769  0.00216024  0.00211754  0.00207456]
0.914329965393


In [85]:

model_2 = Sequential()
model_2.add(Dense(128,input_dim=(64),activation='elu'))
model_2.add(BatchNormalization())
#model_2.add(Dropout(0.5))
model_2.add(Dense(128,input_dim=(64),activation='elu'))
model_2.add(BatchNormalization())
#model_2.add(Dropout(0.5))
model_2.add(Dense(128,activation='elu'))
model_2.add(Dense(1,activation='linear'))
#model_2.compile(loss=rmsle,optimizer='adam')
model_2.compile(loss='mean_squared_logarithmic_error',optimizer='adam')

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=0),
    #callbacks.
    ModelCheckpoint('keras_checkpoints/model_2_bn', monitor='val_loss', verbose=0, 
                    save_best_only=True, save_weights_only=True, mode='auto', period=1)
]

model_2.fit(pca_x_train_2,pca_y_train_2,epochs=200,batch_size=128,validation_data=(pca_x_val,pca_y_val),callbacks=callbacks)


Train on 24316 samples, validate on 6080 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/20

<keras.callbacks.History at 0x1729aeb8>

In [87]:
model_pred_2 = model_2.predict(pca_x_test)

output_predictions("keras_2_model_2_bn",id_test,model_pred_2[:,0])

      id  price_doc
0  30474  3612644.0
1  30475  8047797.0
2  30476  4315007.0
3  30477  4484490.0
4  30478  3340141.5


In [88]:
#model 3, deeper with dropout

model_3 = Sequential()
model_3.add(Dense(256,input_dim=(64),activation='relu'))
#model_3.add(BatchNormalization())
model_3.add(Dropout(0.5))
model_3.add(Dense(256,input_dim=(64),activation='relu'))
#model_3.add(BatchNormalization())
model_3.add(Dropout(0.5))
model_3.add(Dense(128,activation='relu'))
model_3.add(Dense(1,activation='linear'))
#model_3.compile(loss=rmsle,optimizer='adam')
model_3.compile(loss='mean_squared_logarithmic_error',optimizer='adam')

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, verbose=0),
    #callbacks.
    ModelCheckpoint('keras_checkpoints/model_3', monitor='val_loss', verbose=0, 
                    save_best_only=True, save_weights_only=True, mode='auto', period=1)
]

model_3.fit(pca_x_train_2,pca_y_train_2,epochs=200,batch_size=128,validation_data=(pca_x_val,pca_y_val),callbacks=callbacks)


Train on 24316 samples, validate on 6080 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/20

<keras.callbacks.History at 0x17b0bef0>

In [89]:
model_pred_3 = model_3.predict(pca_x_test)

output_predictions("keras_2_model_3",id_test,model_pred_3[:,0])

      id   price_doc
0  30474  3551221.50
1  30475  7702081.00
2  30476  3903168.75
3  30477  4534123.00
4  30478  3573680.00


In [94]:
#model 4 deeper with he_normal weight initializers

model_4 = Sequential()
model_4.add(Dense(512,input_dim=(371),activation='relu',kernel_initializer='he_normal'))
model_4.add(Dropout(0.5))
model_4.add(Dense(256,activation='relu',kernel_initializer='he_normal'))
model_4.add(Dropout(0.5))
model_4.add(Dense(128,activation='relu',kernel_initializer='he_normal'))
model_4.add(Dropout(0.5))
model_4.add(Dense(1,activation='linear'))
#model_4.compile(loss=rmsle,optimizer='adam')
model_4.compile(loss='mean_squared_logarithmic_error',optimizer='adam')

callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=0),
    #callbacks.
    ModelCheckpoint('keras_checkpoints/model_5', monitor='val_loss', verbose=0, 
                    save_best_only=True, save_weights_only=True, mode='auto', period=1)
]

model_4.fit(x_train_2,y_train_2,epochs=300,batch_size=128,validation_data=(x_val,y_val),callbacks=callbacks)


Train on 24316 samples, validate on 6080 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/30

<keras.callbacks.History at 0x217cea58>

In [95]:
model_pred_4 = model_4.predict(x_test)

output_predictions("keras_2_model_6",id_test,model_pred_4[:,0])

      id   price_doc
0  30474  4588600.00
1  30475  7481042.50
2  30476  4112582.75
3  30477  6712090.00
4  30478  4289107.50


### best model was the 3 layer, 128 batch like 500 128 64 size

In [44]:
# convenience function to add indicator columns for variables with a NA value
def add_na_indicator_columns(df):
    col_length = len(df)
    for i in df.columns:
        if df[i].isnull().values.any():
            temp_series = pd.Series(np.zeros(col_length) )
            temp_series[df[i].isnull().values] = 1
            new_col_name = 'nan_bool_'+i
            df[new_col_name] = pd.Series(temp_series,index=df.index)
            #df_test_data['latitude_modified'] = pd.Series(latitude_modified,index=df_test_data.index)