In [1]:
import pandas as pd
from keras.utils import to_categorical

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from pylab import rcParams 
rcParams['figure.figsize'] = 20, 10 
rcParams['font.size'] = 16

from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
def prepare_dataset(df, categorical_as_onehot=True):
    df = df.fillna(-1)
    for column in df.columns:
        df = getObjectFeature(df, column, categorical_as_onehot=categorical_as_onehot)
        
    X = df.drop(['Id', 'SalePrice'], axis=1).as_matrix().astype('float')
    y = df['SalePrice'].as_matrix().astype('float')
    return np.nan_to_num(X), np.nan_to_num(y), df['Id'].as_matrix()

   
def getObjectFeature(df, col, categorical_as_onehot=True):
    if df[col].dtype != 'object': # if it's not categorical do nothing
        df['{}_sqrt'.format(col)] = df[col].pow(.5)
        
        return df
    else:
        if categorical_as_onehot:
            df1 = df
            counts = df1[col].value_counts()

            one_hot = to_categorical([counts.index.tolist().index(i) for i in df1[col]])

            df1 = df1.drop([col], axis=1)
            for i in range(one_hot.shape[1]):
                df1['{}_{}'.format(col, i)] = one_hot[:, i]
            return df1
        else:
            df1 = df
            counts = df1[col].value_counts()
            df1[col] = [counts.index.tolist().index(i) for i in df1[col]]
            return df1

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

X, y, ids = prepare_dataset(pd.concat([df_train, df_test]), categorical_as_onehot=True)

X_train = X[y != -1]
y_train = y[y != -1]

X_test = X[y == -1]
ids_test = ids[y == -1]

In [4]:
X_train.shape

(1460, 349)

In [5]:
y_train.shape

(1460,)

In [6]:
input = Input(shape=(X_train.shape[1],))
output = Dense(1)(input)

model = Model(inputs=input, outputs=output)

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 

model.compile(loss=root_mean_squared_error, optimizer=Adam(lr=0.04))
model.fit(X_train, y_train, epochs=40, batch_size=20, validation_split=0.1)

Train on 1314 samples, validate on 146 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x3019b8eb8>

In [7]:
y_pred = model.predict(X_test)

In [8]:
y_pred

array([[132396.31 ],
       [127924.76 ],
       [180732.92 ],
       ...,
       [181634.39 ],
       [ 96559.625],
       [212327.98 ]], dtype=float32)

In [9]:
pd.DataFrame(data={'Id': ids_test, 'SalePrice': np.squeeze(y_pred)}).to_csv('data/submission.csv', index=False)