In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
from sklearn.metrics import mean_absolute_error,accuracy_score,mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
# Load the housing table (presumably min-max scaled, going by the file name).
# The first CSV column is used as the index and `date` is parsed to datetime
# so that the `.dt` accessor can be used on it.
df = pd.read_csv(
    '../data/min_max_df.csv',
    index_col=[0],
    parse_dates=['date'],
)

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199
0,7129300520,2014-10-13,0.01888,0.222222,0.066667,0.061503,0.003108,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,6414100192,2014-12-09,0.060352,0.222222,0.233333,0.167046,0.004072,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5631500400,2015-02-25,0.013382,0.111111,0.066667,0.030372,0.005743,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2487200875,2014-12-09,0.069011,0.333333,0.333333,0.120729,0.002714,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1954400510,2015-02-18,0.056678,0.222222,0.2,0.099468,0.004579,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Derive calendar columns from the parsed `date` column.
df['year'], df['month'] = df['date'].dt.year, df['date'].dt.month

In [7]:
# One-hot encode `year`; the dummy columns are labelled by the year values
# themselves and are appended to `df` by index.
# NOTE(review): none of these columns appear in `full_feats` below, so they are
# not fed to the model -- confirm whether that is intended.
ohe = pd.get_dummies(df['year'], sparse=False)
df = df.join(ohe, how='left')

In [8]:
# One-hot encode `month`; the dummy columns are labelled by the month values
# themselves and are appended to `df` by index.
# NOTE(review): none of these columns appear in `full_feats` below, so they are
# not fed to the model -- confirm whether that is intended.
ohe = pd.get_dummies(df['month'], sparse=False)
df = df.join(ohe, how='left')

In [9]:
# Inspect the column labels after the year/month columns and their dummy
# columns have been joined onto `df`.
df.columns

Index([         'id',        'date',       'price',    'bedrooms',
         'bathrooms', 'sqft_living',    'sqft_lot',      'floors',
        'waterfront',        'view',
       ...
                   3,             4,             5,             6,
                   7,             8,             9,            10,
                  11,            12],
      dtype='object', length=115)

In [10]:
# Columns used as model inputs, in the order they are passed to the network:
# first the per-property attributes, then the columns named by zip code.
full_feats = [
    # per-property attributes
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15',
    'renovated', 'since_reno', 'has_basement',
    'basement_lot_pct', 'aboveground_lot_pct', 'PctofLot',
    'age', 'likelyApartment',
] + [
    # zip-code columns (ten per row)
    '98001', '98002', '98003', '98004', '98005', '98006', '98007', '98008', '98010', '98011',
    '98014', '98019', '98022', '98023', '98024', '98027', '98028', '98029', '98030', '98031',
    '98032', '98033', '98034', '98038', '98039', '98040', '98042', '98045', '98052', '98053',
    '98055', '98056', '98058', '98059', '98065', '98070', '98072', '98074', '98075', '98077',
    '98092', '98102', '98103', '98105', '98106', '98107', '98108', '98109', '98112', '98115',
    '98116', '98117', '98118', '98119', '98122', '98125', '98126', '98133', '98136', '98144',
    '98146', '98148', '98155', '98166', '98168', '98177', '98178', '98188', '98198', '98199',
]

In [11]:
# Feature matrix and target: every row of `df`, restricted to the columns
# listed in `full_feats`, with the `price` column as the regression target.
X = df[full_feats]
y = df['price']

# Hold out 20% of the rows. A fixed `random_state` makes the split -- and
# therefore every number computed from it -- reproducible across kernel
# restarts; without it each run trains and validates on different rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

In [12]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [17]:
# Clear the Keras backend session before the model is (re)built in the next cell.
keras.backend.clear_session()

In [18]:
# Early-stopping callback: minimum improvement of 0.001, 20 epochs of patience,
# and the best weights restored when training stops.
early_stopping = EarlyStopping(min_delta=0.001, patience=20, restore_best_weights=True)

# Fully connected regression network, assembled layer by layer.
# Every hidden Dense layer carries an L2 kernel penalty of 1e-4 and is followed
# by batch normalisation; dropout (rate 0.5) is applied after the 128- and
# 256-unit blocks only (it is disabled for the first two blocks).
kerasModel = keras.Sequential()

# Input block: sigmoid activation, one input per feature column.
kerasModel.add(layers.Dense(
    32,
    activation='sigmoid',
    input_shape=[X_train.shape[1],],
    kernel_regularizer=regularizers.l2(1e-4),
))
kerasModel.add(layers.BatchNormalization())

# Second 32-unit block.
kerasModel.add(layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4)))
kerasModel.add(layers.BatchNormalization())

# 128-unit block with dropout.
kerasModel.add(layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1e-4)))
kerasModel.add(layers.BatchNormalization())
kerasModel.add(layers.Dropout(.5))

# 256-unit block with dropout.
kerasModel.add(layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(1e-4)))
kerasModel.add(layers.BatchNormalization())
kerasModel.add(layers.Dropout(.5))

# 1024-unit block, then a single linear output unit for the price.
kerasModel.add(layers.Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(1e-4)))
kerasModel.add(layers.BatchNormalization())
kerasModel.add(layers.Dense(1))
    

In [19]:
# Adam with a step size of 0.0005. `learning_rate` is the supported keyword;
# the older `lr` alias is deprecated and is rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.0005)

# Optimise mean absolute error and report the same quantity as a metric.
kerasModel.compile(
    optimizer=optimizer,
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)

# Train for up to 100 epochs, validating on the held-out split after each one.
# The `early_stopping` callback configured earlier in the notebook is passed
# here; it was previously created but never handed to `fit`, so it had no
# effect and training always ran the full epoch budget.
history = kerasModel.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=100,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100

KeyboardInterrupt: 

In [None]:
# Per-epoch training history as a DataFrame (one column per tracked quantity).
history_df = pd.DataFrame(history.history)

# The model is compiled with a mean-absolute-error loss, so the loss curve is
# titled accordingly rather than as cross-entropy.
history_df.loc[:, ['loss', 'val_loss']].plot(title="Loss (mean absolute error)")
history_df.loc[:, ['mean_absolute_error', 'val_mean_absolute_error']].plot(title="MAE")

# Best (lowest) validation loss reached over the recorded epochs.
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))

In [None]:
#kerasModel.save('DeepModel1')

In [20]:
#kerasModel = keras.models.load_model('DeepModel1')

In [98]:
# Predict on the full feature matrix `X`, i.e. on both the training rows and
# the held-out rows produced by the earlier split.
preds = kerasModel.predict(X)

In [95]:
# Residual table, starting with the observed prices (indexed like `df`).
resid = pd.DataFrame({'target': df['price']})

In [99]:
# Attach the model predictions and compute residuals as observed minus predicted.
resid['preds'] = preds
resid['resid'] = resid['target'].sub(resid['preds'])

In [104]:
# Mean residual (observed minus predicted) over all rows.
resid['resid'].mean()

0.0017349044394082572

In [105]:
# Write the target / prediction / residual table to disk as CSV.
resid.to_csv(path_or_buf='KerasPreds')