In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [8]:
from sklearn.metrics import mean_absolute_error,accuracy_score,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import statsmodels.api as sm

In [9]:
df = pd.read_csv('../data/min_max_df.csv', parse_dates=['date'], index_col=[0])

In [10]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199
0,7129300520,2014-10-13,0.01888,0.222222,0.066667,0.061503,0.003108,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,6414100192,2014-12-09,0.060352,0.222222,0.233333,0.167046,0.004072,0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5631500400,2015-02-25,0.013382,0.111111,0.066667,0.030372,0.005743,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2487200875,2014-12-09,0.069011,0.333333,0.333333,0.120729,0.002714,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1954400510,2015-02-18,0.056678,0.222222,0.2,0.099468,0.004579,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
new_feats = ['sqft_living', 'yr_built', 'PctofLot', 'age', 'waterfront', 'sqft_lot',
        'view', 'bedrooms', 'condition', 'floors']

In [12]:
oheNames = [98001, 98002, 98003, 98004, 98005, 98006, 98007, 98008, 98010,
        98011, 98014, 98019, 98022, 98023, 98024, 98027, 98028, 98029,
        98030, 98031, 98032, 98033, 98034, 98038, 98039, 98040, 98042,
        98045, 98052, 98053, 98055, 98056, 98058, 98059, 98065, 98070,
        98072, 98074, 98075, 98077, 98092, 98102, 98103, 98105, 98106,
        98107, 98108, 98109, 98112, 98115, 98116, 98117, 98118, 98119,
        98122, 98125, 98126, 98133, 98136, 98144, 98146, 98148, 98155,
        98166, 98168, 98177, 98178, 98188, 98198, 98199]

In [13]:
strOheNames = [str(i) for i in oheNames]

In [14]:
new_feats

['sqft_living',
 'yr_built',
 'PctofLot',
 'age',
 'waterfront',
 'sqft_lot',
 'view',
 'bedrooms',
 'condition',
 'floors']

In [15]:
new_feats = new_feats + strOheNames

In [16]:
X = df[new_feats]
y = df.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [17]:
models = {'Lasso':Lasso(alpha=1.0, tol=.01), 
          'Ridge':Ridge(alpha=1.0), 
          'RandForest':RandomForestRegressor(n_jobs=-1), 
          'XGB':XGBRegressor(), 
          'LightGBM':LGBMRegressor()}

In [18]:
for key, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = model.score(X_test, y_test)
    MAE = mean_absolute_error(y_test, preds)
    print(key, '| MSE =', score, '| MAE =', MAE)

Lasso | MSE = -9.831193018383644e-05 | MAE = 0.03105367332240898
Ridge | MSE = 0.7910002283417119 | MAE = 0.013103903760436058
RandForest | MSE = 0.8000839696415696 | MAE = 0.011850092252051156
XGB | MSE = 0.8315112843462551 | MAE = 0.010889055218647454
LightGBM | MSE = 0.8165871023626173 | MAE = 0.01117594034410866


In [19]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [34]:
keras.backend.clear_session()

In [29]:
early_stopping = EarlyStopping(
    min_delta=0.001, 
    patience=20, 
    restore_best_weights=True,
)

kerasModel = keras.Sequential([
    layers.Dense(32, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4), input_shape=[80,]),
    #layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(64, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    #layers.Dropout(0.3),
    layers.Dense(128, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    #layers.Dropout(0.5),
    layers.Dense(256, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4)),
    layers.Dense(1),
])

In [30]:
optimizer = keras.optimizers.SGD(lr=0.001)

kerasModel.compile(
    optimizer=optimizer,
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)

history = kerasModel.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=30,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [35]:
full_feats = ['bedrooms','bathrooms', 'sqft_living', 'sqft_lot', 'floors', 
              'waterfront', 'view', 'condition', 'grade', 'sqft_above', 
              'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15', 
              'sqft_lot15', 'renovated', 'since_reno', 'has_basement', 
              'basement_lot_pct', 'aboveground_lot_pct', 'PctofLot']
full_feats = full_feats + strOheNames

In [36]:
X = df[full_feats]
y = df.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [37]:
X_train.shape

(17276, 91)

In [38]:
keras.backend.clear_session()

In [39]:
early_stopping = EarlyStopping(
    min_delta=0.001, 
    patience=20, 
    restore_best_weights=True,
)

kerasModel = keras.Sequential([
    layers.Dense(32, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4), input_shape=[91,]),
    #layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(64, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    #layers.Dropout(0.3),
    layers.Dense(128, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    #layers.Dropout(0.5),
    layers.Dense(256, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-4)),
    layers.Dense(1),
])

In [40]:
optimizer = keras.optimizers.SGD(lr=0.001)

kerasModel.compile(
    optimizer=optimizer,
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)

history = kerasModel.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=30,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
#So far, XGB is our best bet. Let's do another sheet of only TF models and see if we can't get closer to MAE