In [94]:
# Import libraries
import numpy as np
import pandas as pd
import hvplot.pandas
import random

In [95]:
# Import ML libraries
from numpy.random import seed
from tensorflow import random as ts_random

# Augmented Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller

In [6]:
# Set ML algorithm seeds
seed(1)
ts_random.set_seed(2)

In [7]:
# Import csv file and change index to datetime
seattle_neighborhoods_df = pd.read_csv("seattle-neighborhoods.csv", index_col="Month of Period End", infer_datetime_format=True, parse_dates=True)
# Fill empty values with previous row's values in that column
seattle_neighborhoods_df = seattle_neighborhoods_df.fillna(method="pad")
# Turn datetime index into a column
seattle_neighborhoods_df.reset_index(inplace=True)
# Convert colum to strings
seattle_neighborhoods_df['Median Sale Price'] = seattle_neighborhoods_df['Median Sale Price'].astype(str)
# Remove leading and trailing whitespaces in column headers
seattle_neighborhoods_df.columns = seattle_neighborhoods_df.columns.str.strip()

In [8]:
# Create MultiIndex of dataframe to sort by both date and Region
seattle_neighborhoods_df.set_index(['Month of Period End', 'Region'], inplace=True)
# Sort index by neighborhood and then date, ascending both columns
seattle_neighborhoods_df.sort_index(level=["Month of Period End", 'Region'], ascending=[1, 1], inplace=True)

In [9]:
# Define characters to strip from colum values AFTER Index has been defined
# In order to prevent the index columns from being stripped as well
dictionary = {'\$':'', 'K':'000', ',':'', '\%':''} 
seattle_neighborhoods_df.replace(dictionary, regex=True, inplace=True)
# Convert all column values to numeric values
seattle_neighborhoods_df =  seattle_neighborhoods_df.apply(pd.to_numeric)

In [10]:
# Contain column values as variables for easier reference

# 0 - 'Median Sale Price'
# 1 - 'Median Sale Price MoM',
# 2 - 'Median Sale Price YoY',
msp = seattle_neighborhoods_df.columns.values[0]
msp_mom = seattle_neighborhoods_df.columns.values[1]
msp_yoy = seattle_neighborhoods_df.columns.values[2]

# 3 - 'Homes Sold',
# 4 - 'Homes Sold MoM',
# 5 - 'Homes Sold YoY',
hs = seattle_neighborhoods_df.columns.values[3]
hs_mom = seattle_neighborhoods_df.columns.values[4]
hs_yoy = seattle_neighborhoods_df.columns.values[5]

# 6 - 'New Listings',
# 7 - 'New Listings MoM',
# 8 - 'New Listings YoY',
nl = seattle_neighborhoods_df.columns.values[6]
nl_mom = seattle_neighborhoods_df.columns.values[7]
nl_yoy = seattle_neighborhoods_df.columns.values[8]

# 9 - 'Inventory',
# 10 - 'Inventory MoM',
# 11 - 'Inventory YoY',
inv = seattle_neighborhoods_df.columns.values[9]
inv_mom = seattle_neighborhoods_df.columns.values[10]
inv_yoy = seattle_neighborhoods_df.columns.values[11]

# 12 - 'Days on Market',
# 13 - 'Days on Market MoM',
# 14 - 'Days on Market YoY',
dom = seattle_neighborhoods_df.columns.values[12]
dom_mom = seattle_neighborhoods_df.columns.values[13]
dom_yoy = seattle_neighborhoods_df.columns.values[14]

# 15 - 'Average Sale To List',
# 16 - 'Average Sale To List MoM'
# 17 - 'Average Sale To List YoY'
asl = seattle_neighborhoods_df.columns.values[15]
asl_mom = seattle_neighborhoods_df.columns.values[16]
asl_yoy = seattle_neighborhoods_df.columns.values[17]


In [11]:
# Create practical arrays of values for easy reference

# Main attributes
main_values = [msp, hs, nl, inv, dom, asl]
# Month-over-month values
mom_values = [msp_mom, hs_mom, nl_mom, inv_mom, dom_mom, asl_mom]
# Year-over-year values
yoy_values = [msp_yoy, hs_yoy, nl_yoy, inv_yoy, dom_yoy, asl_yoy]
# And just in case we need to explicitly request all columns and Pandas isn't behaving
all_values = [msp, msp_mom, msp_yoy, hs, hs_mom, hs_yoy, nl, nl_mom, nl_yoy, inv, inv_mom, inv_yoy, dom, dom_mom, dom_yoy, asl, asl_mom, asl_yoy]

In [97]:
# ADF Returns values - printed here for reference
#     -------
#     [0]adf : float
#         The test statistic.
#     [1]pvalue : float
#         MacKinnon"s approximate p-value based on MacKinnon (1994, 2010).
#     [2]usedlag : int
#         The number of lags used.
#     [3]nobs : int
#         The number of observations used for the ADF regression and calculation
#         of the critical values.
#     [4]critical values : dict
#         Critical values for the test statistic at the 1 %, 5 %, and 10 %
#         levels. Based on MacKinnon (2010).
#     [5]icbest : float
#         The maximized information criterion if autolag is not None.
#     [6]resstore : ResultStore, optional
#         A dummy class with results attached as attributes.

# Select a neighborhood randomly
hood = random.choice(seattle_neighborhoods_df.index.get_level_values("Region").unique())

# Contain specific instance of neighborhood and value
df = seattle_neighborhoods_df.xs(hood, level='Region')[msp]

# Just trying to keep the code clean...
result = adfuller(df)

print(hood)
print(f'ADF Statistic:')
print(f'    {result[0]}')
print(f'p-value:')
print(f'    {result[1]}')
print(f'n_lags:')
print(f'    {result[2]}')
for key, value in result[4].items():
    print('Critial Values:')
    print(f'   {key}, {value}')
    
df.hvplot(title=hood)

Seattle, WA - First Hill
ADF Statistic:
    -2.200890901188599
p-value:
    0.20592227002810432
n_lags:
    0
Critial Values:
   1%, -3.4948504603223145
Critial Values:
   5%, -2.889758398668639
Critial Values:
   10%, -2.5818220155325444


In [98]:
# Contain the full list of Seattle neighborhods as an array for easy reference
neighborhood_list = seattle_neighborhoods_df.index.get_level_values("Region").unique()

In [102]:
# Iterate through every neighborhood and run the ADF test on values
# Define a start and stop to contain a range of neighborhoods
start = 45
stop = 50

for neighborhood in neighborhood_list[start:stop]:
    # Run ADF on Seattle data, looking at Median Sale Price
    print(f'{neighborhood}')
    for i in main_values:
        print(f"{i} - ADF: {adfuller(seattle_neighborhoods_df.xs(neighborhood, level='Region')[i])[0]} \n ")
        # print(  adfuller(seattle_neighborhoods_df.xs(neighborhood, level='Region')[msp])[1])
    print()

# For clarification, the above line `seattle_neighborhoods_df.xs(neighborhood, level='Region')[i])[0]` takes the FULL dataframe,
#   uses Pandas cross section (xs) to slice to one of the two indexes (`Month of Period End` and `Region`)
#   referenced by the `neighborhood` variable
#   and the index level is specified by the `level` parameter
#   then the particular column of data in that neighborhood is selected from the returned array with the [i]
#   which pulls a new value during each iteration as the `for loop` traverses the `main_values` array that
#   contains shorthand variables for each column in the original dataframe

Seattle, WA - Lake City
Median Sale Price - ADF: -0.6815144552172131 
 
Homes Sold - ADF: -2.247586411453391 
 
New Listings - ADF: -3.6964911185475295 
 
Inventory - ADF: -1.9580636661168263 
 
Days on Market - ADF: -3.688604472106919 
 
Average Sale To List - ADF: -2.3772813117458185 
 

Seattle, WA - Lake View
Median Sale Price - ADF: -2.627204227693555 
 
Homes Sold - ADF: -2.028855417011148 
 
New Listings - ADF: -3.5917575270429927 
 
Inventory - ADF: 1.1182173334664238 
 
Days on Market - ADF: -0.7510613011155549 
 
Average Sale To List - ADF: -1.8014124887617822 
 

Seattle, WA - Laurelhurst
Median Sale Price - ADF: -2.9527885186622207 
 
Homes Sold - ADF: -2.728024258405608 
 
New Listings - ADF: -1.5593847282856115 
 
Inventory - ADF: -4.237764152954558 
 
Days on Market - ADF: -5.694628194035351 
 
Average Sale To List - ADF: -2.080954695944824 
 

Seattle, WA - Lawton Park
Median Sale Price - ADF: -0.9924587145147495 
 
Homes Sold - ADF: -3.7617598891932236 
 
New Listings 

In [103]:
# Plot time-series values
seattle_neighborhoods_df[mom_values].hvplot(ylabel="Price", groupby="Region")

In [114]:
# Contain lstm Training data in a function
def window_data(df, window, feature_col_number, target_col_number):
    X = []
    y = []
    for i in range(len(df) - window - 1):
        features = df.iloc[i:(i + window), feature_col_number]
        target = df.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [115]:
data_hood = seattle_neighborhoods_df.xs(neighborhood_list[0], level='Region')
# Define variables to run previously defined function
window_size = 12
# 18 columns of potential training data, use all of them
feature_column = 3
target_column = 0
# 
X, y = window_data(data_hood, window_size, feature_column, target_column)

In [117]:
data_hood.iloc[:,2:4]

Unnamed: 0_level_0,Median Sale Price YoY,Homes Sold
Month of Period End,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-02-01,-5.9,495
2012-03-01,0.0,670
2012-04-01,7.2,778
2012-05-01,9.4,890
2012-06-01,8.1,927
...,...,...
2020-06-01,5.3,1020
2020-07-01,7.4,1224
2020-08-01,12.4,1195
2020-09-01,9.7,1301


In [118]:
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [119]:
from sklearn.preprocessing import MinMaxScaler
# Use the MinMaxScaler to scale data between 0 and 1.
scaler = MinMaxScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
scaler.fit(y)
y_train = scaler.transform(y_train)
y_test = scaler.transform(y_test)

In [120]:
# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [121]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [122]:
# Build the LSTM model. 

model = Sequential()

number_units = 5
dropout_fraction = 0.2

# Layer 1
model.add(LSTM(
    units=number_units,
# The return sequences need to be set to True if you are adding additional LSTM layers, but 
# You don't have to do this for the final layer. 
    return_sequences=True,
# Note: The input shape is the number of time steps and the number of indicators
# Note: Batching inputs has a different input shape of Samples/TimeSteps/Features
    input_shape=(X_train.shape[1], 1))
    )
# Note: The dropouts help prevent overfitting
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

In [123]:
model.compile(optimizer="adam", loss="mean_squared_error")

In [124]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 12, 5)             140       
_________________________________________________________________
dropout_3 (Dropout)          (None, 12, 5)             0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 12, 5)             220       
_________________________________________________________________
dropout_4 (Dropout)          (None, 12, 5)             0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 5)                 220       
_________________________________________________________________
dropout_5 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [125]:
# Train the model
model.fit(
    X_train,
    y_train,
# Use at least 10 epochs
    epochs=10,
# Do not shuffle the data
    shuffle=False,
# Experiement with the batch size, but a smaller batch size is recommended
    batch_size=1,
    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x28c07e42bb0>

In [126]:
# Evaluate the model
model.evaluate(X_test, y_test)



0.1447729766368866

In [127]:
predicted = model.predict(X_test)

In [128]:
predicted_prices = scaler.inverse_transform(predicted)
real_prices = scaler.inverse_transform(y_test.reshape(-1, 1))

In [129]:
neighborhood = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel()
}, index = df.index[-len(real_prices): ]) 
neighborhood.tail()

Unnamed: 0_level_0,Real,Predicted
Month of Period End,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-01,700000.0,571560.875
2020-07-01,742000.0,557655.5
2020-08-01,740000.0,547327.9375
2020-09-01,759000.0,538817.875
2020-10-01,745000.0,535271.1875


In [130]:
neighborhood.hvplot()

In [136]:
import plotly.graph_objects as go

fig = go.Figure()

fig.update_layout(
    mapbox = {
        'style': "stamen-toner",
        'center': { 'lat': 47.60636771204824, 'lon': -122.33452897024857},
        'zoom': 10, 'layers': [{
            'source': {
                'type': "FeatureCollection",
                'features': [{
                    'type': "Map",
                    'geometry': {
                        'type': "MultiPolygon",
                        'coordinates': [[[
                           [-122.376336564723,47.6759176989664], [-122.376519722058,47.6760527482053],
                           [-122.376547112434,47.6760693435982], [-122.376573006221,47.6760868163511],
                           [-122.37659791092,47.6761051588887 ], [-122.376620811506,47.6761243856628],
                           [-122.376653693551,47.6761550023704], [-122.376672575492,47.6761756116819],
                           [-122.376690473449,47.6761972626929], [-122.376706358898,47.6762194979699], 
                           [-122.376721246367,47.6762423037704], [-122.376733608705,47.6762655291531],
                           [-122.376744969247,47.6762891966531], [-122.376753802121,47.6763131984825],
                           [-122.376761120573,47.6763374777451], [-122.37676692258,47.6763619488292 ],
                           [-122.376770700102,47.6763866189683], [-122.376772451106,47.6764114022001],
                           [-122.376772171763,47.6764361697665], [-122.376761985459,47.676862137338 ],
                           [-122.376759750881,47.6775387646826], [-122.376769882216,47.6780678404081],
                           [-122.376770065639,47.6792017979916], [-122.376770442503,47.680154307228 ],
                           [-122.376767820487,47.6811938310584], [-122.376777423242,47.6822860591882],
                           [-122.376787310842,47.6833707854178], [-122.376780535885,47.6842704786057],
                           [-122.376785260609,47.6851813698577], [-122.376784403749,47.6860923366759],
                           [-122.376800402808,47.6870068457338], [-122.376794751014,47.6879443550321],
                           [-122.376803067488,47.688976146724 ], [-122.376810989127,47.6900117143006],
                           [-122.376810264834,47.6905341242701], [-122.376810344766,47.6906051591463],
                           [-122.377117910432,47.6906029639028], [-122.379103841332,47.6906041929746],
                           [-122.379876313408,47.6905988541749], [-122.381453020656,47.6905978829044],
                           [-122.383201215024,47.6905936585447], [-122.383275316007,47.690593164597 ],
                           [-122.385832831602,47.6905766041982], [-122.388192979124,47.6905805029145],
                           [-122.390347910813,47.6905631204492], [-122.392432373275,47.6905826938111],
                           [-122.393002415356,47.6905813205566], [-122.393010665051,47.6902980913372],
                           [-122.393046187318,47.6889956475812], [-122.393032746621,47.687668638629 ],
                           [-122.393030577589,47.6863450742964], [-122.393022827056,47.685021586284 ],
                           [-122.393016158048,47.6837341155494], [-122.393014530791,47.6824285808909],
                           [-122.39300195772,47.6811303942788 ], [-122.392983771129,47.6798142464259],
                           [-122.392975009807,47.6792517755972], [-122.392971502986,47.6781034200237],
                           [-122.392961263359,47.6775085790216], [-122.392959618409,47.676929558447 ],
                           [-122.392956303464,47.6763287099834], [-122.392956550545,47.6759141859541],
                           [-122.392827172066,47.6759153143353], [-122.391773109273,47.6759158131504],
                           [-122.390449346263,47.6759259868868], [-122.388955759168,47.6759424488678],
                           [-122.387702391216,47.6759476875929], [-122.387087434206,47.6759521343769],
                           [-122.385435721425,47.6759587956618], [-122.383602752477,47.675979810424 ],
                           [-122.38182788443,47.675992079457  ], [-122.379988625303,47.6759893429385],
                           [-122.377903057631,47.6759978849021], [-122.376707907517,47.6759982290459],
                           [-122.376336564723,47.6759176989664]
                        ]]]
                    }
                }]
            },
            'type': "fill", 'below': "traces", 'color': 'purple'}]},
    margin = {'l':0, 'r':0, 'b':0, 't':0})

fig.show()