## Setup

In [1]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import requests
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from joblib import dump, load

In [2]:
# Fetch the data from the API.
listings_json = requests.get("http://127.0.0.1:5000/housingDataAPI/v1.0/listings").json()

# Examine the data.
print(json.dumps(listings_json[0], indent=4, sort_keys=True))

{
    "address": "3157 NE MARINE DR, Portland OR 97035",
    "bathrooms": 1.0,
    "bedrooms": 1,
    "built": 1964,
    "city": "Portland",
    "county": "Multnomah",
    "elementary_school": "Faubion",
    "high_school": "Current Price:",
    "home_type": "Floating Home - 1 Story",
    "lot_size": null,
    "middle_school": "Jefferson",
    "neighborhood": "unknown",
    "price": 65000,
    "square_feet": 800,
    "zipcode": 97035
}


In [3]:
# Create a dataframe to use for our model.
housing_data = pd.DataFrame(listings_json)

print(len(housing_data))
housing_data.head()

3831


Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode
0,"3157 NE MARINE DR, Portland OR 97035",1.0,1,1964,Portland,Multnomah,Faubion,Current Price:,Floating Home - 1 Story,,Jefferson,unknown,65000,800,97035
1,"17452 NE GLISAN ST #7, Portland OR 97230",2.0,2,1988,Portland,Multnomah,Hartley,Reynolds,Manufactured - Double Wide Manufact,,Reynolds,unknown,72000,1152,97230
2,"9034 SE 78TH PL, Portland OR 97206",2.0,3,1997,Portland,Clackamas,Whitman,Current Price:,Manufactured - Double Wide Manufact,,Milwaukie,unknown,79950,1344,97206
3,"16000 SE POWELL BLVD 75, Portland OR 97236",2.0,3,1990,Portland,Multnomah,Powell Butte,Centennial,Manufactured - Double Wide Manufact,,Centennial,unknown,79950,1404,97236
4,"12846 SE RAMONA ST 6, Portland OR 97236",2.0,3,1997,Portland,Multnomah,Gilbert Hts,David Douglas,Manufactured - Double Wide Manufact,,Alice Ott,unknown,93900,1297,97236


# Data Cleaning

In [4]:
# Simplify home types 
for i in housing_data.index:
    if "Floating" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Floating"
    if "Condo" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Condo"
    if "Single Family" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Single Family"
    if "Manufactured" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Manufactured"
    
housing_data.home_type.unique() 

array(['Floating', 'Manufactured', 'Condo', 'Single Family',
       'Co-Op Housing - Contemporary'], dtype=object)

In [5]:
# Print data to compare how many data points lost
print(f'Current Amount of Listings: {len(housing_data)}')

# Change lot size to 0 for floating homes and condos
for i in housing_data.index:
    if housing_data.at[i, "home_type"] == "Floating":
        housing_data.at[i, "lot_size"] = 0
    if housing_data.at[i, "home_type"] == "Condo":
        housing_data.at[i, "lot_size"] = 0

# Drop listing with null lot_size
cleaned_housing_data = housing_data.drop(housing_data[housing_data["lot_size"].isnull()].index)
      
# Print length of data
print(f'Updated Amount of Listings: {len(cleaned_housing_data)}')

Current Amount of Listings: 3831
Updated Amount of Listings: 3655


In [6]:
# Drop listings with unclear Highschool data
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Current Price:"].index, inplace = True)
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Other"].index, inplace = True)
cleaned_housing_data.shape

(3640, 15)

In [7]:
# Create a cost ranker based on zipcode
zipcode = cleaned_housing_data[["price","zipcode"]]
zipcodeAVG = zipcode.groupby(["zipcode"]).mean().sort_values(by=["price"], ascending=False)
zipcodeRanker = zipcodeAVG.reset_index(drop=False)
zipcodeRanker.reset_index(drop=False, inplace=True)
zipcodeRanker.rename(columns={"index":"zipcode_rank","price":"zipcodeAVGcost"}, inplace=True)
zipcodeRanker["zipcode_rank"]=zipcodeRanker["zipcode_rank"]+1


# Merge into df
cleaned_housing_data2 = pd.merge(cleaned_housing_data, zipcodeRanker, on="zipcode")
cleaned_housing_data2.rename(columns={"price_y":"zipcodeAVGcost"}, inplace = True)
cleaned_housing_data2.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,zipcode_rank,zipcodeAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,unknown,129500,735,97230,30,402397.25
1,"3389 NE 162ND AVE, Portland OR 97230",2.0,2,1979,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,Fremont Village Park,160000,1073,97230,30,402397.25
2,"19609 NE MARINE DR E1, Portland OR 97230",2.0,3,1945,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,Big Eddy Marina,224500,1150,97230,30,402397.25
3,"15041 NE SISKIYOU CT, Portland OR 97230",2.0,2,1973,Portland,Multnomah,Scott,Reynolds,Condo,0.0,H.B. Lee,unknown,229900,1638,97230,30,402397.25
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",2.0,2,1986,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,SUMMERPLACE,239000,1128,97230,30,402397.25


In [8]:
# Create district df
school_dict = ({"high_school" : ['Reynolds', 'Parkrose', 'David Douglas', 'Centennial', 'Cleveland',
        'Lincoln', 'Madison', 'Jefferson', 'Roosevelt', 'Sunset','Westview', 'Liberty', 'Beaverton', 
        'Grant', 'Southridge', 'Tigard', 'Wilson', 'Riverdale', 'Lake Oswego', 'Franklin',
        'Tualatin', 'Milwaukie', 'Scappoose'], "district" : ['Reynolds', 'Parkrose','David Douglas',
        'Centennial', 'Portland Public', 'Portland Public', 'Portland Public', 'Portland Public',
        'Portland Public', 'Beaverton', 'Beaverton', 'Hillsboro', 'Beaverton', 'Portland Public',
        'Beaverton', 'Tigard-Tualatin', 'Portland Public', 'Riverdale', 'Lake Oswego', 'Portland Public',
        'Tigard-Tualatin', 'North Clackamas', 'Scappose']})
district_df = pd.DataFrame (school_dict)

# Merge into OG df
cleaned_housing_data3 = pd.merge(cleaned_housing_data2, district_df, on="high_school")
cleaned_housing_data3.head()

Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode,zipcode_rank,zipcodeAVGcost,district
0,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,unknown,129500,735,97230,30,402397.25,Reynolds
1,"3389 NE 162ND AVE, Portland OR 97230",2.0,2,1979,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,Fremont Village Park,160000,1073,97230,30,402397.25,Reynolds
2,"19609 NE MARINE DR E1, Portland OR 97230",2.0,3,1945,Portland,Multnomah,Salish Pond,Reynolds,Floating,0.0,Reynolds,Big Eddy Marina,224500,1150,97230,30,402397.25,Reynolds
3,"15041 NE SISKIYOU CT, Portland OR 97230",2.0,2,1973,Portland,Multnomah,Scott,Reynolds,Condo,0.0,H.B. Lee,unknown,229900,1638,97230,30,402397.25,Reynolds
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",2.0,2,1986,Portland,Multnomah,Margaret Scott,Reynolds,Condo,0.0,H.B. Lee,SUMMERPLACE,239000,1128,97230,30,402397.25,Reynolds


In [9]:
# Create a cost ranker based on high schools
hs = cleaned_housing_data3[["price","high_school"]]
hsAVG = hs.groupby(["high_school"]).mean().sort_values(by=["price"], ascending=False)
hsRanker = hsAVG.reset_index(drop=False)
hsRanker.reset_index(drop=False, inplace=True)
hsRanker.rename(columns={"index":"hs_rank","price":"hsAVGcost"}, inplace=True)
hsRanker["hs_rank"]= hsRanker["hs_rank"]+1

# Create a cost ranker based on districts
district = cleaned_housing_data3[["price","district"]]
districtAVG = district.groupby(["district"]).mean().sort_values(by=["price"], ascending=False)
districtRanker = districtAVG.reset_index(drop=False)
districtRanker.reset_index(drop=False, inplace=True)
districtRanker.rename(columns={"index":"district_rank","price":"districtAVGcost"}, inplace=True)
districtRanker["district_rank"]= districtRanker["district_rank"]+1

In [10]:
# Merge high school and district rankers 
cleaned_housing_data4 = pd.merge(cleaned_housing_data3, hsRanker, on="high_school")
cleaned_housing_data_5 = pd.merge(cleaned_housing_data4, districtRanker, on="district")
cleaned_housing_data_final = cleaned_housing_data_5[['address', 'price', 'home_type', 'bedrooms', 
                                'bathrooms', 'square_feet', 'built', 'lot_size', 'neighborhood', 
                                'county', 'city', 'zipcode', 'zipcode_rank', 'zipcodeAVGcost',
                                'elementary_school', 'middle_school', 'high_school','hs_rank', 
                                'hsAVGcost', 'district', 'district_rank', 'districtAVGcost']]

# Add age of home column
cleaned_housing_data_final["house_age"] = 2020 - cleaned_housing_data_final["built"]
cleaned_housing_data_final.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


address               object
price                  int64
home_type             object
bedrooms               int64
bathrooms            float64
square_feet            int64
built                  int64
lot_size             float64
neighborhood          object
county                object
city                  object
zipcode                int64
zipcode_rank           int64
zipcodeAVGcost       float64
elementary_school     object
middle_school         object
high_school           object
hs_rank                int64
hsAVGcost            float64
district              object
district_rank          int64
districtAVGcost      float64
house_age              int64
dtype: object

## Prepare Data for Model

In [11]:
# Make a copy of the original data frame to modify.
model_df = cleaned_housing_data_final

# Include only those columns that will be used in the deep learning model.
model_df = model_df.loc[:, ["bathrooms",
                            "bedrooms",
#                             "built",
                            "house_age",
                            "lot_size",
                            "square_feet",
#                             "neighborhood",
#                             "county",
#                             "home_type",
#                             "hs_rank",
#                             "hsAVGcost",
                            "districtAVGcost",
#                             "district_rank",
                            "zipcodeAVGcost",
#                             "zipcode_rank",
                            "price"]
                       ]

# Drop rows with NaN entries.
model_df.dropna(inplace=True)

# Check the model data.
print(len(model_df))
model_df.head()

3636


Unnamed: 0,bathrooms,bedrooms,house_age,lot_size,square_feet,districtAVGcost,zipcodeAVGcost,price
0,1.0,1,60,0.0,735,370601.141414,402397.25,129500
1,2.0,2,41,0.0,1073,370601.141414,402397.25,160000
2,2.0,3,75,0.0,1150,370601.141414,402397.25,224500
3,2.0,2,47,0.0,1638,370601.141414,402397.25,229900
4,2.0,2,34,0.0,1128,370601.141414,402397.25,239000


In [12]:
# Bin prices into ten equal length ranges.
model_df["price_range"] = pd.qcut(model_df["price"], 5)
# Drop the original price data.
model_df.drop("price", axis=1, inplace=True)
model_df.head()

Unnamed: 0,bathrooms,bedrooms,house_age,lot_size,square_feet,districtAVGcost,zipcodeAVGcost,price_range
0,1.0,1,60,0.0,735,370601.141414,402397.25,"(123499.999, 349900.0]"
1,2.0,2,41,0.0,1073,370601.141414,402397.25,"(123499.999, 349900.0]"
2,2.0,3,75,0.0,1150,370601.141414,402397.25,"(123499.999, 349900.0]"
3,2.0,2,47,0.0,1638,370601.141414,402397.25,"(123499.999, 349900.0]"
4,2.0,2,34,0.0,1128,370601.141414,402397.25,"(123499.999, 349900.0]"


In [13]:
# # Get dummies for the values in home_type to use in the model.
# model_df = pd.get_dummies(model_df, columns=["home_type"])
# model_df.head()

In [14]:
# Assign X (input) and y (target).
X = model_df.drop("price_range", axis=1)
y = model_df["price_range"]

In [15]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
# Create a MinMaxScaler model and fit it to the training data
X_scaler = MinMaxScaler().fit(X_train)

# Save the scalar.
dump(X_scaler, 'minmax_scaler.bin', compress=True)

['minmax_scaler.bin']

In [17]:
# Transform the training and testing data using the X_scaler and y_scaler models.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Label encode the target data.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Save the label encoder
dump(label_encoder, 'label_encoder.bin', compress=True)
print(y_train[0])
print(encoded_y_train[0])

KeyError: 0

In [19]:
# Convert encoded labels to one-hot encoding.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
y_train_categorical[0]

array([1., 0., 0., 0., 0.], dtype=float32)

## Run Random Forest Classifier

In [20]:
# Create a random forest classifier, fit to the training data, and score on the testing data.
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train_categorical)
print(rf.score(X_test_scaled, y_test_categorical))

# Find the importances of each feature.
feature_names = X.columns
importances = rf.feature_importances_
print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))

0.605060506050605
[(0.3670031032877401, 'square_feet'), (0.16942931170611844, 'house_age'), (0.14942861192288678, 'zipcodeAVGcost'), (0.10931876745143906, 'lot_size'), (0.08624957164964715, 'bathrooms'), (0.06976861011697372, 'bedrooms'), (0.048802023865194596, 'districtAVGcost')]


## Create a Deep Learning Model

In [21]:
# Create a deep learning Sequential model.
deep_model = Sequential()
deep_model.add(Dense(units=500, activation='relu', input_dim=7))
deep_model.add(Dense(units=200, activation='relu'))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=5, activation='softmax'))

In [22]:
# Compile and fit the model.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=190,
    shuffle=True,
    verbose=2
)

Train on 2727 samples
Epoch 1/190
2727/2727 - 1s - loss: 1.3316 - accuracy: 0.4001
Epoch 2/190
2727/2727 - 0s - loss: 1.0346 - accuracy: 0.5277
Epoch 3/190
2727/2727 - 0s - loss: 0.9846 - accuracy: 0.5548
Epoch 4/190
2727/2727 - 0s - loss: 0.9599 - accuracy: 0.5695
Epoch 5/190
2727/2727 - 0s - loss: 0.9531 - accuracy: 0.5710
Epoch 6/190
2727/2727 - 0s - loss: 0.9313 - accuracy: 0.5834
Epoch 7/190
2727/2727 - 0s - loss: 0.9199 - accuracy: 0.5922
Epoch 8/190
2727/2727 - 0s - loss: 0.9170 - accuracy: 0.5908
Epoch 9/190
2727/2727 - 0s - loss: 0.9009 - accuracy: 0.5985
Epoch 10/190
2727/2727 - 0s - loss: 0.9024 - accuracy: 0.6025
Epoch 11/190
2727/2727 - 0s - loss: 0.9236 - accuracy: 0.5955
Epoch 12/190
2727/2727 - 0s - loss: 0.8887 - accuracy: 0.6135
Epoch 13/190
2727/2727 - 0s - loss: 0.9020 - accuracy: 0.6014
Epoch 14/190
2727/2727 - 0s - loss: 0.8840 - accuracy: 0.6120
Epoch 15/190
2727/2727 - 0s - loss: 0.8666 - accuracy: 0.6245
Epoch 16/190
2727/2727 - 0s - loss: 0.8669 - accuracy: 0.

Epoch 133/190
2727/2727 - 0s - loss: 0.6611 - accuracy: 0.7220
Epoch 134/190
2727/2727 - 0s - loss: 0.6583 - accuracy: 0.7173
Epoch 135/190
2727/2727 - 0s - loss: 0.6470 - accuracy: 0.7268
Epoch 136/190
2727/2727 - 0s - loss: 0.6545 - accuracy: 0.7198
Epoch 137/190
2727/2727 - 0s - loss: 0.6455 - accuracy: 0.7272
Epoch 138/190
2727/2727 - 0s - loss: 0.6456 - accuracy: 0.7253
Epoch 139/190
2727/2727 - 0s - loss: 0.6568 - accuracy: 0.7140
Epoch 140/190
2727/2727 - 0s - loss: 0.6368 - accuracy: 0.7272
Epoch 141/190
2727/2727 - 0s - loss: 0.6555 - accuracy: 0.7195
Epoch 142/190
2727/2727 - 0s - loss: 0.6484 - accuracy: 0.7217
Epoch 143/190
2727/2727 - 0s - loss: 0.6436 - accuracy: 0.7206
Epoch 144/190
2727/2727 - 0s - loss: 0.6297 - accuracy: 0.7323
Epoch 145/190
2727/2727 - 0s - loss: 0.6415 - accuracy: 0.7301
Epoch 146/190
2727/2727 - 0s - loss: 0.6269 - accuracy: 0.7341
Epoch 147/190
2727/2727 - 0s - loss: 0.6297 - accuracy: 0.7283
Epoch 148/190
2727/2727 - 0s - loss: 0.6343 - accuracy:

<tensorflow.python.keras.callbacks.History at 0x2656afcf688>

## Quantify our Trained Model

In [24]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

909/909 - 0s - loss: 0.9663 - accuracy: 0.6370
Loss: 0.9663174590392999, Accuracy: 0.6369637250900269


## Make Predictions

In [25]:
# Use the first 10 test data values to make a prediction and compare it to the actual labels.
encoded_predictions = deep_model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: [Interval(123499.999, 349900.0, closed='right')
 Interval(449000.0, 599000.0, closed='right')
 Interval(449000.0, 599000.0, closed='right')
 Interval(123499.999, 349900.0, closed='right')
 Interval(449000.0, 599000.0, closed='right')
 Interval(123499.999, 349900.0, closed='right')
 Interval(449000.0, 599000.0, closed='right')
 Interval(123499.999, 349900.0, closed='right')
 Interval(799000.0, 4495000.0, closed='right')
 Interval(599000.0, 799000.0, closed='right')]
Actual Labels: [Interval(349900.0, 449000.0, closed='right'), Interval(349900.0, 449000.0, closed='right'), Interval(449000.0, 599000.0, closed='right'), Interval(349900.0, 449000.0, closed='right'), Interval(449000.0, 599000.0, closed='right'), Interval(123499.999, 349900.0, closed='right'), Interval(449000.0, 599000.0, closed='right'), Interval(123499.999, 349900.0, closed='right'), Interval(799000.0, 4495000.0, closed='right'), Interval(599000.0, 799000.0, closed='right')]


## Save the trained model

In [26]:
# Save the model
deep_model.save("housing_model_trained.h5")

## Test the saved model, scaler, and label encoder

In [27]:
# Load the model, scaler and label encoder.
model = load_model("housing_model_trained.h5")
scaler = load("minmax_scaler.bin")
label_encoder = load("label_encoder.bin")

In [28]:
# Input data for testing.
input_data = np.array(np.array([X.iloc[0]]))

In [29]:
X.iloc[0]

bathrooms               1.000000
bedrooms                1.000000
house_age              60.000000
lot_size                0.000000
square_feet           735.000000
districtAVGcost    370601.141414
zipcodeAVGcost     402397.250000
Name: 0, dtype: float64

In [30]:
encoded_predictions = model.predict_classes(scaler.transform(input_data))
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"{prediction_labels[0].left}, {prediction_labels[0].right}")

123499.999, 349900.0
