## Setup

In [1]:
# Import Dependencies.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import requests
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from joblib import dump, load

In [2]:
# Fetch the data from the API.
listings_json = requests.get("http://127.0.0.1:5000/housingDataAPI/v1.0/listings").json()

# Examine the data.
print(json.dumps(listings_json[0], indent=4, sort_keys=True))

{
    "address": "17452 NE GLISAN ST #7, Portland OR 97230",
    "bathrooms": 2.0,
    "bedrooms": 2,
    "built": 1988,
    "city": "Portland",
    "county": "Multnomah",
    "elementary_school": "Hartley",
    "high_school": "Reynolds",
    "home_type": "Manufactured - Double Wide Manufact",
    "lot_size": null,
    "middle_school": "Reynolds",
    "neighborhood": "unknown",
    "price": 72000,
    "square_feet": 1152,
    "zipcode": 97230
}


In [3]:
# Create a dataframe to use for our model.
data_df = pd.DataFrame(listings_json)

print(len(data_df))
data_df.head()

2056


Unnamed: 0,address,bathrooms,bedrooms,built,city,county,elementary_school,high_school,home_type,lot_size,middle_school,neighborhood,price,square_feet,zipcode
0,"17452 NE GLISAN ST #7, Portland OR 97230",2.0,2,1988,Portland,Multnomah,Hartley,Reynolds,Manufactured - Double Wide Manufact,,Reynolds,unknown,72000,1152,97230
1,"16000 SE POWELL BLVD 75, Portland OR 97236",2.0,3,1990,Portland,Multnomah,Powell Butte,Centennial,Manufactured - Double Wide Manufact,,Centennial,unknown,79950,1404,97236
2,"12846 SE RAMONA ST 6, Portland OR 97236",2.0,3,1997,Portland,Multnomah,Gilbert Hts,David Douglas,Manufactured - Double Wide Manufact,,Alice Ott,unknown,93900,1297,97236
3,"7720 S Macadam AVE 7, Portland OR 97219",3.0,3,1988,Portland,Multnomah,Other,Other,Floating Home - Contemporary,,Other,unknown,125000,2432,97219
4,"19609 NE Marine DR E-4, Portland OR 97230",1.0,1,1960,Portland,Multnomah,Salish Pond,Reynolds,Floating Home - Cabin,,Reynolds,unknown,129500,735,97230


# Data Preprocessing

In [4]:
# Make a copy of the original data frame to modify.
model_df = data_df

# Insert a lot value of 0 for condos and floating homes.
for index, row in model_df.iterrows():
    if ("Condo" in row["home_type"]) | ("Floating" in row["home_type"]):
        model_df.loc[index, "lot_size"] = 0
    else:
        pass

# Include only those columns that will be used in the deep learning model.
model_df = model_df.loc[:, ["bathrooms",
                            "bedrooms",
                            "built",
                            "lot_size",
                            "square_feet",
                            "home_type",
                            "high_school",
                            "zipcode",
                            "price"]
                       ]

# Drop rows with NaN entries.
model_df.dropna(inplace=True)

# Check the model data.
print(len(model_df))
model_df.head()

1947


Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,high_school,zipcode,price
3,3.0,3,1988,0.0,2432,Floating Home - Contemporary,Other,97219,125000
4,1.0,1,1960,0.0,735,Floating Home - Cabin,Reynolds,97230,129500
5,1.0,1,1974,0.0,720,Condo - Traditional,David Douglas,97236,141900
6,1.0,1,1927,0.0,382,Condo - Common Wall,Lincoln,97209,144900
7,1.0,1,2004,0.0,513,Condo - Other,David Douglas,97220,149900


In [5]:
# Simplify home types in model_df.
for i in model_df.index:
    if "Floating" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Floating"
    if "Condo" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Condo"
    if "Single Family" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Single Family"
    if "Manufactured" in model_df.at[i, "home_type"]:
        model_df.at[i, "home_type"] = "Manufactured"
    
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,high_school,zipcode,price
3,3.0,3,1988,0.0,2432,Floating,Other,97219,125000
4,1.0,1,1960,0.0,735,Floating,Reynolds,97230,129500
5,1.0,1,1974,0.0,720,Condo,David Douglas,97236,141900
6,1.0,1,1927,0.0,382,Condo,Lincoln,97209,144900
7,1.0,1,2004,0.0,513,Condo,David Douglas,97220,149900


In [6]:
# Create district df.
school_dict = ({"high_school" : ['Reynolds', 'Parkrose', 'David Douglas', 'Centennial', 'Cleveland',
        'Lincoln', 'Madison', 'Jefferson', 'Roosevelt', 'Sunset','Westview', 'Liberty', 'Beaverton', 
        'Grant', 'Southridge', 'Tigard', 'Wilson', 'Riverdale', 'Lake Oswego', 'Franklin',
        'Tualatin', 'Milwaukie', 'Scappoose'], "district" : ['Reynolds', 'Parkrose','David Douglas',
        'Centennial', 'Portland Public', 'Portland Public', 'Portland Public', 'Portland Public',
        'Portland Public', 'Beaverton', 'Beaverton', 'Hillsboro', 'Beaverton', 'Portland Public',
        'Beaverton', 'Tigard-Tualatin', 'Portland Public', 'Riverdale', 'Lake Oswego', 'Portland Public',
        'Tigard-Tualatin', 'North Clackamas', 'Scappose']})
district_df = pd.DataFrame(school_dict)

# Merge into model_df.
model_df = pd.merge(model_df, district_df, on="high_school")

# Drop the high_school column.
model_df.drop("high_school", axis=1, inplace=True)
print(len(model_df))
model_df.head()

1943


Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,zipcode,price,district
0,1.0,1,1960,0.0,735,Floating,97230,129500,Reynolds
1,2.0,2,1979,0.0,1073,Condo,97230,160000,Reynolds
2,3.0,2,2006,0.0,1020,Condo,97233,211000,Reynolds
3,2.0,3,1945,0.0,1150,Floating,97230,224500,Reynolds
4,2.0,2,1973,0.0,1638,Condo,97230,229900,Reynolds


In [7]:
# # Rank the home_types in order of mean home price.
# home_type = model_df[["price","home_type"]]
# home_typeAVG = home_type.groupby(["home_type"]).mean().sort_values(by=["price"], ascending=False)
# home_typeRanker = home_typeAVG.reset_index(drop=False)

# # Create a dictionary to rank the zipcode for a particular listing.
# home_type_ranker_dict = {}
# for index, row in home_typeRanker.iterrows():
#     home_type_ranker_dict[row["home_type"]] = index
# home_type_ranker_dict

In [8]:
# # Create a home_type ranking for each listing.
# model_df["home_type_rank"] = [home_type_ranker_dict[home_type] for home_type in model_df["home_type"]]
# Drop the home_type for each listing.
# model_df.drop("home_type", axis=1, inplace=True)
# model_df.head()

In [9]:
# # Rank the districts in order of mean home price.
# district = model_df[["price","district"]]
# districtAVG = district.groupby(["district"]).mean().sort_values(by=["price"], ascending=False)
# districtRanker = districtAVG.reset_index(drop=False)

# # Create a dictionary to rank the district for a particular listing.
# district_ranker_dict = {}
# for index, row in districtRanker.iterrows():
#     district_ranker_dict[row["district"]] = index
# district_ranker_dict

In [10]:
# # Create a district ranking for each listing.
# model_df["district_rank"] = [district_ranker_dict[district] for district in model_df["district"]]
# # Drop the district for each listing.
# model_df.drop("district", axis=1, inplace=True)
# model_df.head()

In [11]:
# # Rank the zipcodes in order of mean home price.
# zipcode = model_df[["price","zipcode"]]
# zipcodeAVG = zipcode.groupby(["zipcode"]).mean().sort_values(by=["price"], ascending=False)
# zipcodeRanker = zipcodeAVG.reset_index(drop=False)

# # Create a dictionary to rank the zipcode for a particular listing.
# zipcode_ranker_dict = {}
# for index, row in zipcodeRanker.iterrows():
#     zipcode_ranker_dict[int(row["zipcode"])] = index
# zipcode_ranker_dict

In [12]:
# # Create a zipcode ranking for each listing.
# model_df["zipcode_rank"] = [zipcode_ranker_dict[zipcode] for zipcode in model_df["zipcode"]]
# Drop the zipcode for each listing.
# model_df.drop("zipcode", axis=1, inplace=True)
# model_df.head()

In [13]:
# Bin prices into ten equal length ranges.
model_df["price_range"] = pd.qcut(model_df["price"], 5)
# Drop the original price data.
model_df.drop("price", axis=1, inplace=True)
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,home_type,zipcode,district,price_range
0,1.0,1,1960,0.0,735,Floating,97230,Reynolds,"(123499.999, 348340.0]"
1,2.0,2,1979,0.0,1073,Condo,97230,Reynolds,"(123499.999, 348340.0]"
2,3.0,2,2006,0.0,1020,Condo,97233,Reynolds,"(123499.999, 348340.0]"
3,2.0,3,1945,0.0,1150,Floating,97230,Reynolds,"(123499.999, 348340.0]"
4,2.0,2,1973,0.0,1638,Condo,97230,Reynolds,"(123499.999, 348340.0]"


In [14]:
# Get dummies for the values in home_type to use in the model.
model_df = pd.get_dummies(model_df, columns=["home_type","district","zipcode"])
model_df.head()

Unnamed: 0,bathrooms,bedrooms,built,lot_size,square_feet,price_range,home_type_Condo,home_type_Floating,home_type_Manufactured,home_type_Single Family,...,zipcode_97225,zipcode_97227,zipcode_97229,zipcode_97230,zipcode_97231,zipcode_97232,zipcode_97233,zipcode_97236,zipcode_97239,zipcode_97266
0,1.0,1,1960,0.0,735,"(123499.999, 348340.0]",0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2.0,2,1979,0.0,1073,"(123499.999, 348340.0]",1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3.0,2,2006,0.0,1020,"(123499.999, 348340.0]",1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2.0,3,1945,0.0,1150,"(123499.999, 348340.0]",0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,2.0,2,1973,0.0,1638,"(123499.999, 348340.0]",1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [15]:
# Assign X (input) and y (target).

X = model_df.drop("price_range", axis=1)
y = model_df["price_range"]

In [16]:
# Split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [17]:
# Create a MinMaxScaler model and fit it to the training data

X_scaler = MinMaxScaler().fit(X_train)

# Save the scalar.
dump(X_scaler, 'minmax_scaler.bin', compress=True)

['minmax_scaler.bin']

In [18]:
# Transform the training and testing data using the X_scaler and y_scaler models.

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Label encode the target data.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Save the label encoder
dump(label_encoder, 'label_encoder.bin', compress=True)

['label_encoder.bin']

In [20]:
# Convert encoded labels to one-hot encoding.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

## Run Random Forest Classifier

In [21]:
# Create a random forest classifier, fit to the training data, and score on the testing data.
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train_scaled, y_train_categorical)
print(rf.score(X_test_scaled, y_test_categorical))

# Find the importances of each feature.
feature_names = X.columns
importances = rf.feature_importances_
print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))

0.5761316872427984
[(0.3039031300131124, 'square_feet'), (0.1502461492277107, 'built'), (0.09013132725169214, 'bathrooms'), (0.08942538655509241, 'lot_size'), (0.07423816081124927, 'bedrooms'), (0.020151117325278307, 'district_Portland Public'), (0.016654539237920558, 'zipcode_97209'), (0.013453692412634733, 'home_type_Single Family'), (0.013447367368685476, 'home_type_Condo'), (0.013424733999524888, 'zipcode_97266'), (0.012452385192592777, 'zipcode_97217'), (0.01220417296667141, 'zipcode_97229'), (0.01117291836353492, 'zipcode_97206'), (0.011092732493363256, 'zipcode_97219'), (0.010220321800881463, 'zipcode_97211'), (0.010070429074895216, 'district_Beaverton'), (0.00991977806692704, 'zipcode_97202'), (0.009229361269160716, 'district_David Douglas'), (0.009229282265180482, 'zipcode_97201'), (0.0086838283343154, 'zipcode_97239'), (0.007821109349351542, 'zipcode_97210'), (0.0067940587391437865, 'zipcode_97203'), (0.006347414933143469, 'zipcode_97213'), (0.006234965066157125, 'zipcode_972

## Create a Deep Learning Model

In [22]:
# Create a deep learning Sequential model.
deep_model = Sequential()
deep_model.add(Dense(units=100, activation='relu', input_dim=54))
deep_model.add(Dense(units=100, activation='relu'))
deep_model.add(Dense(units=5, activation='softmax'))

In [23]:
# Compile and fit the model.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 1457 samples
Epoch 1/100
1457/1457 - 1s - loss: 1.5528 - accuracy: 0.2896
Epoch 2/100
1457/1457 - 0s - loss: 1.3527 - accuracy: 0.4180
Epoch 3/100
1457/1457 - 0s - loss: 1.1434 - accuracy: 0.5244
Epoch 4/100
1457/1457 - 0s - loss: 1.0061 - accuracy: 0.5806
Epoch 5/100
1457/1457 - 0s - loss: 0.9294 - accuracy: 0.6170
Epoch 6/100
1457/1457 - 0s - loss: 0.8729 - accuracy: 0.6404
Epoch 7/100
1457/1457 - 0s - loss: 0.8524 - accuracy: 0.6342
Epoch 8/100
1457/1457 - 0s - loss: 0.8236 - accuracy: 0.6706
Epoch 9/100
1457/1457 - 0s - loss: 0.8090 - accuracy: 0.6719
Epoch 10/100
1457/1457 - 0s - loss: 0.7960 - accuracy: 0.6644
Epoch 11/100
1457/1457 - 0s - loss: 0.7857 - accuracy: 0.6658
Epoch 12/100
1457/1457 - 0s - loss: 0.7741 - accuracy: 0.6795
Epoch 13/100
1457/1457 - 0s - loss: 0.7677 - accuracy: 0.6822
Epoch 14/100
1457/1457 - 0s - loss: 0.7582 - accuracy: 0.6802
Epoch 15/100
1457/1457 - 0s - loss: 0.7580 - accuracy: 0.6747
Epoch 16/100
1457/1457 - 0s - loss: 0.7358 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x1a50817d10>

## Quantify our Trained Model

In [24]:
model_loss, model_accuracy = deep_model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

486/1 - 0s - loss: 1.0845 - accuracy: 0.6584
Loss: 0.9257818293669586, Accuracy: 0.6584362387657166


## Make Predictions

In [25]:
# Use the first 10 test data values to make a prediction and compare it to the actual labels.
encoded_predictions = deep_model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: [Interval(348340.0, 449000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(348340.0, 449000.0, closed='right')
 Interval(348340.0, 449000.0, closed='right')
 Interval(825000.0, 4495000.0, closed='right')
 Interval(609000.0, 825000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')
 Interval(825000.0, 4495000.0, closed='right')
 Interval(449000.0, 609000.0, closed='right')]
Actual Labels: [Interval(123499.999, 348340.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(348340.0, 449000.0, closed='right'), Interval(123499.999, 348340.0, closed='right'), Interval(449000.0, 609000.0, closed='right'), Interval(825000.0, 4495000.0, closed='right'), Interval(609000.0, 825000.0, closed='right'), Interval(825000.0, 4495000.0, closed='right'), Interval(825000.0, 4495000.0, closed='right'), Interval(449000.0, 609000.0, closed='right')]


## Save the trained model

In [26]:
# Save the model
deep_model.save("housing_model_trained.h5")

## Test the saved model, scaler, and label encoder

In [27]:
# Load the model, scaler and label encoder.
model = load_model("housing_model_trained.h5")
scaler = load("minmax_scaler.bin")
label_encoder = load("label_encoder.bin")

In [28]:
# Input data for testing.
input_data = np.array(np.array([X.iloc[0]]))

In [29]:
X.iloc[0]

bathrooms                      1.0
bedrooms                       1.0
built                       1960.0
lot_size                       0.0
square_feet                  735.0
home_type_Condo                0.0
home_type_Floating             1.0
home_type_Manufactured         0.0
home_type_Single Family        0.0
district_Beaverton             0.0
district_Centennial            0.0
district_David Douglas         0.0
district_Hillsboro             0.0
district_Lake Oswego           0.0
district_North Clackamas       0.0
district_Parkrose              0.0
district_Portland Public       0.0
district_Reynolds              1.0
district_Riverdale             0.0
district_Scappose              0.0
district_Tigard-Tualatin       0.0
zipcode_97035                  0.0
zipcode_97201                  0.0
zipcode_97202                  0.0
zipcode_97203                  0.0
zipcode_97204                  0.0
zipcode_97205                  0.0
zipcode_97206                  0.0
zipcode_97209       

In [30]:
encoded_predictions = model.predict_classes(scaler.transform(input_data))
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f"{prediction_labels[0].left}, {prediction_labels[0].right}")

123499.999, 348340.0
