<a href="https://colab.research.google.com/github/EliasAguirre/housingprediction/blob/master/HousingMarketPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Tools

In [0]:
#tools to use
import pandas as pd
import numpy as np
import tensorflow as tf

# Curate Data

In [4]:
# Load the housing CSV (uploaded to the Colab workspace) into a DataFrame,
# i.e. a labeled table, and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer='housing.csv')
train_df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [0]:
# Drop the columns this notebook chooses not to model (address, seller,
# sale date, council area, region, suburb, sale method).
# `columns=` is the keyword form of `axis=1` (axis 1 = column, axis 0 = row).
new_train = train_df.drop(
    columns=[
        "Address",
        "SellerG",
        "Date",
        "CouncilArea",
        "Regionname",
        "Suburb",
        "Method",
    ]
)

In [6]:
# Preview the frame after dropping the unused columns.
new_train.head()

Unnamed: 0,Rooms,Type,Price,Postcode,Propertycount,Distance
0,3,h,1490000.0,3067,4019,3.0
1,3,h,1220000.0,3067,4019,3.0
2,3,h,1420000.0,3067,4019,3.0
3,3,h,1515000.0,3040,1543,7.5
4,2,h,670000.0,3042,3464,10.4


In [0]:
# The 'Type' column holds letters, but the model only works with numbers.
# Pull its raw values out as an array so they can be encoded numerically.
features_to_encode = new_train.loc[:, 'Type'].values

In [0]:
# The letters are saved in `features_to_encode`, so the original 'Type'
# column can now be removed from the frame.
new_train = new_train.drop(columns='Type')

In [0]:
from sklearn.preprocessing import OneHotEncoder

In [0]:
# scikit-learn's OneHotEncoder converts a categorical variable into binary indicator columns.
# OneHotEncoder is used here instead of LabelBinarizer.
# NOTE(review): the original note gave "values are numbers vs letters" as the reason — confirm.
oh = OneHotEncoder()

In [0]:
# "This creates a binary column for each category and returns a sparse matrix or dense array."
# The encoder expects a 2-D input, so add a trailing axis to turn the flat
# array of categories into a single column (unknown number of rows, 1 column).
encoded_type_features = oh.fit_transform(features_to_encode[:, np.newaxis])

In [12]:
# Convert the encoder output to a dense array to see what it looks like:
# one row per house, one 0/1 column per category.
encoded_type_features.toarray()

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [13]:
# List the categories the encoder found in the "Type" column.
oh.categories_

[array(['h', 't', 'u'], dtype=object)]

In [0]:
# Column labels for the final table: the 5 numeric variables followed by
# one column per property type produced by the one-hot encoder (3 types).
columns = ["Rooms", "Price", "Postcode", "Propertycount", "Distance"] + [
    "Type{}".format(type_number) for type_number in (1, 2, 3)
]

In [15]:
# Convert the DataFrame into a plain NumPy matrix so that columns can be appended.
# `.values` is used because `DataFrame.as_matrix()` is deprecated (it emits a
# FutureWarning) and is removed in later pandas versions.
new_matrix = new_train.values

  """Entry point for launching an IPython kernel.


In [16]:
# First row of the matrix: the "Type" column is gone, leaving only the
# first 5 variables, so the encoded type columns still need to be added.
new_matrix[0]

array([3.000e+00, 1.490e+06, 3.067e+03, 4.019e+03, 3.000e+00])

In [17]:
# Confirm the dimensions of the matrix: (rows, columns).
new_matrix.shape

(63023, 5)

In [0]:
# Append the one-hot type columns to the right of the numeric matrix.
# np.hstack joins 2-D arrays column-wise, so the row count is unchanged and
# the column count grows by the number of encoded categories.
matrix_with_columns = np.hstack((new_matrix, encoded_type_features.toarray()))

In [19]:
# Check the shape to make sure there are now 3 more columns (i.e. (x, 8)).
matrix_with_columns.shape

(63023, 8)

In [20]:
# All variables for the first house: the 5 original values followed by
# the 3 one-hot type flags.
matrix_with_columns[0]

array([3.000e+00, 1.490e+06, 3.067e+03, 4.019e+03, 3.000e+00, 1.000e+00,
       0.000e+00, 0.000e+00])

In [0]:
# scale - keep the algorithm from treating some features as more important than others
# just because they have larger raw values (e.g. a postal code looking more important
# than a room count because 66167 > 4).
# let's scale accordingly so that all values fall in the range [0, 1]

# another tool from sklearn
from sklearn.preprocessing import MinMaxScaler

In [0]:
# MinMaxScaler rescales values into the range given by `feature_range`,
# which is set here to [0, 1].
scaler = MinMaxScaler(feature_range=(0,1))

In [0]:
# Wrap the combined matrix in a DataFrame again before handing it to the scaler.
df = pd.DataFrame(matrix_with_columns)

In [24]:
# Preview the frame; the columns have no labels yet at this point.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.0,1490000.0,3067.0,4019.0,3.0,1.0,0.0,0.0
1,3.0,1220000.0,3067.0,4019.0,3.0,1.0,0.0,0.0
2,3.0,1420000.0,3067.0,4019.0,3.0,1.0,0.0,0.0
3,3.0,1515000.0,3040.0,1543.0,7.5,1.0,0.0,0.0
4,2.0,670000.0,3042.0,3464.0,10.4,1.0,0.0,0.0


In [0]:
# Learn each column's min/max from the data (fit), then rescale that same
# data with the learned parameters (transform).
scaled_train = scaler.fit(df).transform(df)

In [26]:
# Inspect the first scaled row to help visualize the result: one value per column.
scaled_train[0]

array([0.06666667, 0.12640576, 0.06836735, 0.18416547, 0.04680187,
       1.        , 0.        , 0.        ])

In [0]:
# The scaled values are in a plain array; wrap them in a DataFrame and label
# the columns so we can see what we're doing.
# `columns` is the list of 8 names defined in an earlier cell.
scaled_train_df = pd.DataFrame(scaled_train, columns=columns)

In [28]:
# Visualize the labeled frame: the values have been scaled.
scaled_train_df.head()

Unnamed: 0,Rooms,Price,Postcode,Propertycount,Distance,Type1,Type2,Type3
0,0.066667,0.126406,0.068367,0.184165,0.046802,1.0,0.0,0.0
1,0.066667,0.102114,0.068367,0.184165,0.046802,1.0,0.0,0.0
2,0.066667,0.120108,0.068367,0.184165,0.046802,1.0,0.0,0.0
3,0.066667,0.128655,0.040816,0.069594,0.117005,1.0,0.0,0.0
4,0.033333,0.052632,0.042857,0.158484,0.162246,1.0,0.0,0.0


In [0]:
# The data contains NaN values that do not show up in the first few rows.
# Left alone they would affect training, so replace every NaN with the mean
# of its own column. This is better than nothing but NOT optimal.
# NOTE(review): the fill is applied to every column, so any missing "Price"
# (the prediction target) would also be replaced by the mean — confirm this is intended.

scaled_train_df = scaled_train_df.fillna(value=scaled_train_df.mean(axis=0))

In [0]:
# Split the table into model inputs and the value to predict.

# Inputs: every column except the to-be-predicted "Price".
X = scaled_train_df.drop(columns="Price").values


# Target: only the "Price" column, kept 2-D (one column) by selecting with a list.
Y = scaled_train_df.loc[:, ["Price"]].values

# Building the Model

In [0]:
# Declare the model.
# Keras offers several model types; Sequential is chosen here,
# which is a linear stack of layers.
model = tf.keras.Sequential()

In [0]:
# Build the network as three fully connected hidden layers plus one output unit.
# The hidden sizes (50, 100, 50) are hyperparameters picked for convenience.
# 'relu' passes positive activations through and zeroes out negative ones.

# A layer encapsulates both a state (its "weights") and a transformation
# from inputs to outputs (a "call", the layer's forward pass).
for hidden_units in (50, 100, 50):
    model.add(tf.keras.layers.Dense(hidden_units, activation='relu'))

# Single output unit with no activation: the predicted (scaled) price.
model.add(tf.keras.layers.Dense(1))

In [33]:
# Configure the model for training.
# compile() takes an optimizer and a loss. The optimizer can be given either
# as a configured object or simply by referencing its name.
# The first form would look like:
# sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='mean_squared_error', optimizer=sgd)

# Here the optimizer is referenced by name.
# The 'adam' optimizer is an algorithm for first-order gradient-based
# optimization of stochastic objective functions, based on adaptive estimates of lower-order moments.
model.compile(optimizer='adam', loss='mean_squared_error')

Instructions for updating:
Colocations handled automatically by placer.


In [34]:
# Train on every row from index 10 onward; the first 10 rows are left out of training.
#
# model.fit "trains the model for a given number of epochs (iterations on a dataset)."
#   x       - NumPy array of training data (a single array in this case;
#             otherwise it would be a list of NumPy arrays)
#   y       - NumPy array of target (label) data, or a list of NumPy arrays
#   epochs  - int, number of epochs to train for; one epoch is one pass
#             over the entire x and y data provided
#   shuffle - bool, whether to shuffle the training data before each epoch
#   verbose - 0 = silent, 1 = progress bar, 2 = one line per epoch

model.fit(x=X[10:], y=Y[10:], epochs=50, shuffle=True, verbose=2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
 - 3s - loss: 0.0014
Epoch 2/50
 - 2s - loss: 0.0013
Epoch 3/50
 - 2s - loss: 0.0013
Epoch 4/50
 - 2s - loss: 0.0013
Epoch 5/50
 - 2s - loss: 0.0013
Epoch 6/50
 - 2s - loss: 0.0012
Epoch 7/50
 - 2s - loss: 0.0012
Epoch 8/50
 - 2s - loss: 0.0012
Epoch 9/50
 - 2s - loss: 0.0012
Epoch 10/50
 - 2s - loss: 0.0012
Epoch 11/50
 - 2s - loss: 0.0012
Epoch 12/50
 - 2s - loss: 0.0012
Epoch 13/50
 - 2s - loss: 0.0012
Epoch 14/50
 - 2s - loss: 0.0012
Epoch 15/50
 - 3s - loss: 0.0012
Epoch 16/50
 - 3s - loss: 0.0012
Epoch 17/50
 - 3s - loss: 0.0012
Epoch 18/50
 - 3s - loss: 0.0012
Epoch 19/50
 - 2s - loss: 0.0012
Epoch 20/50
 - 2s - loss: 0.0012
Epoch 21/50
 - 2s - loss: 0.0012
Epoch 22/50
 - 2s - loss: 0.0012
Epoch 23/50
 - 2s - loss: 0.0012
Epoch 24/50
 - 2s - loss: 0.0012
Epoch 25/50
 - 2s - loss: 0.0012
Epoch 26/50
 - 2s - loss: 0.0012
Epoch 27/50
 - 2s - loss: 0.0012
Epoch 28/50
 - 2s - lo

<tensorflow.python.keras.callbacks.History at 0x7f2b2d4256d8>

## Predictions

In [0]:
# Keras predict call.
# X[:1] is a slice holding only the first row of X (it stays 2-D), so the
# model returns a single predicted price, still on the scaled range.
# That row is one of the first 10 rows that were left out of model.fit.
prediction = model.predict(X[:1])

In [36]:
# Show the raw (still scaled) prediction.
prediction

array([[0.09369746]], dtype=float32)

In [0]:
# Fetch the MinMaxScaler parameters for the Price column so a scaled
# prediction can be converted back into dollars.
# The scaler keeps one scale_/min_ entry per fitted column, in the same order
# as the labels in `columns`, so look the position up by name instead of
# hard-coding index 1.
price_index = columns.index("Price")
multiplier = scaler.scale_[price_index]
adder = scaler.min_[price_index]

In [38]:
# Scale the prediction back to a dollar amount.
# The scaler computed scaled = original * multiplier + adder, so the inverse
# is original = (scaled - adder) / multiplier.
pred = prediction[0, 0]
# NOTE: `format(pred)` is the builtin format() passed as a second print argument.
print('Prediction with scaling = ',format(pred))
pred = (pred - adder) / multiplier
print("Housing Price Prediction - ${}".format(pred))

Prediction with scaling =  0.0936974585056305
Housing Price Prediction - $1126447.251290083
