<img src="https://news.illinois.edu/files/6367/543635/116641.jpg" alt="University of Illinois" width="250"/>

## HW: Deep Learning ##

HW submission by group (up to 4 people)
* John Doe <johndoe@illinois.edu>
* Jane Roes <janeroe@illinois.edu>

**Redfin Price Prediction**:  Download property data from Redfin <https://www.redfin.com/> for several neighborhoods of Chicago.  Use multilayer neural networks to predict price based upon the feature set
* Square Feet
* Property Type
* number of Beds
* number of Baths
* Year built
* HOA/Month

In [1]:
import torch
import numpy
import pandas as pd
import random
import matplotlib
#%matplotlib notebook
import matplotlib.pyplot as plt
import scipy.stats
#from pandas.plotting import autocorrelation_plot
import matplotlib.offsetbox as offsetbox
from matplotlib.ticker import StrMethodFormatter
import itertools
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def getfile(location_pair,**kwargs): #tries to get local version and then defaults to google drive version
    (loc,gdrive)=location_pair
    try:
        out=pd.read_csv(loc,**kwargs)
    except FileNotFoundError:
        print("local file not found; accessing Google Drive")
        loc = 'https://drive.google.com/uc?export=download&id='+gdrive.split('/')[-2]
        out=pd.read_csv(loc,**kwargs)
    return out

In [3]:
url="https://www.redfin.com"
fname=("redfin_data.csv","https://drive.google.com/file/d/1BFgKwV58YkPX_PRWMuKRHQoT6T0de_Qf/view?usp=sharing")
plot_title="Home Asking Price (Redfin)"
data_color="red"
markersize=2
thinlinesize=2
SEED=0

In [4]:
data_raw=getfile(fname)
data_raw

data = data_raw.copy()
data["SQUARE FEET/1000"]=data["SQUARE FEET"]/1000
data["PRICE/$1M"]=data["PRICE"]/1.0E6
data = data[['SALE TYPE', 'SOLD DATE', 'PROPERTY TYPE', 'ADDRESS', 'CITY',
       'STATE OR PROVINCE', 'ZIP OR POSTAL CODE', 'PRICE/$1M', 'BEDS', 'BATHS',
       'LOCATION', 'SQUARE FEET/1000', 'LOT SIZE', 'YEAR BUILT', 'DAYS ON MARKET',
       '$/SQUARE FEET', 'HOA/MONTH', 'STATUS', 'NEXT OPEN HOUSE START TIME',
       'NEXT OPEN HOUSE END TIME',
       'URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',
       'SOURCE', 'MLS#', 'FAVORITE', 'INTERESTED', 'LATITUDE', 'LONGITUDE']]

local file not found; accessing Google Drive


In [5]:
#Convert PROPERTY TYPE
df_encoded = pd.get_dummies(data, columns=['PROPERTY TYPE'])

df_encoded = df_encoded.select_dtypes(exclude=['object'])
columns_to_remove = ['SOLD DATE', 'ZIP OR POSTAL CODE', 'LOT SIZE', 'DAYS ON MARKET', '$/SQUARE FEET', 'MLS#', 'LATITUDE', 'LONGITUDE']
df_encoded = df_encoded.drop(columns=columns_to_remove)
df_encoded = df_encoded.fillna(df_encoded.mean())

df_encoded.head()

Unnamed: 0,PRICE/$1M,BEDS,BATHS,SQUARE FEET/1000,YEAR BUILT,HOA/MONTH,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Multi-Family (2-4 Unit),PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse,PROPERTY TYPE_Vacant Land
0,0.3399,3.0,2.0,1.52,1958.0,749.0,0,0,1,0,0
1,0.199,2.0,1.0,0.9,1959.0,452.0,1,0,0,0,0
2,1.075,5.0,3.5,3.135,2023.0,749.0,0,0,1,0,0
3,0.825,4.0,2.5,3.219,2019.0,749.0,0,0,0,1,0
4,0.525,3.0,1.5,1.6,1949.0,749.0,0,0,1,0,0


# Method 1

In [10]:
class FeedForwardNN(nn.Module):
    def __init__(self, input_size, layer_size1=64, layer_size2=32):
        super(FeedForwardNN, self).__init__()

        self.fc1 = nn.Linear(input_size, layer_size1)

        self.fc2 = nn.Linear(layer_size1, layer_size2)

        self.fc3 = nn.Linear(layer_size2, 1)

    def forward(self, x):

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.relu(self.fc3(x))

input_size = 10  # 10 predictors after turning Property Type into 5 predictors
model = FeedForwardNN(input_size)

print(model)

FeedForwardNN(
  (fc1): Linear(in_features=10, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
)


In [11]:
X_tensor = torch.FloatTensor(df_encoded.drop('PRICE/$1M', axis=1).values)
y_tensor = torch.FloatTensor(df_encoded['PRICE/$1M'].values).view(-1, 1)

In [12]:
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Experimented with learning rate

epochs = 100
for epoch in range(epochs):
    outputs = model(X_tensor)
    loss = loss_function(outputs, y_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 1.3361
Epoch [20/100], Loss: 1.3361
Epoch [30/100], Loss: 1.3361
Epoch [40/100], Loss: 1.3361
Epoch [50/100], Loss: 1.3361
Epoch [60/100], Loss: 1.3361
Epoch [70/100], Loss: 1.3361
Epoch [80/100], Loss: 1.3361
Epoch [90/100], Loss: 1.3361
Epoch [100/100], Loss: 1.3361


In [13]:
X_tensor_20 = X_tensor[:20]  # Testing model performance on first 20 rows

In [14]:
model.eval()
with torch.no_grad():
    predictions_20 = model(X_tensor_20)

In [15]:
predicted_prices = predictions_20.numpy()
for i, price in enumerate(predicted_prices):
    print(f"Row {i + 1}: Predicted Price = {price[0]:.2f}")

Row 1: Predicted Price = 0.00
Row 2: Predicted Price = 0.00
Row 3: Predicted Price = 0.00
Row 4: Predicted Price = 0.00
Row 5: Predicted Price = 0.00
Row 6: Predicted Price = 0.00
Row 7: Predicted Price = 0.00
Row 8: Predicted Price = 0.00
Row 9: Predicted Price = 0.00
Row 10: Predicted Price = 0.00
Row 11: Predicted Price = 0.00
Row 12: Predicted Price = 0.00
Row 13: Predicted Price = 0.00
Row 14: Predicted Price = 0.00
Row 15: Predicted Price = 0.00
Row 16: Predicted Price = 0.00
Row 17: Predicted Price = 0.00
Row 18: Predicted Price = 0.00
Row 19: Predicted Price = 0.00
Row 20: Predicted Price = 0.00


In [16]:
actual_prices_20 = y_tensor[:20].numpy()
predicted_prices = predicted_prices.reshape(-1)
actual_prices_20 = actual_prices_20.reshape(-1)

In [17]:
comparison_df = pd.DataFrame({
    'Actual Price': actual_prices_20,
    'Predicted Price': predicted_prices
})

In [18]:
mae = numpy.mean(numpy.abs(predicted_prices - actual_prices_20))
normalized_mae = mae / numpy.mean(actual_prices_20)
mean_relative_error = numpy.mean(numpy.abs(predicted_prices - actual_prices_20)/actual_prices_20)
print(comparison_df)
print(mae)
print(normalized_mae)
print(mean_relative_error)

    Actual Price  Predicted Price
0       0.339900              0.0
1       0.199000              0.0
2       1.075000              0.0
3       0.825000              0.0
4       0.525000              0.0
5       0.450000              0.0
6       0.525000              0.0
7       1.350000              0.0
8       0.785000              0.0
9       0.859500              0.0
10      1.285000              0.0
11      0.850000              0.0
12      0.195000              0.0
13      0.315000              0.0
14      0.899000              0.0
15      0.230000              0.0
16      0.299999              0.0
17      0.340000              0.0
18      0.350000              0.0
19      1.999999              0.0
0.68486995
1.0
1.0


# Method 2

In [8]:
X_tensor = torch.FloatTensor(df_encoded.drop('PRICE/$1M', axis=1).values)
y_tensor = torch.FloatTensor(df_encoded['PRICE/$1M'].values).view(-1, 1)

In [13]:
#Making Dataset Iterable
batch_size = 50
n_iters = 1000
num_epochs = n_iters / (len(df_encoded) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.TensorDataset(*[X_tensor,y_tensor])

test_loader = torch.utils.data.DataLoader(train_loader, 
                                          batch_size=batch_size, 
                                          shuffle=False)


In [14]:
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function 1: 10*10 --> 100
        self.fc1 = nn.Linear(input_dim, hidden_dim) 
        # Non-linearity 1
        self.relu1 = nn.ReLU()

        # Linear function 2: 100 --> 100
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        # Non-linearity 2
        self.relu2 = nn.ReLU()

        # Linear function 3: 100 --> 100
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        # Non-linearity 3
        self.relu3 = nn.ReLU()

        # Linear function 4 (readout): 100 --> 1
        self.fc4 = nn.Linear(hidden_dim, output_dim)  

    def forward(self, x):
        # Linear function 1
        out = self.fc1(x)
        # Non-linearity 1
        out = self.relu1(out)

        # Linear function 2
        out = self.fc2(out)
        # Non-linearity 2
        out = self.relu2(out)

        # Linear function 2
        out = self.fc3(out)
        # Non-linearity 2
        out = self.relu3(out)

        # Linear function 4 (readout)
        out = self.fc4(out)
        return out

In [15]:
input_dim = 10
hidden_dim = 100
output_dim = 1

model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)

In [16]:
criterion = nn.CrossEntropyLoss()

In [17]:
learning_rate = 0.1

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [18]:
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images with gradient accumulation capabilities
        images = images.view(-1, 10).requires_grad_()

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                # Load images with gradient accumulation capabilities
                images = images.view(-1, 28*28).requires_grad_()

                # Forward pass only to get logits/output
                outputs = model(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

RuntimeError: expected scalar type Long but found Float

In [19]:
X_tensor_20 = X_tensor[:20]

In [20]:
model.eval()
with torch.no_grad():
    predictions_20 = model(X_tensor_20)

In [21]:
predicted_prices = predictions_20.numpy()
for i, price in enumerate(predicted_prices):
    print(f"Row {i + 1}: Predicted Price = {price[0]:.2f}")

Row 1: Predicted Price = -14.44
Row 2: Predicted Price = -12.56
Row 3: Predicted Price = -14.82
Row 4: Predicted Price = -14.78
Row 5: Predicted Price = -14.38
Row 6: Predicted Price = -14.49
Row 7: Predicted Price = -12.13
Row 8: Predicted Price = -14.48
Row 9: Predicted Price = -14.27
Row 10: Predicted Price = -14.48
Row 11: Predicted Price = -14.31
Row 12: Predicted Price = -14.26
Row 13: Predicted Price = -12.34
Row 14: Predicted Price = -14.47
Row 15: Predicted Price = -14.19
Row 16: Predicted Price = -12.32
Row 17: Predicted Price = -13.73
Row 18: Predicted Price = -14.07
Row 19: Predicted Price = -12.82
Row 20: Predicted Price = -33.85


# Method 3

In [29]:
data = data_raw.copy()

data["SQUARE FEET/1000"]=data["SQUARE FEET"]/1000
data["PRICE/$1M"]=data["PRICE"]/1.0E6
data = data[["PRICE", "SQUARE FEET/1000", "PROPERTY TYPE", "BEDS", "BATHS", "YEAR BUILT", "HOA/MONTH"]]

In [30]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [31]:
data.head()

Unnamed: 0,PRICE,SQUARE FEET/1000,PROPERTY TYPE,BEDS,BATHS,YEAR BUILT,HOA/MONTH
0,339900,1.52,Single Family Residential,3.0,2.0,1958.0,
1,199000,0.9,Condo/Co-op,2.0,1.0,1959.0,452.0
2,1075000,3.135,Single Family Residential,5.0,3.5,2023.0,
3,825000,3.219,Townhouse,4.0,2.5,2019.0,
4,525000,1.6,Single Family Residential,3.0,1.5,1949.0,


# Preprocessing and building model

In [32]:
df_preprocessed = data.copy()

minmax_scaler_label = MinMaxScaler()
minmax_scaler_features = MinMaxScaler()
label = "PRICE"
numerical_vars = ["SQUARE FEET/1000", "BEDS", "BATHS", "YEAR BUILT", "HOA/MONTH"]
df_preprocessed[numerical_vars] = df_preprocessed[numerical_vars].fillna(df_preprocessed[numerical_vars].mean())
df_preprocessed[numerical_vars] = minmax_scaler_features.fit_transform(df_preprocessed[numerical_vars])

df_preprocessed[label] = minmax_scaler_label.fit_transform(df_preprocessed[label].values.reshape(-1,1))


#Convert PROPERTY TYPE as a set of binary variables for each modality (One-Hot Encoding)
df_preprocessed = pd.get_dummies(df_preprocessed, columns=['PROPERTY TYPE'])
df_preprocessed.head()


Unnamed: 0,PRICE,SQUARE FEET/1000,BEDS,BATHS,YEAR BUILT,HOA/MONTH,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Multi-Family (2-4 Unit),PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse,PROPERTY TYPE_Vacant Land
0,0.04449,0.088436,0.285714,0.181818,0.591195,0.153929,0,0,1,0,0
1,0.019604,0.028838,0.142857,0.0,0.597484,0.074283,1,0,0,0,0
2,0.17432,0.24368,0.571429,0.454545,1.0,0.153929,0,0,1,0,0
3,0.130166,0.251754,0.428571,0.272727,0.974843,0.153929,0,0,0,1,0
4,0.077181,0.096126,0.285714,0.090909,0.534591,0.153929,0,0,1,0,0


In [33]:
class FeedForwardNN(nn.Module):
    def __init__(self, input_size, layer_size1=64, layer_size2=32):
        super(FeedForwardNN, self).__init__()

        self.fc1 = nn.Linear(input_size, layer_size1)

        self.fc2 = nn.Linear(layer_size1, layer_size2)

        self.fc3 = nn.Linear(layer_size2, 1)

    def forward(self, x):

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.relu(self.fc3(x))

input_size = 10  # 10 predictors after turning Property Type into 5 predictors
model = FeedForwardNN(input_size)

print(model)

FeedForwardNN(
  (fc1): Linear(in_features=10, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
)


In [34]:
X_tensor = torch.FloatTensor(df_preprocessed.drop(label, axis=1).values)
y_tensor = torch.FloatTensor(df_preprocessed[label].values).view(-1, 1)

In [35]:
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Experimented with learning rate

epochs = 10000
for epoch in range(epochs):
    outputs = model(X_tensor)
    loss = loss_function(outputs, y_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 500 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [500/10000], Loss: 0.0026
Epoch [1000/10000], Loss: 0.0024
Epoch [1500/10000], Loss: 0.0024
Epoch [2000/10000], Loss: 0.0024
Epoch [2500/10000], Loss: 0.0024
Epoch [3000/10000], Loss: 0.0023
Epoch [3500/10000], Loss: 0.0025
Epoch [4000/10000], Loss: 0.0024
Epoch [4500/10000], Loss: 0.0024
Epoch [5000/10000], Loss: 0.0024
Epoch [5500/10000], Loss: 0.0024
Epoch [6000/10000], Loss: 0.0024
Epoch [6500/10000], Loss: 0.0024
Epoch [7000/10000], Loss: 0.0024
Epoch [7500/10000], Loss: 0.0024
Epoch [8000/10000], Loss: 0.0024
Epoch [8500/10000], Loss: 0.0024
Epoch [9000/10000], Loss: 0.0024
Epoch [9500/10000], Loss: 0.0024
Epoch [10000/10000], Loss: 0.0024


# Testing performances

In [36]:
model.eval()
with torch.no_grad():
    predictions = model(X_tensor)

In [37]:
predicted_prices = predictions.numpy()
for i, price in enumerate(predicted_prices[:20]):
    print(f"Row {i + 1}: Predicted Price = {price[0]:.2f}")

Row 1: Predicted Price = 0.06
Row 2: Predicted Price = 0.02
Row 3: Predicted Price = 0.18
Row 4: Predicted Price = 0.13
Row 5: Predicted Price = 0.08
Row 6: Predicted Price = 0.06
Row 7: Predicted Price = 0.08
Row 8: Predicted Price = 0.22
Row 9: Predicted Price = 0.12
Row 10: Predicted Price = 0.14
Row 11: Predicted Price = 0.21
Row 12: Predicted Price = 0.13
Row 13: Predicted Price = 0.02
Row 14: Predicted Price = 0.04
Row 15: Predicted Price = 0.14
Row 16: Predicted Price = 0.03
Row 17: Predicted Price = 0.04
Row 18: Predicted Price = 0.04
Row 19: Predicted Price = 0.05
Row 20: Predicted Price = 0.34


In [38]:
df_preprocessed["Predicted_price"] = predicted_prices

In [39]:
df_results = df_preprocessed[["PRICE", "Predicted_price"]]
df_results = pd.DataFrame(minmax_scaler_label.inverse_transform(df_results))
df_results.columns = ["PRICE", "Predicted_price"]
df_results

Unnamed: 0,PRICE,Predicted_price
0,339900.0,4.193407e+05
1,199000.0,2.037115e+05
2,1075000.0,1.082220e+06
3,825000.0,8.254069e+05
4,525000.0,5.242911e+05
...,...,...
103,350000.0,8.841541e+05
104,344900.0,3.512430e+05
105,5750000.0,5.775078e+06
106,110000.0,8.841541e+05


In [40]:
from google.colab import autoviz

def scatter_plots(df, colname_pairs, figscale=1, alpha=.8):
  from matplotlib import pyplot as plt
  plt.figure(figsize=(len(colname_pairs) * 6 * figscale, 6 * figscale))
  for plot_i, (x_colname, y_colname) in enumerate(colname_pairs, start=1):
    ax = plt.subplot(1, len(colname_pairs), plot_i)
    df.plot(kind='scatter', x=x_colname, y=y_colname, s=(32 * figscale), alpha=alpha, ax=ax)
    ax.spines[['top', 'right',]].set_visible(False)
  plt.tight_layout()
  return autoviz.MplChart.from_current_mpl_state()

chart = scatter_plots(df_results, *[[['PRICE', 'Predicted_price']]], **{})
chart

ModuleNotFoundError: No module named 'google.colab'

In [41]:
mae = numpy.mean(numpy.abs(df_results.Predicted_prices - df_results.PRICE))
mean_relative_error = numpy.mean(numpy.abs(df_results.Predicted_prices - df_results.PRICE)/df_results.PRICE)

print(mae)
print(mean_relative_error)

AttributeError: 'DataFrame' object has no attribute 'Predicted_prices'