# Housing Price Approximation with Deep Neural Networks

#### import statements

In [207]:
import os

In [208]:
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
from sklearn.feature_selection import mutual_info_classif
from tensorflow.keras.metrics import R2Score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import math

## Training Model with Features Found From Mutual Information functions and using pd.dropna()

### Loading and Transforming Categorical Data into One Hot Vector Representations

In [209]:
training_set = pd.read_csv('data/train.csv')
features=training_set

In [210]:
for column in features:
    if features[column].dtype == 'object':
        one_hot = pd.get_dummies(features[column]).astype(int)
        one_hot.columns = [f'{col}_{column}' for col in one_hot.columns]

        features = features.drop(columns=column)
        features = features.join(one_hot, lsuffix='_df1', rsuffix='_df2')

In [211]:
features = features.dropna()
sales_price = features['SalePrice']
features = features.drop(columns='SalePrice')

### Finding and storing most influential features using mutual information

In [212]:
# commented out to avoid retraining (it takes a good amount of time)
# most important features were determined and exist as "solified features" two cells below

#mutual_info = mutual_info_classif(features, sales_price)

In [213]:
# top_features_indices = np.argsort(mutual_info)[::-1][:15]
# selected_features = features.columns[top_features_indices]

In [214]:
# after first run through, these are the solidifed columns from the top feature select, kept here to 
# avoid rerunning the method on 288 columns

solidified_features = ['AllPub_Utilities', 'Pave_Street', 'CompShg_RoofMatl', 
                        'Gtl_LandSlope', 'Norm_Condition2', 'TA_GarageCond', 
                        'Y_CentralAir', 'SBrkr_Electrical', 'KitchenAbvGr', 
                        'Y_PavedDrive', 'Typ_Functional', 'TA_GarageQual', 
                        'GasA_Heating', 'TA_BsmtCond', 'WD_SaleType']

features = features[solidified_features]

### Breaking up avaliable data into training and testing

In [215]:
seed = 5
np.random.seed(seed)
tf.set_random_seed(seed)
train_index = np.random.choice(len(features), round(len(features) * 0.7), replace=False)

In [216]:
X_train = features.head(int(len(features) * 0.7))

y_train = sales_price.head(int(len(features) * 0.7))

X_test = features.tail(int(len(features) * 0.3))

y_test = sales_price.tail(int(len(features) * 0.3))

# converting arrays to numpy arrays
numpy_x_train = X_train.values
numpy_y_train = y_train.values

numpy_x_test = X_test.values
numpy_y_test = y_test.values

In [217]:
def z_normalization(feat_val):
    feat_mean = np.mean(feat_val, axis=0)
    feat_sd = np.std(feat_val, axis=0)
    # Perform normalization
    return (feat_val - feat_mean) / feat_sd

In [218]:
# #z normalize
# X_train = z_normalization(X_train)
# X_test = z_normalization(X_test)

### Initializing the model architecture and training model

In [219]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=8, activation='linear', input_shape=(15,)),
    tf.keras.layers.Dense(units=4, activation='linear'),
    tf.keras.layers.Dense(units=1, activation='linear')])
# Compile the model
model.compile(optimizer=optimizer, loss='mean_squared_error')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [220]:
model.fit(numpy_x_train, numpy_y_train, epochs=1000)

Epoch 1/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 42493800448.0000
Epoch 2/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 921us/step - loss: 42492690432.0000
Epoch 3/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step - loss: 42491265024.0000
Epoch 4/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step - loss: 42489352192.0000
Epoch 5/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 884us/step - loss: 42486820864.0000
Epoch 6/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 42483523584.0000 
Epoch 7/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 42479296512.0000 
Epoch 8/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 42473988096.0000 
Epoch 9/1000
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4246744

<keras.src.callbacks.history.History at 0x2272e7d0950>

### Predicting model results and finding performance scores

In [221]:
results = model.predict(numpy_x_test)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [222]:
differences = []
for i in range(len(results)):
    differences.append(results[i] - numpy_y_test[i])    
print(sum(differences)/len(differences))

[9012.71991257]


In [176]:
np.std(differences)

74987.71475726394

In [179]:
# calculate standard deviaption of results
metric = R2Score()
metric.update_state(numpy_y_test, results)
result = metric.result()
print(f"R2 SCORE: {result.numpy()}")
print(f"Standard deviation of test data: {np.std(numpy_y_test)}")
print(f"Standard deviation of predicted data: {np.std(results)}")
print(f"Average Home Value of Predicted Results {sum(results)/len(results)}")
print(f"Average Home Value of Actual Data {sum(numpy_y_test)/len(numpy_y_test)}")

Standard deviation of predicted data: 15457.7197265625


## Training Model with Features Found From Mutual Information functions and using pd.fillna(0)

In [96]:
# Calculating model again but with FillNa as opposed to drop NA
training_set = pd.read_csv('data/train.csv')
features=training_set

for column in features:
    if features[column].dtype == 'object':
        one_hot = pd.get_dummies(features[column]).astype(int)
        one_hot.columns = [f'{col}_{column}' for col in one_hot.columns]

        features = features.drop(columns=column)
        features = features.join(one_hot, lsuffix='_df1', rsuffix='_df2')

features = features.fillna(0)
sales_price = features['SalePrice']
features = features.drop(columns='SalePrice')

# after first run through, these are the solidifed columns from the top feature select, kept here to 
# avoid rerunning the method on 288 columns

solidified_features = ['AllPub_Utilities', 'Pave_Street', 'CompShg_RoofMatl', 
                        'Gtl_LandSlope', 'Norm_Condition2', 'TA_GarageCond', 
                        'Y_CentralAir', 'SBrkr_Electrical', 'KitchenAbvGr', 
                        'Y_PavedDrive', 'Typ_Functional', 'TA_GarageQual', 
                        'GasA_Heating', 'TA_BsmtCond', 'WD_SaleType']

features = features[solidified_features]

X_train = features.head(int(len(features) * 0.7))

y_train = sales_price.head(int(len(features) * 0.7))

X_test = features.tail(int(len(features) * 0.3))

y_test = sales_price.tail(int(len(features) * 0.3))

numpy_x_train = X_train.values
numpy_y_train = y_train.values

In [97]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=8, activation='linear', input_shape=(15,)),
    tf.keras.layers.Dense(units=4, activation='linear'),
    tf.keras.layers.Dense(units=1, activation='linear')])
# Compile the model
model.compile(optimizer=optimizer, loss='mean_squared_error')

model.fit(numpy_x_train, numpy_y_train, epochs=1000)

Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 871us/step - loss: 38950084608.0000 
Epoch 2/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 938us/step - loss: 38949408768.0000
Epoch 3/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 845us/step - loss: 38948343808.0000
Epoch 4/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 875us/step - loss: 38946516992.0000
Epoch 5/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 863us/step - loss: 38943645696.0000
Epoch 6/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 38939426816.0000 
Epoch 7/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 924us/step - loss: 38933561344.0000
Epoch 8/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 994us/step - loss: 38925758464.0000
Epoch 9/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 953us/step - loss: 38915723264.0

<keras.src.callbacks.history.History at 0x22723785050>

In [100]:
# calculating results and scores

results = model.predict(numpy_x_test)

# calculate R^2

metric = R2Score()
metric.update_state(numpy_y_test, results)
result = metric.result()
result.numpy()

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 935us/step


0.10560137

## Training Model with Features Commonly Used in Real Estate Approximation

In [203]:
training_set = pd.read_csv('data/train.csv')
features=training_set

for column in features:
    if features[column].dtype == 'object':
        one_hot = pd.get_dummies(features[column]).astype(int)
        one_hot.columns = [f'{col}_{column}' for col in one_hot.columns]

        features = features.drop(columns=column)
        features = features.join(one_hot, lsuffix='_df1', rsuffix='_df2')

features = features.fillna(0)
sales_price = features['SalePrice']
features = features.drop(columns='SalePrice')

# after first run through, these are the solidifed columns from the top feature select, kept here to 
# avoid rerunning the method on 288 columns

solidified_features = ['LotArea', 'OverallCond', 'YearBuilt', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr']

features = features[solidified_features]

X_train = features.head(int(len(features) * 0.7))

y_train = sales_price.head(int(len(features) * 0.7))

X_test = features.tail(int(len(features) * 0.3))

y_test = sales_price.tail(int(len(features) * 0.3))

numpy_x_train = X_train.values
numpy_y_train = y_train.values

numpy_x_test = X_test.values
numpy_y_test = y_test.values

In [204]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=8, activation='linear', input_shape=(8,)),
    tf.keras.layers.Dense(units=4, activation='linear'),
    tf.keras.layers.Dense(units=1, activation='linear')])
# Compile the model
model.compile(optimizer=optimizer, loss='mean_squared_error')

model.fit(numpy_x_train, numpy_y_train, epochs=1000)

Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 849us/step - loss: 37636644864.0000 
Epoch 2/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 854us/step - loss: 35957522432.0000
Epoch 3/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 811us/step - loss: 34114177024.0000
Epoch 4/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 840us/step - loss: 32037048320.0000
Epoch 5/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880us/step - loss: 29723760640.0000
Epoch 6/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 27250599936.0000 
Epoch 7/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 24767059968.0000 
Epoch 8/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 22472511488.0000 
Epoch 9/1000
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 20552067072.0000 

<keras.src.callbacks.history.History at 0x2272de81010>

In [205]:
results = model.predict(numpy_x_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [202]:
# calculate performance results
metric = R2Score()
metric.update_state(numpy_y_test, results)
result = metric.result()
print(f"R2 SCORE: {result.numpy()}")
print(f"Standard deviation of test data: {np.std(numpy_y_test)}")
print(f"Standard deviation of predicted data: {np.std(results)}")
print(f"Average Home Value of Predicted Results {sum(results)/len(results)}")
print(f"Average Home Value of Actual Data {sum(numpy_y_test)/len(numpy_y_test)}")

R2 SCORE: 0.10077953338623047
Standard deviation of test data: 78442.2361088657
Standard deviation of predicted data: 10938.005859375
Average Home Value of Predicted Results [181295.5]
Average Home Value of Actual Data 179073.0593607306


In [206]:
differences = []
for i in range(len(results)):
    differences.append(results[i] - numpy_y_test[i])    

print(sum(differences)/len(differences))

[2195.18535959]
