In [1]:
import pandas as pd

df = pd.read_csv('miami-housing.csv')
df.head()

Unnamed: 0,LATITUDE,LONGITUDE,PARCELNO,SALE_PRC,LND_SQFOOT,TOT_LVG_AREA,SPEC_FEAT_VAL,RAIL_DIST,OCEAN_DIST,WATER_DIST,CNTR_DIST,SUBCNTR_DI,HWY_DIST,age,avno60plus,month_sold,structure_quality
0,25.891031,-80.160561,622280070620,440000.0,9375,1753,0,2815.9,12811.4,347.6,42815.3,37742.2,15954.9,67,0,8,4
1,25.891324,-80.153968,622280100460,349000.0,9375,1715,0,4359.1,10648.4,337.8,43504.9,37340.5,18125.0,63,0,9,4
2,25.891334,-80.15374,622280100470,800000.0,9375,2276,49206,4412.9,10574.1,297.1,43530.4,37328.7,18200.5,61,0,2,4
3,25.891765,-80.152657,622280100530,988000.0,12450,2058,10033,4585.0,10156.5,0.0,43797.5,37423.2,18514.4,63,0,9,4
4,25.891825,-80.154639,622280100200,755000.0,12800,1684,16681,4063.4,10836.8,326.6,43599.7,37550.8,17903.4,42,0,7,4


In [2]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(df)

## Clustering

In [3]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(X)

Using [Silhouette Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) to evaluate the cluster. Silhouette score calculates the difference the mean distance between a cluster and a sample. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

In [4]:
from sklearn.metrics import silhouette_score

print(f'Silhouette score: {silhouette_score(X, labels)}')

Silhouette score: 0.1931373591832463


As shown above, the score is near zero which means there are overlapping clusters. 

## Principle Component Analysis

In [5]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

In [6]:
print(f'Silhouette score: {silhouette_score(X_reduced, labels)}')

Silhouette score: 0.20563003890914042


## Neural Network

In [12]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, r2_score

x = df.drop('SALE_PRC', axis=1)
y = df['SALE_PRC']

# Splitting
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42)

# Scale 
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

# Building NN
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=x_train_scaled.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

model.compile(loss='mean_squared_error', 
              optimizer='adam')

# Train
model.fit(x_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate model
y_pred = model.predict(x_test_scaled)
model_loss = model.evaluate(x_test_scaled, y_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean squared error: {mse:.4f}')
print(f'MSE on test set: {model_loss:.4f}')
print(f'R squared: {r2:.4f}')

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 265044492288.0000 - val_loss: 256375259136.0000
Epoch 2/50
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 262428164096.0000 - val_loss: 254050713600.0000
Epoch 3/50
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 255640895488.0000 - val_loss: 247621468160.0000
Epoch 4/50
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 258557788160.0000 - val_loss: 236190072832.0000
Epoch 5/50
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 227098935296.0000 - val_loss: 219500216320.0000
Epoch 6/50
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 210730532864.0000 - val_loss: 197893177344.0000
Epoch 7/50
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 187032158208.0000 - val_loss: 172870647808.0000
Epoch 8/50
[1m24

MSE is quite high indicating that the predictions made by the regression model have a large average squared difference from the actual target values.In the context of housing price prediction, this means that the model’s predictions are significantly off from the true housing prices. In the future by exploring further improvements such as hyperparameter tuning can help with the regression model.