<a href="https://colab.research.google.com/github/DorotaJanosz/neural-network-course/blob/master/04_regression/01_house_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import bibliotek

In [15]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Display options: wide, non-scientific NumPy printing and fixed 6-decimal pandas floats.
np.set_printoptions(precision=12, suppress=True, linewidth=150)
pd.options.display.float_format = '{:.6f}'.format

# Seed the global NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable on "Restart & Run All".
SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

tf.__version__

'2.6.0'

## Załadowanie danych i wstępna eksploracja

In [16]:
# Location of the housing CSV (served from Google Cloud Storage).
DATA_URL = 'https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv'

# Download the raw table and print its column/dtype/non-null summary.
raw_dataset = pd.read_csv(DATA_URL)
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [17]:
# Work on a copy so that raw_dataset stays untouched by the cleaning steps.
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [18]:
# Share of missing values per column (mean of the boolean NaN mask).
dataset.isnull().mean()

longitude            0.000000
latitude             0.000000
housing_median_age   0.000000
total_rooms          0.000000
total_bedrooms       0.010029
population           0.000000
households           0.000000
median_income        0.000000
median_house_value   0.000000
ocean_proximity      0.000000
dtype: float64

In [19]:
# Discard every row with at least one missing value, then verify that the
# per-column share of NaNs is zero.
dataset = dataset.dropna()

dataset.isnull().sum() / len(dataset)

longitude            0.000000
latitude             0.000000
housing_median_age   0.000000
total_rooms          0.000000
total_bedrooms       0.000000
population           0.000000
households           0.000000
median_income        0.000000
median_house_value   0.000000
ocean_proximity      0.000000
dtype: float64

In [20]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
dataset.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [21]:
# Summary (count, unique, top, freq) of the object-dtype columns only.
dataset.describe(include=['object'])

Unnamed: 0,ocean_proximity
count,20433
unique,5
top,<1H OCEAN
freq,9034


In [22]:
# Number of rows in each ocean_proximity category.
dataset.ocean_proximity.value_counts()

<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: ocean_proximity, dtype: int64

In [23]:
# Distribution of median_house_value (the column later used as the label).
px.histogram(dataset, x='median_house_value')

In [24]:
# Most frequent label values. 500001 is by far the most common one.
# NOTE(review): presumably an upper cap applied by the data provider — confirm.
dataset.median_house_value.value_counts()

500001.000000    958
137500.000000    119
162500.000000    116
112500.000000    103
187500.000000     92
                ... 
420500.000000      1
419000.000000      1
443000.000000      1
343600.000000      1
319500.000000      1
Name: median_house_value, Length: 3833, dtype: int64

In [25]:
# Remove the rows whose label equals 500001 and re-plot the label distribution.
is_capped = dataset['median_house_value'] == 500001
dataset = dataset.loc[~is_capped]
px.histogram(dataset, x='median_house_value')

In [26]:
# One-hot encode the categorical column(s); the first level is dropped so the
# dummies are not linearly redundant.
# The dtype is pinned to an integer type on purpose: newer pandas releases
# produce bool dummies by default, DataFrame.describe() leaves bool columns out
# of its numeric summary, and the mean/std based standardisation would then
# turn those columns into NaN.
dataset_dummies = pd.get_dummies(dataset, drop_first=True, dtype='uint8')
dataset_dummies.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


## Podział na zbiór treningowy i testowy

In [27]:
# 80/20 split: sample the training rows with a fixed seed; every row that was
# not sampled becomes part of the test set.
train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)
test_dataset = dataset_dummies.drop(index=train_dataset.index)

for split_name, split_frame in (('train_dataset', train_dataset), ('test_dataset', test_dataset)):
    print(f'{split_name} length: {len(split_frame)}')

train_dataset length: 15580
test_dataset length: 3895


In [28]:
# Preview of the training split (the original row index is preserved).
train_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
19234,-122.69,38.51,18.0,3364.0,501.0,1442.0,506.0,6.6854,313000.0,0,0,0,0
1859,-124.14,41.95,21.0,2696.0,578.0,1208.0,494.0,2.275,122400.0,0,0,0,1
51,-122.27,37.82,43.0,1868.0,456.0,1061.0,407.0,1.5045,93800.0,0,0,1,0
11192,-117.93,33.82,28.0,2444.0,555.0,1848.0,567.0,3.0179,198800.0,0,0,0,0
20355,-118.96,34.19,16.0,1807.0,346.0,587.0,296.0,1.9811,162500.0,0,0,0,0


In [29]:
# Preview of the test split (the original row index is preserved).
test_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,281500.0,0,0,1,0
13,-122.26,37.84,52.0,696.0,191.0,345.0,174.0,2.6736,191300.0,0,0,1,0
19,-122.27,37.84,52.0,1503.0,298.0,690.0,275.0,2.6033,162900.0,0,0,1,0
26,-122.28,37.85,49.0,1130.0,244.0,607.0,239.0,2.4597,93800.0,0,0,1,0
40,-122.26,37.83,52.0,1665.0,419.0,946.0,395.0,2.0978,155400.0,0,0,1,0


In [30]:
# Pairwise scatter plots of the label against a few selected features,
# coloured by the label value.
scatter_dims = ['median_house_value', 'housing_median_age', 'median_income', 'total_rooms']
px.scatter_matrix(
    train_dataset,
    dimensions=scatter_dims,
    color='median_house_value',
    height=700,
)

In [31]:
# Per-feature summary statistics of the training split (one row per feature).
# The label column is excluded because only the inputs get standardised.
train_stats = (
    train_dataset
    .describe()
    .drop(columns='median_house_value')
    .T
)
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,15580.0,-119.55878,2.006237,-124.35,-121.75,-118.49,-117.99,-114.47
latitude,15580.0,35.648614,2.147016,32.55,33.93,34.27,37.73,41.95
housing_median_age,15580.0,28.489217,12.505895,1.0,18.0,29.0,37.0,52.0
total_rooms,15580.0,2620.014506,2195.654212,2.0,1441.0,2112.0,3119.0,39320.0
total_bedrooms,15580.0,539.900578,424.499548,2.0,299.0,436.0,647.0,6445.0
population,15580.0,1441.193068,1160.542775,3.0,801.0,1179.0,1746.0,35682.0
households,15580.0,501.099487,385.039835,2.0,283.0,411.0,605.0,6082.0
median_income,15580.0,3.672427,1.570297,0.4999,2.519975,3.4405,4.581425,15.0001
ocean_proximity_INLAND,15580.0,0.333569,0.471503,0.0,0.0,0.0,1.0,1.0
ocean_proximity_ISLAND,15580.0,0.000193,0.013876,0.0,0.0,0.0,0.0,1.0


In [32]:
# Separate the labels from the features. DataFrame.pop removes the column from
# the frame in place, so after this cell train_dataset/test_dataset hold the
# input features only. Running the cell a second time raises KeyError because
# the column is already gone.
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

## Standaryzacja danych

In [33]:
def norm(x, stats=None):
    """Standardise the columns of ``x`` to zero mean and unit variance.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; its columns are aligned by name with the index of
        ``stats``.
    stats : pandas.DataFrame, optional
        Table indexed by feature name with ``'mean'`` and ``'std'`` columns.
        Defaults to the module-level ``train_stats`` so that train and test
        data are scaled with the training statistics.

    Returns
    -------
    pandas.DataFrame
        ``(x - mean) / std`` computed column by column.
    """
    if stats is None:
        stats = train_stats
    # A constant feature has std == 0; dividing by it would produce NaN/inf.
    # Scale such a column by 1 instead, so it simply becomes all zeros.
    safe_std = stats['std'].replace(0, 1)
    return (x - stats['mean']) / safe_std

In [34]:
# Standardise both splits with the same helper, i.e. with the statistics that
# norm() reads from train_stats.
normed_train_data, normed_test_data = (norm(split) for split in (train_dataset, test_dataset))

In [35]:
# First rows of the standardised training features.
normed_train_data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
19234,-1.560743,1.332727,-0.838742,0.338845,-0.091639,0.000695,0.012727,1.918729,-0.707459,-0.013877,-0.346133,-0.37823
1859,-2.28349,2.93495,-0.598855,0.034607,0.089751,-0.200934,-0.018438,-0.889913,-0.707459,-0.013877,-0.346133,2.643727
51,-1.351396,1.011351,1.160315,-0.342501,-0.197646,-0.327599,-0.244389,-1.380585,-0.707459,-0.013877,2.888874,-0.37823


In [36]:
# First rows of the standardised test features.
normed_test_data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-1.346412,1.025324,1.879976,-0.190383,-0.249472,-0.457711,-0.257375,-0.298878,-0.707459,-0.013877,2.888874,-0.37823
13,-1.346412,1.020666,1.879976,-0.876283,-0.82191,-0.944552,-0.849521,-0.636075,-0.707459,-0.013877,2.888874,-0.37823
19,-1.351396,1.020666,1.879976,-0.508739,-0.569849,-0.647277,-0.587211,-0.680844,-0.707459,-0.013877,2.888874,-0.37823


In [37]:
# Convert the standardised frames to plain NumPy arrays for Keras.
# np.asarray accepts DataFrames and ndarrays alike, so re-running this cell is
# harmless; calling .values on an already converted ndarray would raise
# AttributeError.
normed_test_data = np.asarray(normed_test_data)
normed_train_data = np.asarray(normed_train_data)

## Budowa modelu

In [39]:
def build_model(input_dim=None):
    """Build and compile the fully connected regression network.

    Architecture: Dense(1024, relu, L2 kernel regulariser) -> Dense(512, relu)
    -> Dense(128, relu) -> Dense(1) with no activation. The model is compiled
    with the Adam optimiser, MSE loss and MAE/MSE metrics.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to the column count of the
        module-level ``train_dataset``, which must no longer contain the
        label column at that point.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled, untrained model.
    """
    if input_dim is None:
        input_dim = len(train_dataset.keys())

    model = Sequential()
    model.add(Dense(1024, kernel_regularizer='l2', activation='relu', input_shape=[input_dim]))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(128, activation='relu'))
    # Single linear unit: the network predicts one continuous value.
    model.add(Dense(1))

    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae', 'mse'])
    return model

In [40]:
# Instantiate the network and print its layer-by-layer parameter summary.
model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              13312     
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 603,905
Trainable params: 603,905
Non-trainable params: 0
_________________________________________________________________


## Trenowanie sieci

In [41]:
# Train for 150 epochs in mini-batches of 32, holding out 20% of the training
# rows for validation.
# NOTE(review): EarlyStopping is imported at the top of the notebook but is not
# passed as a callback here — confirm whether that is intentional.
history = model.fit(
    normed_train_data,
    train_labels.values,
    batch_size=32,
    epochs=150,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [42]:
def _plot_train_vs_val(hist, metric, title, yaxis_title):
    """Show one log-scale figure with the training and validation curve of ``metric``."""
    fig = go.Figure()
    for column in (metric, f'val_{metric}'):
        fig.add_trace(go.Scatter(x=hist['epoch'], y=hist[column], name=column, mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title=title, xaxis_title='Epoki', yaxis_title=yaxis_title, yaxis_type='log')
    fig.show()


def plot_hist(history):
    """Plot the training curves recorded in a Keras ``History`` object.

    Two figures are shown, both with a logarithmic y axis: MAE vs. validation
    MAE, and RMSE vs. validation RMSE (the square roots of the logged
    ``mse`` / ``val_mse`` values).
    """
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    for mse_column, rmse_column in (('mse', 'rmse'), ('val_mse', 'val_rmse')):
        hist[rmse_column] = np.sqrt(hist[mse_column])

    _plot_train_vs_val(hist, 'mae', 'MAE vs. VAL_MAE', 'Mean Absolute Error')
    _plot_train_vs_val(hist, 'rmse', 'RMSE vs. VAL_RMSE', 'Root Mean Squared Error')

plot_hist(history)

In [43]:
# Evaluate on the held-out test set and print one "name value" line per
# reported quantity (loss first, then the compiled metrics).
test_scores = model.evaluate(normed_test_data, test_labels.values)
for name, value in zip(model.metrics_names, test_scores):
    print(f'{name:8}{value:.4f}')

loss    2311819776.0000
mae     33657.5977
mse     2311819776.0000


In [44]:
# Predict on the test features; flatten() turns the model output into a 1-D array.
test_predictions = model.predict(normed_test_data).flatten()
test_predictions

array([204018.48, 164218.03, 149757.84, ...,  94150.81, 126509.66,  70822.72], dtype=float32)

In [45]:
# Put the true labels and the model predictions side by side, keeping the
# original row index of the test split.
pred = test_labels.to_frame().assign(predictions=test_predictions)
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500.0,204018.484375
13,191300.0,164218.03125
19,162900.0,149757.84375
26,93800.0,121481.9375
40,155400.0,156978.640625


In [46]:
# True value vs. prediction; the added line is the diagonal y = x between
# 0 and 500000, on which perfect predictions would fall.
diagonal = [0, 500000]
fig = px.scatter(pred, x='median_house_value', y='predictions')
fig.add_trace(go.Scatter(x=diagonal, y=diagonal, mode='lines'))
fig.show()

In [47]:
# NOTE(review): repeats the preview already shown right after pred was built.
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500.0,204018.484375
13,191300.0,164218.03125
19,162900.0,149757.84375
26,93800.0,121481.9375
40,155400.0,156978.640625


In [48]:
# Signed residual: true value minus prediction (positive = model under-predicts).
pred['error'] = pred['median_house_value'].sub(pred['predictions'])
pred.head()

Unnamed: 0,median_house_value,predictions,error
10,281500.0,204018.484375,77481.515625
13,191300.0,164218.03125,27081.96875
19,162900.0,149757.84375,13142.15625
26,93800.0,121481.9375,-27681.9375
40,155400.0,156978.640625,-1578.640625


In [49]:
# Distribution of the residuals, with a rug plot in the margin.
px.histogram(
    pred,
    x='error',
    marginal='rug',
    width=1000,
)