# 房價預測模型
---
模型01 - 對照組

使用Kaggle資料集預測房屋價格

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Load the King County house-sales CSV from the working directory.
data = pd.read_csv("kc_house_data.csv")
# A bare `data.shape` in the middle of a cell is silently discarded (only the
# last expression of a notebook cell is echoed), so print it explicitly.
print(data.shape)

# Raise the column display limit so no column of the preview is elided.
pd.options.display.max_columns = 25
data.head()

# data.dtypes

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# The raw "date" strings start with YYYYMMDD (e.g. "20141013T000000"); cut
# that prefix into three numeric columns, then discard the columns that are
# no longer needed as features.
date_text = data["date"].str
for column, (start, stop) in {"year": (0, 4), "month": (4, 6), "day": (6, 8)}.items():
    data[column] = pd.to_numeric(date_text.slice(start, stop))

data.drop(columns=["id", "date"], inplace=True)
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,month,day
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,2014,10,13
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,2014,12,9
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,2015,2,25
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,2014,12,9
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,2015,2,18


In [4]:
# Shuffle the row labels once, then cut the shuffled order at 60% and 80%
# to obtain the train / validation / test partitions.
data_num = len(data)
indexs = np.random.permutation(data_num)

train_end = int(data_num * 0.6)
val_end = int(data_num * 0.8)
train_indexs, val_indexs, test_indexs = np.split(indexs, [train_end, val_end])

train_data, val_data, test_data = (
    data.loc[rows] for rows in (train_indexs, val_indexs, test_indexs)
)

train_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,month,day
18364,260000.0,3,1.75,1670,8511,1.0,0,0,3,7,1340,330,1985,0,98001,47.3257,-122.276,1580,7218,2014,8,29
12816,760000.0,5,2.0,3920,5250,1.5,0,0,5,7,2560,1360,1910,0,98144,47.5798,-122.294,1830,4240,2014,12,8
5190,772650.0,4,2.5,2660,10800,1.0,0,0,3,7,2660,0,1955,2014,98005,47.5894,-122.172,2640,10800,2014,6,26
16849,392000.0,3,2.25,1790,7125,1.0,0,0,3,7,1220,570,1974,0,98034,47.7184,-122.226,2040,7950,2014,9,2
12354,450000.0,3,2.5,1670,2589,2.0,0,0,3,8,1670,0,2000,0,98027,47.5314,-122.047,1670,2897,2014,8,13


In [5]:
# Standardise every column (features and the "price" target) to zero mean and
# unit variance. The statistics are estimated on the training split ONLY and
# then re-used for the validation split: scaling validation data with its own
# mean/std would put the two splits on different scales, so the validation
# loss/MAE would not measure the same quantity the model is trained on.
mean = train_data.mean()
std = train_data.std()

train_data = (train_data - mean) / std
val_data = (val_data - mean) / std

# val_data.mean()
# val_data.std()

In [6]:
# Separate the "price" target from the remaining feature columns and convert
# both splits to NumPy arrays for Keras.
x_train = np.array(train_data.drop(columns="price"))
x_val = np.array(val_data.drop(columns="price"))
y_train = np.array(train_data["price"])
y_val = np.array(val_data["price"])

# Sanity-check the validation shapes.
for split_array in (x_val, y_val):
    print(split_array.shape)

(4323, 21)
(4323,)


In [7]:
# Baseline regressor: two 64-unit ReLU hidden layers followed by a single
# linear output unit that predicts the (standardised) price.
model = keras.Sequential(name="model-1")

# Take the input width from the prepared feature matrix instead of
# hard-coding it, so the model follows any change to the feature columns.
model.add(layers.Dense(64, activation="relu", input_shape=(x_train.shape[1],)))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(1))

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "model-1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                1408      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 5,633
Trainable params: 5,633
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Train with Adam (step size 0.001) on mean squared error, and additionally
# track mean absolute error as a metric.
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)

In [9]:
# Directory that receives the checkpoint file; created on first use.
model_dir = "logs/models/"
checkpoint_dir_missing = not os.path.exists(model_dir)
if checkpoint_dir_missing:
    print("new dir!")
    os.makedirs(model_dir)

# TensorBoard event files for this run are written to logs/model-1.
log_dir = os.path.join("logs", "model-1")
model_cdk = keras.callbacks.TensorBoard(log_dir=log_dir)

# Save the model only when the validation MAE reaches a new minimum.
checkpoint_path = model_dir + "best-model-1.h5"
model_mckp = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_mean_absolute_error",
    mode="min",
    save_best_only=True,
)

In [10]:
# Fit for 100 epochs in mini-batches of 64, evaluating on the validation
# split each epoch and running the TensorBoard and checkpoint callbacks.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    validation_data=(x_val, y_val),
    callbacks=[model_cdk, model_mckp],
)

Epoch 1/100


InternalError:  Attempting to perform BLAS operation using StreamExecutor without BLAS support
	 [[node model-1/dense/MatMul (defined at <ipython-input-10-0433decf0386>:1) ]] [Op:__inference_train_function_656]

Function call stack:
train_function


In [None]:
# Training vs. validation loss per epoch (the compiled loss is MSE).
for history_key, curve_label in (("loss", "train"), ("val_loss", "val")):
    plt.plot(history.history[history_key], label=curve_label)

plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend(loc="upper right")
plt.title('MSE')

In [None]:
# Training vs. validation mean absolute error per epoch.
for history_key, curve_label in (
    ("mean_absolute_error", "train"),
    ("val_mean_absolute_error", "val"),
):
    plt.plot(history.history[history_key], label=curve_label)

plt.xlabel("epochs")
plt.ylabel("metrics")
plt.legend(loc="upper right")
plt.title('MAE')

In [None]:
# Restore the checkpoint with the lowest validation MAE.
model.load_weights("logs/models/best-model-1.h5")

# Re-select the raw (un-normalised) rows from `data` instead of reading the
# `train_data` / `test_data` variables: `train_data` has already been
# standardised, and going back to `data` also keeps this cell safe to re-run.
# NOTE(review): assumes `data` still holds the un-normalised frame at this
# point - confirm no other cell rescales it in place.
train_raw = data.loc[train_indexs]
test_raw = data.loc[test_indexs]

# Ground-truth prices in their original units.
y_test = np.array(test_raw["price"])

# The network was fitted on inputs and targets scaled with the TRAINING
# mean/std, so the test features must be scaled with those same statistics
# and the predictions mapped back with them. Using test-set statistics here
# would feed the model differently-scaled inputs and leak test information
# into the evaluation.
mean = train_raw.mean()
std = train_raw.std()
test_data = (test_raw - mean) / std
x_test = np.array(test_data.drop("price", axis=1))

y_pred = model.predict(x_test)

# Undo the target standardisation and flatten (N, 1) -> (N,) to match y_test.
y_pred = np.reshape(y_pred * std["price"] + mean["price"], y_test.shape)

# Mean absolute error expressed as a percentage of the mean true price.
precentage_error = np.mean(np.abs(y_test - y_pred)) / np.mean(y_test) * 100

print("Model precentage error: {:.2f}%".format(precentage_error))

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir logs

In [None]:
# %reload_ext tensorboard