In [1]:
import keras
keras.__version__

Using TensorFlow backend.


'2.2.5'

In [2]:
from keras.datasets import boston_housing
(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()

Downloading data from https://s3.amazonaws.com/keras-datasets/boston_housing.npz


In [3]:
train_data.shape, test_data.shape

((404, 13), (102, 13))

In [4]:
train_targets[:5]

array([15.2, 42.3, 50. , 21.1, 17.7])

여기서 볼 수 있듯이 404개의 훈련 샘플과 102개의 테스트 샘플이 있고 모두 13개의 수치 특성을 가지고 있습니다. 13개의 특성은 다음과 같습니다:

1. Per capita crime rate.
2. Proportion of residential land zoned for lots over 25,000 square feet.
3. Proportion of non-retail business acres per town.
4. Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
5. Nitric oxides concentration (parts per 10 million).
6. Average number of rooms per dwelling.
7. Proportion of owner-occupied units built prior to 1940.
8. Weighted distances to five Boston employment centres.
9. Index of accessibility to radial highways.
10. Full-value property-tax rate per $10,000.
11. Pupil-teacher ratio by town.
12. 1000 * (Bk - 0.63) ** 2 where Bk is the proportion of Black people by town.
13. % lower status of the population.

타깃은 주택의 중간 가격으로 천달러 단위입니다:



In [5]:
train_data[3] # 각각 특성 스케일이 다르니 스케일링 ㄱㄱ

array([3.9610e-02, 0.0000e+00, 5.1900e+00, 0.0000e+00, 5.1500e-01,
       6.0370e+00, 3.4500e+01, 5.9853e+00, 5.0000e+00, 2.2400e+02,
       2.0200e+01, 3.9690e+02, 8.0100e+00])

In [6]:
mean = train_data.mean(axis = 0); mean

array([3.74511057e+00, 1.14801980e+01, 1.11044307e+01, 6.18811881e-02,
       5.57355941e-01, 6.26708168e+00, 6.90106436e+01, 3.74027079e+00,
       9.44059406e+00, 4.05898515e+02, 1.84759901e+01, 3.54783168e+02,
       1.27408168e+01])

In [7]:
std = train_data.std(axis = 0); std

array([9.22929073e+00, 2.37382770e+01, 6.80287253e+00, 2.40939633e-01,
       1.17147847e-01, 7.08908627e-01, 2.79060634e+01, 2.02770050e+00,
       8.68758849e+00, 1.66168506e+02, 2.19765689e+00, 9.39946015e+01,
       7.24556085e+00])

In [0]:
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

In [0]:
from keras import models
from keras import layers

def build_model():
  model = models.Sequential()
  model.add(layers.Dense(64, activation = 'relu', input_shape = (train_data.shape[1],)))
  model.add(layers.Dense(64, activation = 'relu'))
  model.add(layers.Dense(1)) # 마지막 층은 순수한 선형
  model.compile(optimizer = 'rmsprop', loss = 'mse', metrics = ['mae'])
  return model

# K-Fold Cross Varidation

In [10]:
len(train_data)

404

In [11]:
(len(train_data) // 4)

101

In [19]:
import numpy as np

k = 4
num_val_samples = len(train_data) // k 
num_epochs = 100
all_scores = []
for i in range(k):
  print('처리 중인 폴드: ', i)
  val_data = train_data[i * num_val_samples : (i + 1) * num_val_samples]
  val_targets = train_targets[i * num_val_samples : (i + 1) * num_val_samples]

  partial_train_data = np.concatenate(
      [train_data[:i * num_val_samples],
       train_data[(i + 1) * num_val_samples:]], axis = 0)
  partial_train_targets = np.concatenate(
      [train_targets[:i * num_val_samples],
       train_targets[(i + 1) * num_val_samples:]],
       axis = 0)
  
  model = build_model()
  model.fit(partial_train_data, partial_train_targets,
            epochs = num_epochs, batch_size = 1, verbose = 1)
  # k fold는 validation set이 필요없음
  # 이거는 epochs를 다 학습하고, 결과를 보는 거임
  val_mae = model.evaluate(val_data, val_targets, verbose = 1)
  all_scores.append(val_mae)

처리 중인 폴드:  0
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77

In [20]:
import numpy as np

k = 4
num_val_samples = len(train_data) // k 
num_epochs = 100
all_score = []
for i in range(k):
  print('처리 중인 폴드: ', i)
  val_data = train_data[i * num_val_samples : (i + 1) * num_val_samples]
  val_targets = train_targets[i * num_val_samples : (i + 1) * num_val_samples]

  partial_train_data = np.concatenate(
      [train_data[:i * num_val_samples],
       train_data[(i + 1) * num_val_samples:]], axis = 0)
  partial_train_targets = np.concatenate(
      [train_targets[:i * num_val_samples],
       train_targets[(i + 1) * num_val_samples:]],
       axis = 0)
  
  model = build_model()
  history = model.fit(partial_train_data, partial_train_targets,
            validation_data = (val_data, val_targets),
            epochs = num_epochs, batch_size = 1, verbose = 2)
  # k fold는 validation set이 필요없음
  val_mae = history.history['val_mean_absolute_error']
  all_score.append(val_mae)

처리 중인 폴드:  0
Train on 303 samples, validate on 101 samples
Epoch 1/100
 - 2s - loss: 190.5046 - mean_absolute_error: 10.5488 - val_loss: 36.8163 - val_mean_absolute_error: 4.0629
Epoch 2/100
 - 1s - loss: 28.8558 - mean_absolute_error: 3.6463 - val_loss: 21.5360 - val_mean_absolute_error: 3.1988
Epoch 3/100
 - 1s - loss: 20.2501 - mean_absolute_error: 2.9508 - val_loss: 16.5936 - val_mean_absolute_error: 2.5942
Epoch 4/100
 - 1s - loss: 17.1111 - mean_absolute_error: 2.7958 - val_loss: 12.8066 - val_mean_absolute_error: 2.3414
Epoch 5/100
 - 1s - loss: 15.0399 - mean_absolute_error: 2.5988 - val_loss: 18.4855 - val_mean_absolute_error: 3.0379
Epoch 6/100
 - 1s - loss: 14.5093 - mean_absolute_error: 2.5532 - val_loss: 12.6412 - val_mean_absolute_error: 2.5162
Epoch 7/100
 - 1s - loss: 13.4289 - mean_absolute_error: 2.3984 - val_loss: 14.2956 - val_mean_absolute_error: 2.6079
Epoch 8/100
 - 1s - loss: 12.8987 - mean_absolute_error: 2.4035 - val_loss: 10.0125 - val_mean_absolute_error: 2.

In [21]:
all_scores

[[9.432748166641387, 2.2305078954979924],
 [13.693381290624638, 2.458357334136963],
 [13.61394755675061, 2.610022731346659],
 [11.298778210536089, 2.348523197787823]]

In [31]:
len(all_score) # 100개씩 4개

4

In [29]:
np.mean(all_score, axis = 1)

array([2.08184462, 2.52479431, 2.58561284, 2.67110891])

In [30]:
[np.mean([x[i] for x in all_score]) for i in range(num_epochs)]

[4.499269457146673,
 3.5931355551917954,
 3.036427705594809,
 2.798341358062064,
 2.8855189509911114,
 2.7502136726190547,
 2.7135200051978083,
 2.6209774749113786,
 2.6235734991507957,
 2.504440752586516,
 2.661472659299869,
 2.5855931716390175,
 2.443921458603132,
 2.445519847444969,
 2.534364328526034,
 2.444126031186321,
 2.5183112243614576,
 2.5274394538142895,
 2.6169833204533797,
 2.4416064118394756,
 2.538375619614478,
 2.4354120292285883,
 2.605474888688267,
 2.4262639827067307,
 2.395222974295663,
 2.3523252482461454,
 2.352421138546254,
 2.3798454445187405,
 2.320074404820357,
 2.3725551614666927,
 2.3182957998596794,
 2.4351019764890762,
 2.3913114094498136,
 2.4163966651010043,
 2.3512564295589335,
 2.409558920577021,
 2.3416993806857875,
 2.2987443881459755,
 2.4130559929526676,
 2.3376763126637674,
 2.440320561427881,
 2.343648337491668,
 2.519014370323408,
 2.4138382006399706,
 2.449020259451158,
 2.350064742683184,
 2.3443450868719875,
 2.3581807902543854,
 2.395324427