## Нейросеть для California housing

Будем работать с набором данных California housing, который загружается функцией `fetch_california_housing` из `sklearn`.

In [None]:
# Third-party imports used throughout the notebook.
import pandas as pd
# Import Dense from the public `keras.layers` namespace: the internal
# `keras.layers.core` path is not importable in recent Keras releases.
from keras.layers import Dense
from keras.models import Sequential
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from tqdm.keras import TqdmCallback

# Fetch the dataset (scikit-learn downloads it on first use and caches it).
data = fetch_california_housing()

# Features as a labelled DataFrame for readable display; target kept as the
# array returned by scikit-learn.
X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']

X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:
# Peek at the target vector (`data['target']`); numpy abbreviates the repr of
# long arrays, so only the first and last few values are shown.
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

#### 1.1. Разбейте данные на обучение и тест

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.2,
    random_state=0,
)

# Show the resulting shapes as a quick sanity check.
(X_train.shape, X_test.shape)

((16512, 8), (4128, 8))

#### 1.2. Нормализуйте данные

In [None]:
# Per-feature statistics are computed on the training split only, so no
# information from the test split leaks into the preprocessing.
mean, std = X_train.mean(axis=0), X_train.std(axis=0)

# Apply the same shift and scale to both splits.
X_train, X_test = [(split - mean) / std for split in (X_train, X_test)]

#### 1.3. Создайте архитектуру, обучите нейросеть

Поэкспериментируйте с разными архитектурами и гиперпараметрами.


##### Code-behind

In [None]:
import plotly.graph_objects as go

def train_and_print_stats(model, epochs, batch_size):
  """Train `model`, plot its training history and print test-set errors.

  Reads the notebook-level arrays `X_train`, `y_train`, `X_test` and `y_test`.

  Args:
    model: a compiled Keras model. `model.evaluate` is unpacked into exactly
      two values below, so the model must be compiled with a loss and a
      single metric (the experiments use loss='mse', metrics=['mae']).
    epochs: number of training epochs passed to `model.fit`.
    batch_size: mini-batch size passed to `model.fit`.

  Returns:
    None. The summary, the history figure and the error values are
    printed/displayed as side effects.
  """
  # `summary()` prints the table itself and returns None, so wrapping it in
  # print() only appended a stray "None" line to the output.
  model.summary()

  history = model.fit(
      X_train,
      y_train,
      epochs=epochs,
      validation_split=0.05,  # last 5% of the training rows are used for validation
      batch_size=batch_size,  # previously hard-coded to 100, which ignored the argument
      verbose=0,              # silence Keras' own logging; tqdm shows progress instead
      callbacks=[TqdmCallback(verbose=1)]
  )

  # One trace per recorded series (training/validation loss and metric).
  fig = go.Figure()
  for key, values in history.history.items():
    fig.add_trace(go.Scattergl(y=values, name=key))

  fig.update_layout(
      height=500,
      width=700,
      xaxis_title='Epoch',
      yaxis_title='Loss')

  fig.show()

  # With loss='mse' and metrics=['mae'] evaluate() returns [mse, mae].
  mse_nn, mae_nn = model.evaluate(X_test, y_test)

  print('Mean squared error on test data: ', mse_nn)
  print('Mean absolute error on test data: ', mae_nn)

  y_pred = model.predict(X_test)

  # scikit-learn's signature is (y_true, y_pred). The factor 100000 converts
  # the error to dollars, assuming the target is in units of $100,000 as the
  # scikit-learn dataset description states.
  print(f"Средняя ошибка: ${int(mean_absolute_error(y_test, y_pred) * 100000)}")

##### Experiments

In [None]:
# Experiment 1: two hidden ReLU layers (512 -> 256) feeding a single linear
# output unit for regression.
model = Sequential([
    Dense(512, input_shape=(8,), activation="relu"),
    Dense(256, activation="relu"),
    Dense(1, activation='linear', name='dense_output'),
])

# MSE is the training loss; MAE is tracked as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

train_and_print_stats(model, epochs=100, batch_size=40)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 512)               4608      
                                                                 
 dense_9 (Dense)             (None, 256)               131328    
                                                                 
 dense_output (Dense)        (None, 1)                 257       
                                                                 
Total params: 136,193
Trainable params: 136,193
Non-trainable params: 0
_________________________________________________________________
None


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Mean squared error on test data:  0.259633868932724
Mean absolute error on test data:  0.34398823976516724
Средняя ошибка: $34398


In [None]:
# Experiment 2: same architecture as before (512 -> 256 -> 1), rebuilt from
# scratch so training starts from fresh weights.
model = Sequential([
    Dense(512, input_shape=(8,), activation="relu"),
    Dense(256, activation="relu"),
    Dense(1, activation='linear', name='dense_output'),
])

# MSE is the training loss; MAE is tracked as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

train_and_print_stats(model, epochs=40, batch_size=50)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 512)               4608      
                                                                 
 dense_11 (Dense)            (None, 256)               131328    
                                                                 
 dense_output (Dense)        (None, 1)                 257       
                                                                 
Total params: 136,193
Trainable params: 136,193
Non-trainable params: 0
_________________________________________________________________
None


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Mean squared error on test data:  0.2822694480419159
Mean absolute error on test data:  0.34317827224731445
Средняя ошибка: $34317


По результатам экспериментов можно сделать вывод, что модель в среднем ошибается примерно на 34–35 тысяч долларов (средняя абсолютная ошибка на тестовой выборке).
С учётом того, с какими данными мы работаем и какого они порядка, это достаточно хорошая точность.

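Для получения таких результатов достаточно примерно 40–50 эпох: при 40 эпохах ошибка на тесте (MAE ≈ 0.343) практически такая же, как при 100 эпохах (MAE ≈ 0.344).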