# California Housing Dataset

In this example, we use the California Housing dataset, which is available by default in Google Colab.

## Importing libraries

In [2]:
import math
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError
import kerastuner as kt

  if sys.path[0] == "":


## Reading the files

In [4]:
# Training split of the California Housing data
train_df = pd.read_csv(filepath_or_buffer='california_housing_train.csv')

# Test split of the California Housing data
test_df = pd.read_csv(filepath_or_buffer='california_housing_test.csv')

## Check the Train data and test data

In [5]:
# Preview the first five rows of the training data
train_df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [6]:
# Preview the first five rows of the test data
test_df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [7]:
# Report (rows, columns) for the train frame first, then the test frame
for frame in (train_df, test_df):
  print(frame.shape)

(17000, 9)
(3000, 9)


## Dataset Description

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# column of the training data
train_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [9]:
# Same summary statistics for the test data, to compare against the
# training data
test_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-119.5892,35.63539,28.845333,2599.578667,529.950667,1402.798667,489.912,3.807272,205846.275
std,1.994936,2.12967,12.555396,2155.593332,415.654368,1030.543012,365.42271,1.854512,113119.68747
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22500.0
25%,-121.81,33.93,18.0,1401.0,291.0,780.0,273.0,2.544,121200.0
50%,-118.485,34.27,29.0,2106.0,437.0,1155.0,409.5,3.48715,177650.0
75%,-118.02,37.69,37.0,3129.0,636.0,1742.75,597.25,4.656475,263975.0
max,-114.49,41.92,52.0,30450.0,5419.0,11935.0,4930.0,15.0001,500001.0


The distributions of the train and test sets are very similar.

## Null value check

In [10]:
# Number of missing values in each training column
train_df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [11]:
# Number of missing values in each test column
test_df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

There are no null values in the dataset

## Split the data into dependent and independent features

In [12]:
# Name of the column to predict; every other column is used as an input
# feature when the data is split below.
TARGET_NAME = 'median_house_value'

In [13]:
# Separate the target column (y) from the remaining feature columns (x)
# for the training data ...
y_train = train_df[TARGET_NAME]
x_train = train_df.drop(columns=TARGET_NAME)

# ... and in the same way for the test data
y_test = test_df[TARGET_NAME]
x_test = test_df.drop(columns=TARGET_NAME)

## Feature Scaling

In [14]:
# Standardise the features. The scaler learns its statistics from the
# training features only, and the same fitted scaler is then applied to
# both the training and the test features.
sc = StandardScaler().fit(x_train)
x_train, x_test = sc.transform(x_train), sc.transform(x_test)

## Keras Tuner

Keras Tuner is an open-source Python library created specifically for tuning the hyperparameters of artificial neural networks. Currently, Keras Tuner supports four different types of tuners (search algorithms):

* Bayesian Optimization
* Hyperband
* Sklearn
* Random Search

The hypermodel is the model you want to tune.

* It is defined through a model-building function, which receives an `hp` object and uses it to declare the tunable hyperparameters while constructing the model.

We are going to build an ANN with 4 layers:
1. Input layer
2. Hidden layer 1
3. Hidden layer 2
4. Output Layer

`units1`, `units2`, and `units3` each range from 25 up to at most 512 in steps of 32 (i.e. 25, 57, ..., 505).

These are applied to the input layer and the hidden layers.

The learning rate is picked from a fixed list of candidate values (1e-2, 1e-3, 1e-4), so we use the `hp.Choice` method for it.

Since this is a regression problem, we use the 'linear' activation function in the output layer. In the other layers we use 'relu'.

For the weight updates we use the Adam optimizer, and the loss is MSLE (Mean Squared Logarithmic Error).



In [15]:
# Single shared MSLE loss object; build_annmodel passes it to compile()
# both as the training loss and as the reported metric.
msle = MeanSquaredLogarithmicError()


def build_annmodel(hp):
  """Build and compile the dense regression network for one tuner trial.

  Args:
    hp: Keras Tuner hyperparameter container. This function registers the
      integer hyperparameters 'units1', 'units2' and 'units3' (min 25,
      max 512, step 32) and the choice hyperparameter 'learning_rate'
      (1e-2, 1e-3 or 1e-4) on it.

  Returns:
    A compiled Sequential model: three ReLU Dense layers followed by a
    single linear output unit, optimised with Adam and using the shared
    `msle` object as both loss and metric.
  """
  # Register the three layer widths in a fixed order: units1, units2, units3.
  layer_widths = [
      hp.Int(name, min_value=25, max_value=512, step=32)
      for name in ('units1', 'units2', 'units3')
  ]

  model = tf.keras.Sequential()
  for width in layer_widths:
    model.add(Dense(units=width, activation='relu'))
  # One linear unit: the network outputs a single continuous value.
  model.add(Dense(1, kernel_initializer='normal', activation='linear'))

  optimizer = tf.keras.optimizers.Adam(
      learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  )
  model.compile(optimizer=optimizer, loss=msle, metrics=[msle])

  return model

In order to get a high-performing model quickly, the Hyperband tuning algorithm makes use of adaptive resource allocation and early-stopping.

In [16]:
# Hyperband tuner from Keras Tuner, driving build_annmodel.
# - objective: the validation value of the metric that build_annmodel
#   passes to compile(); it is the quantity reported after each trial.
# - seed: fixes the tuner's random sampling of hyperparameter values so the
#   searched configurations are repeatable between runs (model weight
#   initialisation is still unseeded, so scores can vary slightly).
# - directory / project_name: where the trial results are written.
tuner = kt.Hyperband(
    build_annmodel,
    objective='val_mean_squared_logarithmic_error',
    max_epochs=10,
    seed=42,
    directory='keras_tuner_dir',
    project_name='keras_tuner_demo'
)

In [17]:
# Run the hyperparameter search on the scaled training data; 20% of it
# (validation_split=0.2) is held out to compute the validation objective
# that each trial is scored on.
tuner.search(x_train, y_train, epochs=10, validation_split=0.2)

Trial 30 Complete [00h 00m 26s]
val_mean_squared_logarithmic_error: 0.11534390598535538

Best val_mean_squared_logarithmic_error So Far: 0.10854613780975342
Total elapsed time: 00h 06m 44s
INFO:tensorflow:Oracle triggered exit


### The best hyper parameters are

In [18]:
# Look up the best trial's hyperparameters once (instead of once per loop
# iteration), then print each tuned value by name.
best_trial_hps = tuner.get_best_hyperparameters()[0]
for p in ['units1','units2','units3','learning_rate']:
  print(p, best_trial_hps.get(p))

units1 345
units2 377
units3 409
learning_rate 0.01


## Model Evaluation

In [19]:
# Ask the tuner for its single best hyperparameter set and keep it for
# rebuilding the model below.
top_hps_list = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_hps_list[0]
best_hps

<keras_tuner.engine.hyperparameters.HyperParameters at 0x1ecad6213c8>

In [20]:
# Build a model from the best hyperparameter set through the tuner's
# hypermodel (build_annmodel is the builder that was passed to the tuner).
model = tuner.hypermodel.build(best_hps)

In [None]:
# Train the rebuilt model for up to 50 epochs, again holding out 20% of
# the training data for validation; the return value of fit() is kept in
# `history`.
history = model.fit(x_train, y_train, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50

In [33]:
# Evaluate on the test set. The model was compiled with MSLE as both the
# loss and the only metric, so the result holds two values: loss, metric.
eval_result = model.evaluate(x_test, y_test)
print(eval_result)

[0.08165794610977173, 0.0816333219408989]


In [34]:
# Print the layer-by-layer architecture and parameter counts of the model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 505)               4545      
                                                                 
 dense_5 (Dense)             (None, 377)               190762    
                                                                 
 dense_6 (Dense)             (None, 89)                33642     
                                                                 
 dense_7 (Dense)             (None, 1)                 90        
                                                                 
Total params: 229,039
Trainable params: 229,039
Non-trainable params: 0
_________________________________________________________________
