In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import sklearn

import matplotlib.pyplot as plt

In [2]:
# Load the California housing dataset as pandas objects.

from sklearn.datasets import fetch_california_housing

housing_prices = fetch_california_housing(as_frame=True)

# Feature table and regression target, bound one at a time.
X = housing_prices.data
y = housing_prices.target

# Emit the column names first, then the dataset's description text,
# each on its own line.
print(housing_prices.feature_names, housing_prices.DESCR, sep="\n")

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for Califo

In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the features.
X.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [4]:
# Preview the first 8 rows of the feature table.
X.head(8)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25


In [5]:
# Preview the last 8 rows of the feature table.
X.tail(8)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
20632,3.125,15.0,6.023377,1.080519,1047.0,2.719481,39.26,-121.45
20633,2.5495,27.0,5.445026,1.078534,1082.0,2.832461,39.19,-121.53
20634,3.7125,28.0,6.77907,1.148256,1041.0,3.026163,39.27,-121.56
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24


In [6]:
# Display the features as a 2-D NumPy array (one row per sample).
# The result is only shown, not assigned, so X itself is unchanged.
X.to_numpy()

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [7]:
# Display the target values as a 1-D NumPy array.
# The result is only shown, not assigned, so y itself is unchanged.
y.to_numpy()

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [8]:
# Number of target values; should equal the number of feature rows in X.
len(y)

20640

In [9]:
from sklearn.model_selection import train_test_split

# Fixed seed: without it train_test_split reshuffles differently on every run,
# so the split (and every statistic or metric derived from it) would not be
# reproducible under Restart Kernel -> Run All.
split_seed = 42

# Fraction of all rows used for training; the remainder is held out and
# divided evenly into validation and test sets below.
train_ratio = 0.8
(X_train, X_ideal, y_train, y_ideal) = train_test_split(
    X,
    y.to_numpy(),
    test_size=(1 - train_ratio),
    random_state=split_seed,
)

# First half of the held-out rows -> validation, second half -> test.
# X_ideal is a DataFrame, so slice its rows by position explicitly with .iloc;
# y_ideal is a NumPy array and is sliced directly.
val_len = len(y_ideal) // 2
(X_val, y_val) = (X_ideal.iloc[:val_len], y_ideal[:val_len])
(X_test, y_test) = (X_ideal.iloc[val_len:], y_ideal[val_len:])

# Sanity checks: the three partitions cover every row exactly once and
# features/targets stay aligned in each partition.
assert len(X_train) + len(X_val) + len(X_test) == len(X)
assert len(X_train) == len(y_train)
assert len(X_val) == len(y_val)
assert len(X_test) == len(y_test)

In [10]:
# X_train, X_val, X_test are pandas DataFrames; y_train, y_val, y_test are NumPy ndarrays.
# Summary statistics of the training features only, for comparison with the
# full-dataset statistics shown by X.describe() earlier.
X_train.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,3.871813,28.644077,5.432538,1.095837,1425.326369,3.087516,35.62599,-119.56352
std,1.889899,12.558268,2.569068,0.494155,1132.005967,11.443698,2.134926,2.002443
min,0.4999,1.0,0.846154,0.375,5.0,0.692308,32.54,-124.35
25%,2.5625,18.0,4.44204,1.005864,788.0,2.430821,33.93,-121.79
50%,3.5377,29.0,5.234564,1.048309,1167.0,2.822316,34.25,-118.49
75%,4.75,37.0,6.057617,1.099057,1726.0,3.286105,37.71,-118.0
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

