# Validation programming exercise

In [1]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")

In [3]:
def preprocess_features(california_housing_dataframe):
    """
    Prepares input features from California housing data set
    
    Args:
        california_housing_dataframe : Pandas DataFrame containing CA housing data
    Returns:
        Pandas DataFrame containing features used for the model, including synthetic feature
    """
    
    selected_features = california_housing_dataframe[
        ['latitude',
         'longitude',
         'housing_median_age',
         'total_rooms',
         'total_bedrooms',
         'population',
         'households',
         'median_income']]
    processed_features = selected_features.copy()
    
    #synthetic feature
    processed_features['rooms_per_person'] = california_housing_dataframe['total_rooms'] / california_housing_dataframe['population']
    
    return processed_features

In [8]:
def preprocess_targets(california_housing_dataframe):
    """
    Prepare target features (i.e. labels) from CA housing dataset
    
    Args:
        california_housing_dataframe : Pandas DataFrame containing CA housing data
    Returns:
        Pandas DataFrame containing features to be used for the model, including synthetic feature
    """
    output_targets = pd.DataFrame()
    output_targets['median_house_value'] = california_housing_dataframe['median_house_value'] / 1000.0
    return output_targets

### split data sets ( training / validation ), 12,000 | 17,000

In [6]:
training_examples = preprocess_features(california_housing_dataframe).head(12000)
training_examples.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,34.6,-118.5,27.5,2655.7,547.1,1476.0,505.4,3.8,1.9
std,1.6,1.2,12.1,2258.1,434.3,1174.3,391.7,1.9,1.3
min,32.5,-121.4,1.0,2.0,2.0,3.0,2.0,0.5,0.0
25%,33.8,-118.9,17.0,1451.8,299.0,815.0,283.0,2.5,1.4
50%,34.0,-118.2,28.0,2113.5,438.0,1207.0,411.0,3.5,1.9
75%,34.4,-117.8,36.0,3146.0,653.0,1777.0,606.0,4.6,2.3
max,41.8,-114.3,52.0,37937.0,5471.0,35682.0,5189.0,15.0,55.2


In [9]:
training_targets = preprocess_targets(california_housing_dataframe).head(12000)
training_targets.describe()

Unnamed: 0,median_house_value
count,12000.0
mean,198.0
std,111.9
min,15.0
25%,117.1
50%,170.5
75%,244.4
max,500.0


In [13]:
validation_examples = preprocess_features(california_housing_dataframe).tail(17000)
validation_examples.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,35.6,-119.6,28.6,2643.7,539.4,1429.6,501.2,3.9,2.0
std,2.1,2.0,12.6,2179.9,421.5,1147.9,384.5,1.9,1.2
min,32.5,-124.3,1.0,2.0,1.0,3.0,1.0,0.5,0.0
25%,33.9,-121.8,18.0,1462.0,297.0,790.0,282.0,2.6,1.5
50%,34.2,-118.5,29.0,2127.0,434.0,1167.0,409.0,3.5,1.9
75%,37.7,-118.0,37.0,3151.2,648.2,1721.0,605.2,4.8,2.3
max,42.0,-114.3,52.0,37937.0,6445.0,35682.0,6082.0,15.0,55.2


In [15]:
validation_targets = preprocess_targets(california_housing_dataframe).tail(17000)
validation_targets.describe()

Unnamed: 0,median_house_value
count,17000.0
mean,207.3
std,116.0
min,15.0
25%,119.4
50%,180.4
75%,265.0
max,500.0


## Task 1 : Examine data


- 'median income' scale not really known
- 