In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.python.data import Dataset
import math
from sklearn import metrics
from __future__ import print_function
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

In [3]:
california_data = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
california_data = california_data.reindex(np.random.permutation(california_data.index))

In [4]:
california_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.6,35.6,28.6,2643.7,539.4,1429.6,501.2,3.9,207300.9
std,2.0,2.1,12.6,2179.9,421.5,1147.9,384.5,1.9,115983.8
min,-124.3,32.5,1.0,2.0,1.0,3.0,1.0,0.5,14999.0
25%,-121.8,33.9,18.0,1462.0,297.0,790.0,282.0,2.6,119400.0
50%,-118.5,34.2,29.0,2127.0,434.0,1167.0,409.0,3.5,180400.0
75%,-118.0,37.7,37.0,3151.2,648.2,1721.0,605.2,4.8,265000.0
max,-114.3,42.0,52.0,37937.0,6445.0,35682.0,6082.0,15.0,500001.0


In [18]:
def preprocess_features(california_data):
    selected_data = california_data[["latitude","longitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income"]]
    processed_features = selected_data.copy()
    processed_features['rooms_per_person'] = selected_data['total_rooms']/selected_data['population']
    return processed_features

def preprocess_targets(california_data):
    output_data = pd.DataFrame()
    output_data["median_house_value"] = california_data["median_house_value"]/1000
    return output_data


In [19]:
training_examples = preprocess_features(california_data.head(12000))
print(training_examples.describe())
training_targets = preprocess_targets(california_data.head(12000))
print(training_examples.describe())

       latitude  longitude  housing_median_age  total_rooms  total_bedrooms  \
count   12000.0    12000.0             12000.0      12000.0         12000.0   
mean       35.6     -119.6                28.6       2637.2           537.7   
std         2.1        2.0                12.6       2208.4           423.7   
min        32.5     -124.3                 1.0          2.0             1.0   
25%        33.9     -121.8                18.0       1460.0           295.0   
50%        34.2     -118.5                29.0       2122.0           433.0   
75%        37.7     -118.0                37.0       3146.2           646.0   
max        42.0     -114.5                52.0      37937.0          6445.0   

       population  households  median_income  rooms_per_person  
count     12000.0     12000.0        12000.0           12000.0  
mean       1426.3       499.6            3.9               2.0  
std        1175.0       387.1            1.9               1.1  
min           6.0         1.

In [20]:
validation_examples = preprocess_features(california_data.tail(5000))
validation_examples.describe()
validation_targets = preprocess_targets(california_data.tail(5000))


In [21]:
#Cross check if training and validation examples look similar
training_examples.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,35.6,-119.6,28.6,2637.2,537.7,1426.3,499.6,3.9,2.0
std,2.1,2.0,12.6,2208.4,423.7,1175.0,387.1,1.9,1.1
min,32.5,-124.3,1.0,2.0,1.0,6.0,1.0,0.5,0.0
25%,33.9,-121.8,18.0,1460.0,295.0,783.0,281.0,2.6,1.5
50%,34.2,-118.5,29.0,2122.0,433.0,1164.0,408.0,3.5,1.9
75%,37.7,-118.0,37.0,3146.2,646.0,1718.0,603.0,4.7,2.3
max,42.0,-114.5,52.0,37937.0,6445.0,35682.0,6082.0,15.0,52.0


In [22]:
validation_examples.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,35.6,-119.6,28.6,2659.2,543.6,1437.5,505.2,3.9,2.0
std,2.1,2.0,12.6,2110.2,416.1,1079.9,378.3,1.9,1.2
min,32.6,-124.3,1.0,18.0,4.0,3.0,2.0,0.5,0.1
25%,33.9,-121.8,18.0,1467.8,300.8,805.8,284.0,2.6,1.5
50%,34.2,-118.5,29.0,2143.5,435.0,1176.0,410.0,3.6,1.9
75%,37.7,-118.0,37.0,3166.0,657.2,1729.2,611.0,4.8,2.3
max,41.9,-114.3,52.0,25957.0,4952.0,11272.0,4616.0,15.0,55.2


In [23]:
def get_quantile_based_boundaries(feature_values, bucket_size):
    boundaries = np.arange(1.0,bucket_size)/bucket_size
    quantile = feature_values.quantile(boundaries)
    return [quantile[i] for i in quantile.keys()]

In [33]:
def construct_feature_columns(california_data):
    rooms_per_person = tf.feature_column.numeric_column('rooms_per_person')
    median_income = tf.feature_column.numeric_column('median_income')
    latitude = tf.feature_column.numeric_column('latitude')
    longitude = tf.feature_column.numeric_column('longitude')
    housing_median_age = tf.feature_column.numeric_column('housing_median_age')
    households = tf.feature_column.numeric_column('households')
    
    bucketized_rooms_per_person = tf.feature_column.bucketized_column(rooms_per_person, 
                                        boundaries=get_quantile_based_boundaries(california_data['rooms_per_person'],7))
    bucketized_median_income =     tf.feature_column.bucketized_column(median_income,
                                       boundaries=get_quantile_based_boundaries(california_data['median_income'],7))
    bucketized_latitude =     tf.feature_column.bucketized_column(latitude, 
                                       boundaries=get_quantile_based_boundaries(california_data['latitude'],10))
    bucketized_longitude =     tf.feature_column.bucketized_column(longitude,
                                      boundaries=get_quantile_based_boundaries(california_data['longitude'],10))
    bucketized_housing_median_age =     tf.feature_column.bucketized_column(housing_median_age, 
                                       boundaries=get_quantile_based_boundaries(california_data['housing_median_age'],7))
    
    bucketized_households = tf.feature_column.bucketized_column(households,
                                        boundaries=get_quantile_based_boundaries(california_data['households'],7))
    
    feature_columns = set([bucketized_rooms_per_person, bucketized_median_income, bucketized_latitude, bucketized_longitude, bucketized_housing_median_age, bucketized_households])
    return feature_columns

def my_input_fn(features, targets, batch_size = 1, shuffle = True, num_epochs = None):
    features = {key:np.array(value) for key,value in dict(features).items()}
    ds = Dataset.from_tensor_slices((features,targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    if shuffle:
        ds = ds.shuffle(10000)
    features, targets = ds.make_one_shot_iterator().get_next()
    return features, targets


In [34]:
def my_train(steps, learning_rate, batch_size, training_examples, training_targets, validation_examples, validation_targets):
    #create a linear regressor object
    feature_columns = construct_feature_columns(training_examples)
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5)
    linear_regressor = tf.estimator.LinearRegressor(feature_columns=feature_columns, optimizer=my_optimizer)
    
    #create input functions
    training_input_fn = lambda: my_input_fn(training_examples, training_targets)
    training_predict_input_fn = lambda: my_input_fn(training_examples, training_targets, shuffle = False, num_epochs = 1)
    validation_input_fn = lambda: my_input_fn(validation_examples, validation_targets, shuffle = False, num_epochs = 1)
    periods = 10
    steps_per_period = steps/periods
    train_RMSE = []
    validation_RMSE = []
    for period in range(1,periods):
        print("period: ",period)
        linear_regressor.train(input_fn = training_input_fn, steps = steps_per_period)
        training_predictions = linear_regressor.predict(input_fn = training_predict_input_fn)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])
        validation_predictions = linear_regressor.predict(input_fn = validation_input_fn)
        validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
        train_rmse = math.sqrt(metrics.mean_squared_error(training_predictions, training_targets))
        print("Training rmse: ",train_rmse)
        validation_rmse = math.sqrt(metrics.mean_squared_error(validation_predictions, validation_targets))
        print("Validation rmse: ", validation_rmse)
        train_RMSE.append(train_rmse)
        validation_RMSE.append(validation_rmse)
    print("Model training finished")
    return linear_regressor


In [37]:
linear_regressor = my_train(steps = 100, learning_rate = 1, batch_size = 5, 
                            training_examples = training_examples,
                            training_targets = training_targets,
                           validation_examples = validation_examples,
                           validation_targets = validation_targets)

period:  1
Training rmse:  208.85048931074738
Validation rmse:  208.29486963934048
period:  2
Training rmse:  181.63562862804986
Validation rmse:  180.98665149661664
period:  3
Training rmse:  156.7569984062912
Validation rmse:  156.14175214399165
period:  4
Training rmse:  143.60440440969745
Validation rmse:  142.99972908385843
period:  5
Training rmse:  138.4409313695779
Validation rmse:  137.75181094375586
period:  6
Training rmse:  127.95169447857853
Validation rmse:  127.34451446282706
period:  7
Training rmse:  122.88854424059568
Validation rmse:  122.12145870490244
period:  8
Training rmse:  117.88485645428557
Validation rmse:  116.99803616474836
period:  9
Training rmse:  120.55795051564387
Validation rmse:  119.7684390672519
Model training finished


In [None]:
#Evaluating the model on test data
california_housing_test_data = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_test.csv", sep=",")
california_housing_test_data.describe()



In [None]:
#create input fn for test
test_features = preprocess_features(california_housing_test_data)
test_targets = preprocess_targets(california_housing_test_data)
test_features.describe()
test_input_fn = lambda: my_input_fn(test_features, test_targets, shuffle = False, num_epochs = None)
test_predictions = linear_regressor.predict(input_fn=test_input_fn)
test_predictions = np.array([item['predictions'][0] for item in test_predictions])
test_rmse = math.sqrt(metrics.mean_squared_error(test_predictions, test_targets))
print("Test RMSE: ",test_rmse)

In [16]:
#Find correlation between features and target to find the best possible features:
corr_data = training_examples.copy()
corr_data['target'] = california_data['median_house_value']
corr_data.corr()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person,target
housing_median_age,1.0,-0.4,-0.3,-0.3,-0.3,-0.1,-0.1,0.1
total_rooms,-0.4,1.0,0.9,0.9,0.9,0.2,0.1,0.1
total_bedrooms,-0.3,0.9,1.0,0.9,1.0,-0.0,0.1,0.0
population,-0.3,0.9,0.9,1.0,0.9,0.0,-0.1,-0.0
households,-0.3,0.9,1.0,0.9,1.0,0.0,-0.0,0.1
median_income,-0.1,0.2,-0.0,0.0,0.0,1.0,0.2,0.7
rooms_per_person,-0.1,0.1,0.1,-0.1,-0.0,0.2,1.0,0.2
target,0.1,0.1,0.0,-0.0,0.1,0.7,0.2,1.0


In [11]:
#take only features correlated with target
minimal_features = ['median_income','rooms_per_person']
minimal_training_examples = training_examples[minimal_features]
minimal_validation_examples = validation_examples[minimal_features]
minimal_validation_examples.describe()


Unnamed: 0,median_income,rooms_per_person
count,5000.0,5000.0
mean,3.9,2.0
std,1.9,1.2
min,0.5,0.3
25%,2.6,1.5
50%,3.5,1.9
75%,4.8,2.3
max,15.0,52.0


In [12]:
linear_regressor = my_train(steps = 100, learning_rate = 0.0005, batch_size = 5, 
                           training_examples = minimal_training_examples,
                           training_targets = training_targets,
                           validation_examples = minimal_validation_examples,
                           validation_targets = validation_targets)


period:  1
Training rmse:  237.08526082091745
Validation rmse:  238.25091303386725
period:  2
Training rmse:  236.97602812898177
Validation rmse:  238.1418850310247
period:  3
Training rmse:  236.86797524749923
Validation rmse:  238.03403407139135
period:  4
Training rmse:  236.7572453328075
Validation rmse:  237.92351404062526
period:  5
Training rmse:  236.64780679535687
Validation rmse:  237.81427776548182
period:  6
Training rmse:  236.53712585429363
Validation rmse:  237.70382347305895
period:  7
Training rmse:  236.4282405594053
Validation rmse:  237.5951367027229
period:  8
Training rmse:  236.32157086431013
Validation rmse:  237.4886644180862
period:  9
Training rmse:  236.21203724702164
Validation rmse:  237.37934751479165
Model training finished


In [46]:
#Now using latitude bins as features

def select_and_transform_features(source_df):
    new_df = pd.DataFrame()
    new_df['median_income'] = source_df['median_income']
    new_df['rooms_per_person'] = source_df['rooms_per_person']
    for i in LATITUDE_RANGE:
        new_df['latitude_%d_to_%d' %i] = source_df['latitude'].apply(lambda x: 1 if (x >= i[0] and x < i[1]) else 0)
    return new_df


In [47]:
LATITUDE_RANGE = zip(range(32,41),range(33,42))
transformed_training_examples = select_and_transform_features(training_examples)
LATITUDE_RANGE = zip(range(32,41),range(33,42))
transformed_validation_examples = select_and_transform_features(validation_examples)


In [48]:
transformed_training_examples.head()

Unnamed: 0,median_income,rooms_per_person,latitude_32_to_33,latitude_33_to_34,latitude_34_to_35,latitude_35_to_36,latitude_36_to_37,latitude_37_to_38,latitude_38_to_39,latitude_39_to_40,latitude_40_to_41
10399,2.7,3.4,0,0,0,0,0,0,1,0,0
11388,4.5,2.2,0,0,0,0,0,0,1,0,0
5105,4.4,2.0,0,1,0,0,0,0,0,0,0
15993,2.9,2.1,0,0,0,0,0,1,0,0,0
9074,2.5,2.1,0,0,0,1,0,0,0,0,0


In [49]:
transformed_validation_examples.head()

Unnamed: 0,median_income,rooms_per_person,latitude_32_to_33,latitude_33_to_34,latitude_34_to_35,latitude_35_to_36,latitude_36_to_37,latitude_37_to_38,latitude_38_to_39,latitude_39_to_40,latitude_40_to_41
9524,1.6,2.2,0,0,1,0,0,0,0,0,0
1633,2.5,1.7,1,0,0,0,0,0,0,0,0
2712,8.0,2.3,0,0,1,0,0,0,0,0,0
4591,2.6,0.7,0,0,1,0,0,0,0,0,0
7189,3.7,1.6,0,0,1,0,0,0,0,0,0


In [52]:
my_train(steps = 500, learning_rate=0.01, batch_size = 5, 
         training_examples = transformed_training_examples,
        training_targets = training_targets, validation_examples = transformed_validation_examples,
        validation_targets = validation_targets)

period:  1
Training rmse:  226.42350856003662
Validation rmse:  227.61094348917314
period:  2
Training rmse:  215.75372172587007
Validation rmse:  216.96309673973244
period:  3
Training rmse:  205.15904954395782
Validation rmse:  206.39010821693435
period:  4
Training rmse:  194.73479981372714
Validation rmse:  195.9871949304761
period:  5
Training rmse:  184.3828310817724
Validation rmse:  185.65800982490686
period:  6
Training rmse:  174.16574713244347
Validation rmse:  175.4644963705341
period:  7
Training rmse:  164.11396111009375
Validation rmse:  165.4363619845115
period:  8
Training rmse:  155.09291756300775
Validation rmse:  156.43618907668562
period:  9
Training rmse:  145.60492955017497
Validation rmse:  146.96759028854316
Model training finished


<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x20d3d8626d8>