In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the cleaned California housing table from the working directory.
csv_path = "cal_housing_clean.csv"
house_raw_data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the raw table.
house_raw_data.head(n=5)

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [4]:
# Feature matrix: every column except the regression target.
x_data = house_raw_data.drop(columns=["medianHouseValue"])

In [5]:
# Regression target: the median house value column.
y = house_raw_data.loc[:, "medianHouseValue"]

In [6]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y,
    test_size=0.33,
    random_state=101,
)

In [55]:
# Preview the first five rows of the (unscaled) training features.
X_train.head(n=5)

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome
16624,27.0,2863.0,711.0,930.0,533.0,2.6205
14848,19.0,3635.0,1078.0,3127.0,1098.0,1.324
11081,34.0,2004.0,331.0,843.0,328.0,3.59
930,23.0,3170.0,532.0,1446.0,515.0,4.4357
5889,40.0,3068.0,756.0,1190.0,695.0,3.5637


In [7]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
X_train.size

82968

In [8]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
X_test.size

40872

In [9]:
# Min-max normalisation rescales every feature into the [0, 1] range.
# Background: https://www.quora.com/What-is-the-meaning-of-min-max-normalization
minMaxScaler = MinMaxScaler(feature_range=(0, 1))

In [10]:
# Learn each column's minimum and maximum from the training split only;
# the same fitted scaler is reused later to transform the test split.
minMaxScaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [11]:
# Apply the fitted scaler, then re-wrap the resulting array as a DataFrame
# that keeps the original column labels and row index.
scaled_train_values = minMaxScaler.transform(X_train)
scaled_X_train = pd.DataFrame(
    data=scaled_train_values,
    columns=X_train.columns,
    index=X_train.index,
)

In [12]:
# Preview the first five rows of the scaled training features.
scaled_X_train.head(n=5)

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome
16624,0.509804,0.072766,0.11018,0.032455,0.087486,0.146246
14848,0.352941,0.0924,0.167132,0.109372,0.180398,0.056834
11081,0.647059,0.050918,0.05121,0.029409,0.053774,0.213107
930,0.431373,0.080574,0.082402,0.05052,0.084526,0.271431
5889,0.764706,0.07798,0.117163,0.041557,0.114126,0.211294


In [13]:
# Transform the test split with the scaler that was fitted on the training
# split, keeping the original column labels and row index.
scaled_test_values = minMaxScaler.transform(X_test)
scaled_X_test = pd.DataFrame(
    data=scaled_test_values,
    columns=X_test.columns,
    index=X_test.index,
)

In [14]:
# List the column labels; the feature-column keys defined below use these names.
house_raw_data.columns

Index(['housingMedianAge', 'totalRooms', 'totalBedrooms', 'population',
       'households', 'medianIncome', 'medianHouseValue'],
      dtype='object')

In [15]:
# Build one numeric feature column per input feature. Each key is the name of
# the matching DataFrame column; the names are unpacked in the same order as
# the keys are listed.
age, num_rooms, num_bed, population, households, income = (
    tf.feature_column.numeric_column(key=column_name)
    for column_name in (
        "housingMedianAge",
        "totalRooms",
        "totalBedrooms",
        "population",
        "households",
        "medianIncome",
    )
)

In [73]:
# Feature columns handed to the estimator, one per input feature.
feature_cols = [
    age,
    num_rooms,
    num_bed,
    population,
    households,
    income,
]

In [153]:
# Five hidden layers of six units each. Six units were chosen, following the
# course instructor, because there are six input features.
# A linear baseline can be tried instead with
# tf.estimator.LinearRegressor and the same feature columns.
hidden_layer_sizes = [6] * 5
model = tf.estimator.DNNRegressor(
    hidden_units=hidden_layer_sizes,
    feature_columns=feature_cols,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/9c/883h376s53500dcqng4b4nj00000gn/T/tmp_456mpre', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1c41333cf8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [165]:
# Training input: shuffled mini-batches of 128 rows drawn from the scaled
# training features, cycling through the data for 200 epochs.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=scaled_X_train,
    y=y_train,
    batch_size=128,
    num_epochs=200,
    shuffle=True,
)

In [None]:
# Train the regressor. No step limit is passed here, so the amount of training
# is governed by input_func, which was built with a finite num_epochs.
model.train(input_fn= input_func)

In [167]:
# The model was trained on scaled features, so evaluation must use the scaled
# test features as well. One unshuffled pass over the test split.
test_input_func = tf.estimator.inputs.pandas_input_fn(
    x=scaled_X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [None]:
# Compute evaluation metrics on the held-out test split.
test_results = model.evaluate(input_fn= test_input_func)

In [169]:
# Display the metrics returned by model.evaluate.
test_results

{'average_loss': 5956340000.0,
 'label/mean': 206802.39,
 'loss': 59493530000.0,
 'prediction/mean': 205763.0,
 'global_step': 41607}

In [170]:
# Prediction input: the same scaled test features used for evaluation, in
# their original order (shuffle=False) so predictions line up with y_test.
# Labels are deliberately not supplied: prediction only needs features.
pred_input_func = tf.estimator.inputs.pandas_input_fn(
    x=scaled_X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [171]:
# model.predict returns an iterable of per-example results; it is materialised
# with list() in the next cell.
pred_obj = model.predict(input_fn=pred_input_func)

In [None]:
# Materialise the prediction iterable into a list. pred_obj is consumed by
# this call, so re-run the cell that creates it before re-running this one.
predictions = list(pred_obj)
# Preview a few entries rather than dumping every test-set prediction.
predictions[:5]

In [None]:
# Keep only the predicted value stored under the "predictions" key of each
# per-example result.
final_preds = [pred["predictions"] for pred in predictions]
# Preview a handful of values instead of printing every test-set prediction.
final_preds[:5]

In [174]:
# Summary statistics of the raw data, giving a sense of scale for judging the
# prediction error computed below (compare against medianHouseValue).
house_raw_data.describe()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [175]:
# Root-mean-squared error of the test-set predictions, in the same units as
# medianHouseValue. mean_squared_error is imported in the notebook's import
# cell at the top.
rmse = mean_squared_error(y_test, final_preds) ** 0.5
rmse

77177.3342160244