# Minimal example: build a model with TensorFlow estimator


## Load libraries

In [2]:
import math
import shutil  # for shell utilities
import numpy as np
import pandas as pd
import tensorflow as tf

# Record the TensorFlow version in the output: which estimator / input-function
# namespaces are available below depends on it.
print(tf.__version__)

2.0.0


## Prepare data
The data is based on 1990 census data from California. It is aggregated at the city-block level, so features such as `total_rooms` and `population` are totals for the whole block (all rooms in the block, all people living on the block) rather than values for a single household.

TensorFlow does **not** require input in the format of X and y. So keep all columns in the dataframe.

In [7]:
# Load the California housing training CSV into a dataframe and preview
# the first rows.
df = pd.read_csv(
    "https://storage.googleapis.com/ml_universities/california_housing_train.csv",
    sep=",",
)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66900.0
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80100.0
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85700.0
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73400.0
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65500.0


In [8]:
# Split data for training and evaluation (random mask, ~80% train / ~20% eval)
np.random.seed(seed=1)  # makes the split reproducible
msk = np.random.rand(len(df)) < 0.8

# train_and_evaluate() reads these frames as `traindf` / `evaldf`, so define them
# under exactly those names. Explicit copies are taken so that adding feature
# columns later never writes into a slice of `df` (SettingWithCopyWarning).
traindf = df[msk].copy()
evaldf = df[~msk].copy()

# Earlier names, kept as aliases so any cell still referring to them keeps working.
df_train = traindf
de_eval = evaldf

## Define features
TensorFlow requires the features to be explicitly defined based on columns available in the dataframe. There is no need to remove unused columns from the dataframe.

### Add more features to the dataframe
Analyze the data and figure out what new features to add

In [9]:
def add_more_features(df):
    """Return a new dataframe with two derived ratio features added.

    Added columns:
      * ``avg_rooms_per_house``  = ``total_rooms`` / ``households``
      * ``avg_persons_per_room`` = ``population`` / ``total_rooms``

    Args:
        df: dataframe containing ``total_rooms``, ``households`` and
            ``population`` columns.

    Returns:
        A new dataframe holding every column of ``df`` plus the two derived
        columns. ``df`` itself is not modified, so the function can be called
        repeatedly (e.g. once per input function) and on sliced frames without
        triggering pandas' SettingWithCopyWarning.
    """
    return df.assign(
        avg_rooms_per_house=df['total_rooms'] / df['households'],
        avg_persons_per_room=df['population'] / df['total_rooms'],
    )

### Define list of feature columns
TensorFlow requires the type of each feature to be specified explicitly. Build a list of feature columns with the `tf.feature_column` API, one entry per feature the model should use.

In [10]:
def create_feature_cols():
    """Build the list of feature columns handed to the estimator.

    Four features are used as plain numeric columns; ``latitude`` is wrapped
    in a bucketized column whose boundaries are the whole degrees produced by
    ``np.arange(32.0, 42, 1)`` (32.0 up to 41.0).

    Returns:
        list of ``tf.feature_column`` objects: the numeric columns first,
        the bucketized latitude last.
    """
    numeric_names = [
        'housing_median_age',
        'avg_rooms_per_house',
        'avg_persons_per_room',
        'median_income',
    ]
    feature_cols = [tf.feature_column.numeric_column(name) for name in numeric_names]

    latitude = tf.feature_column.numeric_column('latitude')
    latitude_boundaries = np.arange(32.0, 42, 1).tolist()
    feature_cols.append(
        tf.feature_column.bucketized_column(latitude, boundaries = latitude_boundaries))
    return feature_cols

## Make input functions
The input function not only provides the data, it also dictates how the data is supplied to the model: batch size, number of epochs, and whether the rows are shuffled.

In [11]:
# Create pandas input function, works in TensorFlow 1.x
def make_input_fn(df, num_epochs):
  """Build an estimator input function backed by a pandas dataframe.

  Args:
    df: dataframe with the raw feature columns and `median_house_value`.
    num_epochs: forwarded unchanged to `pandas_input_fn` as `num_epochs`.

  Returns:
    The input function created by `pandas_input_fn`. Features are the columns
    of `add_more_features(df)`; the label is `median_house_value` divided by
    100000.
  """
  # pandas_input_fn is a TF 1.x API. Under TensorFlow 2.x it is reached through
  # the `tf.compat.v1` namespace (note the spelling: `compat`, not `compact`).
  return tf.compat.v1.estimator.inputs.pandas_input_fn(
    x = add_more_features(df),
    y = df['median_house_value'] / 100000, # will talk about why later in the course
    batch_size = 128,
    num_epochs = num_epochs,
    shuffle = True,
    queue_capacity = 1000,
    num_threads = 1
  )

# in TensorFlow 2.x


In [9]:
# Define your feature columns
def create_feature_cols():
  """Build the list of feature columns handed to the estimator.

  NOTE(review): an earlier cell defines a function with this same name (same
  five features, different order); whichever cell ran last is the one in effect.

  Returns:
    list of `tf.feature_column` objects: `housing_median_age`, then latitude
    bucketized on the boundaries from `np.arange(32.0, 42, 1)`, then the
    remaining three numeric columns.
  """
  numeric = tf.feature_column.numeric_column

  columns = [numeric('housing_median_age')]
  columns.append(
    tf.feature_column.bucketized_column(
      numeric('latitude'),
      boundaries = np.arange(32.0, 42, 1).tolist()))
  for name in ('avg_rooms_per_house', 'avg_persons_per_room', 'median_income'):
    columns.append(numeric(name))
  return columns

In [10]:
# Create estimator train and evaluate function
def train_and_evaluate(output_dir, num_train_steps):
  """Train a linear regressor and evaluate it alongside training.

  Reads the module-level frames `traindf` and `evaldf` and relies on
  `create_feature_cols()` and `make_input_fn()` defined in this notebook.

  Args:
    output_dir: passed to the estimator as `model_dir`.
    num_train_steps: passed to the train spec as `max_steps`.
  """
  regressor = tf.estimator.LinearRegressor(
    model_dir = output_dir,
    feature_columns = create_feature_cols())

  # Training input is built with num_epochs=None; training length is bounded
  # by max_steps.
  training = tf.estimator.TrainSpec(
    input_fn = make_input_fn(traindf, None),
    max_steps = num_train_steps)

  # Evaluation input is built with num_epochs=1 and no explicit step limit.
  evaluation = tf.estimator.EvalSpec(
    input_fn = make_input_fn(evaldf, 1),
    steps = None,
    start_delay_secs = 1,  # start evaluating after N seconds
    throttle_secs = 5)     # evaluate every N seconds

  tf.estimator.train_and_evaluate(regressor, training, evaluation)

In [11]:
# Output directory for the trained model: it is wiped before each run and
# passed to train_and_evaluate(), which hands it to the estimator as model_dir.
# (This cell only sets the path; it does not launch TensorBoard.)
OUTDIR = './trained_model'

In [12]:
# Run the model
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time
# FileWriterCache is a TF 1.x API; under TensorFlow 2.x it is only available
# through tf.compat.v1.summary (tf.summary is the 2.x summary API and has no
# FileWriterCache attribute).
tf.compat.v1.summary.FileWriterCache.clear() # ensure filewriter cache is clear for TensorBoard events file
train_and_evaluate(OUTDIR, 2000)

I1106 19:45:49.214785 140356028634944 estimator.py:1790] Using default config.
I1106 19:45:49.217272 140356028634944 estimator.py:209] Using config: {'_model_dir': './trained_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa6e2749160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
A value is tryin

I1106 19:45:57.410668 140356028634944 basic_session_run_hooks.py:260] loss = 153.61343, step = 1001 (0.396 sec)
I1106 19:45:57.808362 140356028634944 basic_session_run_hooks.py:692] global_step/sec: 249.357
I1106 19:45:57.811183 140356028634944 basic_session_run_hooks.py:260] loss = 123.540054, step = 1101 (0.401 sec)
W1106 19:45:58.162163 140356028634944 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 1186 vs previous value: 1186. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.
I1106 19:45:58.224290 140356028634944 basic_session_run_hooks.py:692] global_step/sec: 240.429
I1106 19:45:58.225600 140356028634944 basic_session_run_hooks.py:260] loss = 62.471573, step = 1201 (0.414 sec)
I1106 19:45:58.620387 140356028634944 basic_session_run_hooks.py:692] global_step/sec: 252.458
I1106 19:45:58.622113 140356028634944 basic