# Feature Columns

In [4]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import seaborn as sn
import tensorflow as tf

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from tensorflow.data import Dataset
from sklearn import metrics

sn.set()
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

In [5]:
cf_housing_df = pd.read_csv('https://storage.googleapis.com/mledu-datasets/california_housing_train.csv')
cf_housing_df['median_house_value'] = cf_housing_df['median_house_value'] / 1000.0
cf_housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66.9
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80.1
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85.7
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73.4
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65.5


## Feature Cross

In [6]:
def get_minmax_based_boundaries(feature_values, num_buckets):
    min_feature_values = feature_values.min()
    max_feature_values = feature_values.max()
    boundaries = np.linspace(min_feature_values, max_feature_values, num_buckets-1)
    return boundaries.tolist()

### 1) bucketized_longitude

In [9]:
longitude = tf.feature_column.numeric_column("longitude")

# Divide longitude into 10 buckets
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=get_minmax_based_boundaries(
        cf_housing_df["longitude"], 10))

In [10]:
feature_columns = [bucketized_longitude]
input_layer = tf.feature_column.input_layer(
    features={'longitude': [-123.3]},
    feature_columns=feature_columns)

In [11]:
with tf.Session() as sess:
    inputs = input_layer.eval()

print(inputs)
print(bucketized_longitude.boundaries)

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
(-124.35, -123.095, -121.84, -120.585, -119.33, -118.075, -116.82, -115.565, -114.31)


### 2) bucketized_latitude

In [12]:
latitude = tf.feature_column.numeric_column("latitude")

# Divide latitude into 10 buckets
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=get_minmax_based_boundaries(
        cf_housing_df['latitude'], 10))

In [13]:
feature_columns = [bucketized_latitude]
input_layer = tf.feature_column.input_layer(
    features={'latitude': [33]},
    feature_columns=feature_columns)

In [14]:
with tf.Session() as sess:
    inputs = input_layer.eval()

print(inputs)
print(bucketized_latitude.boundaries)

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
(32.54, 33.71625, 34.8925, 36.06875, 37.245000000000005, 38.42125, 39.597500000000004, 40.77375000000001, 41.95)


### 3) long_x_lat - crossed feature

`one-hot index = feature-cross % hash_bucket_size` 

In [23]:
# feature cross
long_x_lat = tf.feature_column.crossed_column(
    [bucketized_longitude, bucketized_latitude], 10*10)

# crossed feature -> one-hot(indicator column)
embed_long_x_lat = tf.feature_column.indicator_column(long_x_lat)

In [24]:
feature_columns = [embed_long_x_lat]

input_layer = tf.feature_column.input_layer(
    features={'longitude': [-130], 'latitude': [10]},
    feature_columns=feature_columns)

In [25]:
with tf.Session() as sess:
    inputs = input_layer.eval()
    
inputs

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)

In [26]:
inputs.shape

(1, 100)

### 4) embedding long_x_lat

In [18]:
embed_long_x_lat = tf.feature_column.embedding_column(
    long_x_lat, 32)

In [19]:
feature_columns = [embed_long_x_lat]

input_layer = tf.feature_column.input_layer(
    features={'longitude': [-130], 'latitude': [10]},
    feature_columns=feature_columns)

In [21]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    inputs = input_layer.eval()
    
inputs

array([[-0.0574039 , -0.20360339, -0.02098681,  0.23285338,  0.13189551,
        -0.17860322, -0.2739333 , -0.10702775, -0.34425932, -0.25013763,
        -0.00682505, -0.03940776,  0.09334876,  0.08902777, -0.09327023,
        -0.34497663,  0.1810162 , -0.2107506 , -0.2190317 ,  0.06281308,
        -0.0877175 ,  0.10105263,  0.24090125,  0.0499248 , -0.2150563 ,
        -0.20175344,  0.13349497,  0.13180313,  0.02145167,  0.3244529 ,
        -0.06945614,  0.28835878]], dtype=float32)

In [22]:
inputs.shape

(1, 32)

## Feature Column in Low-Level API

In [27]:
cf_housing_df = pd.read_csv('https://storage.googleapis.com/mledu-datasets/california_housing_train.csv')
cf_housing_df['median_house_value'] = cf_housing_df['median_house_value'] / 1000.0
cf_housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66.9
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80.1
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85.7
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73.4
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65.5


In [28]:
columns = cf_housing_df.columns.tolist()[:-1]

for column in columns:
    cf_housing_df[column] = cf_housing_df[column].apply(lambda row: [row])
    
cf_housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,[-114.31],[34.19],[15.0],[5612.0],[1283.0],[1015.0],[472.0],[1.4936],66.9
1,[-114.47],[34.4],[19.0],[7650.0],[1901.0],[1129.0],[463.0],[1.82],80.1
2,[-114.56],[33.69],[17.0],[720.0],[174.0],[333.0],[117.0],[1.6509],85.7
3,[-114.57],[33.64],[14.0],[1501.0],[337.0],[515.0],[226.0],[3.1917],73.4
4,[-114.57],[33.57],[20.0],[1454.0],[326.0],[624.0],[262.0],[1.925],65.5


In [29]:
train_x, train_y = cf_housing_df[:12000], cf_housing_df['median_house_value'][:12000]
test_x, test_y = cf_housing_df[12000:], cf_housing_df['median_house_value'][12000:]
train_y, test_y = train_y.values.reshape(-1, 1), test_y.values.reshape(-1, 1)

In [30]:
def get_quantile_based_boundaries(feature_values, num_buckets):
    boundaries = np.arange(1.0, num_buckets) / num_buckets
    quantiles = feature_values.quantile(boundaries)
    return [quantiles[q] for q in quantiles.keys()]

In [31]:
longitude = tf.feature_column.numeric_column("longitude")
latitude = tf.feature_column.numeric_column("latitude")
housing_median_age = tf.feature_column.numeric_column("housing_median_age")
households = tf.feature_column.numeric_column("households")
median_income = tf.feature_column.numeric_column("median_income")


# Divide longitude into 10 buckets.
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=get_quantile_based_boundaries(
      train_x["longitude"].apply(lambda x: x[0]), 10))

# Divide latitude into 10 buckets.
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=get_quantile_based_boundaries(
      train_x["latitude"].apply(lambda x: x[0]), 10))

# Divide housing_median_age into 7 buckets.
bucketized_housing_median_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries=get_quantile_based_boundaries(
      train_x["housing_median_age"].apply(lambda x: x[0]), 7))

# Divide households into 7 buckets.
bucketized_households = tf.feature_column.bucketized_column(
    households, boundaries=get_quantile_based_boundaries(
      train_x["households"].apply(lambda x: x[0]), 7))

# Divide median_income into 7 buckets.
bucketized_median_income = tf.feature_column.bucketized_column(
    median_income, boundaries=get_quantile_based_boundaries(
      train_x["median_income"].apply(lambda x: x[0]), 7))

# Feature Cross: longitude x latitude
long_x_lat = tf.feature_column.crossed_column(
    set([bucketized_longitude, bucketized_latitude]), hash_bucket_size=10*10)

# crossed feature -> one-hot(indicator column)
embed_long_x_lat = tf.feature_column.embedding_column(
    long_x_lat, 32)

In [32]:
features = {
    'longitude': train_x['longitude'].tolist(),
    'latitude': train_x['latitude'].tolist(),
    'housing_median_age': train_x['housing_median_age'].tolist(),
    'households': train_x['households'].tolist(),
    'median_income': train_x['median_income'].tolist() 
}

feature_columns = set([
    bucketized_longitude,  # 10
    bucketized_latitude,  # 10
    bucketized_housing_median_age,  # 7
    bucketized_households,  # 7
    bucketized_median_income])  # 7 total: 41

# embedding column
features_emb = {
    'longitude': train_x['longitude'].tolist(),
    'latitude': train_x['latitude'].tolist()
}

feature_columns_emb = [
    embed_long_x_lat
]

In [33]:
inputs = tf.feature_column.input_layer(
    features=features,
    feature_columns=feature_columns)

inputs_emb = tf.feature_column.input_layer(
    features=features_emb,
    feature_columns=feature_columns_emb)

labels = tf.placeholder(tf.float32, [12000, 1])

In [34]:
W = tf.Variable(
        tf.truncated_normal([41, 1], stddev=0.1))
b = tf.Variable(tf.zeros([1]))

W_emb = tf.Variable(
            tf.truncated_normal([32, 1], stddev=0.1))
b_emb = tf.Variable(tf.zeros([1]))

predictions = (tf.matmul(inputs, W) + b) + (tf.matmul(inputs_emb, W_emb) + b_emb)

# predictions = tf.layers.dense(inputs, 1, 
#                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

# loss & optimizer
rmse = tf.sqrt(
        tf.losses.mean_squared_error(labels=labels, predictions=predictions))
train_op = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(rmse)

In [35]:
num_epoch = 300

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epoch):
        loss, _ = sess.run([rmse, train_op], feed_dict={labels: train_y})
        
        if (epoch+1) % 30 == 0:
            print('Epoch: {:03d}, RMSE: {:.4f}'.format(epoch+1, loss))
            
    weights = W.eval()
    bias = b.eval()

Epoch: 030, RMSE: 115.3691
Epoch: 060, RMSE: 83.3975
Epoch: 090, RMSE: 81.2736
Epoch: 120, RMSE: 79.7571
Epoch: 150, RMSE: 78.4202
Epoch: 180, RMSE: 77.2000
Epoch: 210, RMSE: 76.0732
Epoch: 240, RMSE: 75.0282
Epoch: 270, RMSE: 74.0577
Epoch: 300, RMSE: 73.1566


In [36]:
weights.shape

(41, 1)

In [37]:
bias

array([25.55541], dtype=float32)

In [39]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    print(inputs_emb.eval())

[[-0.21446897  0.13240804 -0.01478696 ...  0.0242057   0.11872536
  -0.1384667 ]
 [ 0.11912717  0.28215304 -0.13007393 ...  0.23899429  0.03874798
  -0.01086585]
 [-0.03113198  0.2635985  -0.08466335 ... -0.22871529  0.11032477
  -0.00480296]
 ...
 [ 0.26749125 -0.06571634  0.04438065 ... -0.2642599   0.07323352
   0.19872226]
 [ 0.26749125 -0.06571634  0.04438065 ... -0.2642599   0.07323352
   0.19872226]
 [ 0.26749125 -0.06571634  0.04438065 ... -0.2642599   0.07323352
   0.19872226]]
