# Feature Columns

In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import seaborn as sn
import tensorflow as tf

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from tensorflow.data import Dataset
from sklearn import metrics

# Plot styling: switch on seaborn's defaults.
sn.set()

# Logging: only surface TensorFlow messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# DataFrame display: at most 10 rows, floats with one decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

  return f(*args, **kwds)


In [2]:
# Read the California housing training CSV, then scale the target column
# (median_house_value) down by a factor of 1000.
cf_housing_df = pd.read_csv(
    'https://storage.googleapis.com/mledu-datasets/california_housing_train.csv')
cf_housing_df['median_house_value'] = (
    cf_housing_df['median_house_value'].div(1000.0))
cf_housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66.9
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80.1
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85.7
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73.4
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65.5


## Feature Cross

In [10]:
def get_quantile_based_boundaries(feature_values, num_buckets):
    """Return the quantile cut points that split a feature into buckets.

    Args:
        feature_values: object with a pandas-style `quantile` method; the
            callers in this notebook pass a DataFrame column.
        num_buckets: number of buckets wanted; `num_buckets - 1` cut points
            are produced.

    Returns:
        A list holding the values of `feature_values` at the quantiles
        1/num_buckets, 2/num_buckets, ..., (num_buckets - 1)/num_buckets.
    """
    cut_fractions = np.arange(1.0, num_buckets) / num_buckets
    return list(feature_values.quantile(cut_fractions).values)

In [11]:
# Numeric source columns, keyed by the feature names used in the input dict.
longitude = tf.feature_column.numeric_column(key="longitude")
latitude = tf.feature_column.numeric_column(key="latitude")

# Split longitude into 10 quantile buckets (9 boundaries).
bucketized_longitude = tf.feature_column.bucketized_column(
    source_column=longitude,
    boundaries=get_quantile_based_boundaries(cf_housing_df["longitude"], 10))

# Split latitude into 10 quantile buckets (9 boundaries).
bucketized_latitude = tf.feature_column.bucketized_column(
    source_column=latitude,
    boundaries=get_quantile_based_boundaries(cf_housing_df["latitude"], 10))

### 1) bucketized_longitude

In [15]:
# Input layer that one-hot encodes a single longitude value by bucket.
feature_columns = [bucketized_longitude]
input_layer = tf.feature_column.input_layer(
    feature_columns=feature_columns,
    features=dict(longitude=[-123.3]))

In [16]:
# Run the graph once to materialise the one-hot encoded input as an array.
with tf.Session() as sess:
    inputs = sess.run(input_layer)

# Show the encoding next to the boundaries that produced it.
print(inputs)
print(bucketized_longitude.boundaries)

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
(-122.28, -121.98, -121.36000000000001, -119.87, -118.49, -118.3, -118.12, -117.88, -117.24)


### 2) bucketized_latitude

In [95]:
# Input layer that one-hot encodes a single latitude value by bucket.
feature_columns = [bucketized_latitude]
input_layer = tf.feature_column.input_layer(
    feature_columns=feature_columns,
    features=dict(latitude=[33]))

In [96]:
# Run the graph once to materialise the one-hot encoded input as an array.
with tf.Session() as sess:
    inputs = sess.run(input_layer)

# Show the encoding next to the boundaries that produced it.
print(inputs)
print(bucketized_latitude.boundaries)

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
(33.62, 33.86, 34.0, 34.09, 34.25, 36.64, 37.47, 37.81, 38.48)


### 3) long_x_lat - crossed feature

`one-hot index = hash(feature cross) % hash_bucket_size`

In [137]:
# Cross the bucketized longitude and latitude into one categorical feature,
# hashed into 10 * 10 buckets.
long_x_lat = tf.feature_column.crossed_column(
    keys=[bucketized_longitude, bucketized_latitude],
    hash_bucket_size=10 * 10)

# Wrap the crossed feature in an indicator column so the input layer can
# emit it as a dense one-hot vector.
embed_long_x_lat = tf.feature_column.indicator_column(
    categorical_column=long_x_lat)

In [138]:
# Input layer for the crossed feature; both source features must be supplied.
feature_columns = [embed_long_x_lat]

input_layer = tf.feature_column.input_layer(
    feature_columns=feature_columns,
    features=dict(longitude=[-130], latitude=[10]))

In [139]:
# Run the graph once to get the 100-wide one-hot vector of the cross.
with tf.Session() as sess:
    inputs = sess.run(input_layer)

inputs

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)

## California House

In [140]:
# Raw numeric source columns; each key names the feature it reads from the
# dict passed to the input layer.
households = tf.feature_column.numeric_column("households")
longitude = tf.feature_column.numeric_column("longitude")
latitude = tf.feature_column.numeric_column("latitude")
housing_median_age = tf.feature_column.numeric_column("housing_median_age")
median_income = tf.feature_column.numeric_column("median_income")


# Bucket boundaries are quantiles of the housing data loaded at the top of the
# notebook (cf_housing_df), so the buckets hold roughly equal shares of it.

# Divide households into 7 buckets.
bucketized_households = tf.feature_column.bucketized_column(
    households, boundaries=get_quantile_based_boundaries(
        cf_housing_df["households"], 7))

# Divide longitude into 10 buckets.
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=get_quantile_based_boundaries(
        cf_housing_df["longitude"], 10))

# Divide latitude into 10 buckets.
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=get_quantile_based_boundaries(
        cf_housing_df["latitude"], 10))

# Divide housing_median_age into 7 buckets.
bucketized_housing_median_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries=get_quantile_based_boundaries(
        cf_housing_df["housing_median_age"], 7))

# Divide median_income into 7 buckets.
bucketized_median_income = tf.feature_column.bucketized_column(
    median_income, boundaries=get_quantile_based_boundaries(
        cf_housing_df["median_income"], 7))

# Feature cross: longitude x latitude, hashed into 10 * 10 buckets.
# The keys are passed as a list (not a set) so their order is the same on
# every run.
long_x_lat = tf.feature_column.crossed_column(
    [bucketized_longitude, bucketized_latitude], hash_bucket_size=10*10)

In [141]:
# Indicator column: exposes the crossed feature as a dense one-hot vector.
embed_long_x_lat = tf.feature_column.indicator_column(
    categorical_column=long_x_lat)

In [73]:
# Columns fed to the input layer; the one-hot width each contributes is noted
# on the right.
feature_columns = {
    bucketized_longitude,           # 10
    bucketized_latitude,            # 10
    bucketized_housing_median_age,  # 7
    bucketized_households,          # 7
    bucketized_median_income,       # 7
    embed_long_x_lat,               # 100
}  # total: 141

In [74]:
# A single example: one value per source feature used by the columns.
features = dict(
    longitude=[-130],
    latitude=[10],
    housing_median_age=[12],
    households=[190],
    median_income=[2],
)

In [75]:
# Combine all feature columns into one dense input tensor.
input_layer = tf.feature_column.input_layer(
    feature_columns=feature_columns,
    features=features)

In [76]:
# Run the graph once to get the concatenated one-hot encoding as an array.
with tf.Session() as sess:
    inputs = sess.run(input_layer)

inputs

array([[1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

In [77]:
# Expect 141 columns: 10 + 10 + 7 + 7 + 7 + 100 one-hot slots.
inputs.shape

(1, 141)

## Feature Column in Low-Level API

In [6]:
# Reload the CSV so this section starts from unmodified data, then scale the
# target column (median_house_value) down by a factor of 1000.
cf_housing_df = pd.read_csv(
    'https://storage.googleapis.com/mledu-datasets/california_housing_train.csv')
cf_housing_df['median_house_value'] = (
    cf_housing_df['median_house_value'].div(1000.0))
cf_housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.3,34.2,15.0,5612.0,1283.0,1015.0,472.0,1.5,66.9
1,-114.5,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.8,80.1
2,-114.6,33.7,17.0,720.0,174.0,333.0,117.0,1.7,85.7
3,-114.6,33.6,14.0,1501.0,337.0,515.0,226.0,3.2,73.4
4,-114.6,33.6,20.0,1454.0,326.0,624.0,262.0,1.9,65.5


In [8]:
# Every column except the last one (the target) is treated as a feature.
columns = list(cf_housing_df.columns[:-1])

# Wrap each feature value in a one-element list, in place.
# NOTE: re-running this cell without reloading the data wraps the values again.
for column in columns:
    cf_housing_df[column] = cf_housing_df[column].apply(lambda value: [value])

cf_housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,[-114.31],[34.19],[15.0],[5612.0],[1283.0],[1015.0],[472.0],[1.4936],66.9
1,[-114.47],[34.4],[19.0],[7650.0],[1901.0],[1129.0],[463.0],[1.82],80.1
2,[-114.56],[33.69],[17.0],[720.0],[174.0],[333.0],[117.0],[1.6509],85.7
3,[-114.57],[33.64],[14.0],[1501.0],[337.0],[515.0],[226.0],[3.1917],73.4
4,[-114.57],[33.57],[20.0],[1454.0],[326.0],[624.0],[262.0],[1.925],65.5


In [39]:
# Positional split: the first 12000 rows train, the remainder tests.
# Rows are taken in file order (no shuffling) and both *_x frames still carry
# the median_house_value column.
# NOTE(review): the displayed head suggests the file is ordered by longitude;
# consider shuffling before splitting -- confirm.
train_x = cf_housing_df.iloc[:12000]
test_x = cf_housing_df.iloc[12000:]

# Targets as column vectors of shape (n_rows, 1).
train_y = train_x['median_house_value'].values.reshape(-1, 1)
test_y = test_x['median_house_value'].values.reshape(-1, 1)

In [13]:
def get_quantile_based_boundaries(feature_values, num_buckets):
    """Return the quantile cut points that split a feature into buckets.

    Args:
        feature_values: object with a pandas-style `quantile` method; the
            callers in this notebook pass a DataFrame column.
        num_buckets: number of buckets wanted; `num_buckets - 1` cut points
            are produced.

    Returns:
        A list holding the values of `feature_values` at the quantiles
        1/num_buckets, 2/num_buckets, ..., (num_buckets - 1)/num_buckets.
    """
    cut_fractions = np.arange(1.0, num_buckets) / num_buckets
    return list(feature_values.quantile(cut_fractions).values)

In [20]:
# Raw numeric source columns; each key names the feature it reads from the
# dict passed to the input layer.
longitude = tf.feature_column.numeric_column("longitude")
latitude = tf.feature_column.numeric_column("latitude")
housing_median_age = tf.feature_column.numeric_column("housing_median_age")
households = tf.feature_column.numeric_column("households")
median_income = tf.feature_column.numeric_column("median_income")


# The training frame stores each feature value inside a one-element list, so
# the values are unwrapped with x[0] before the quantiles are computed.

# Divide longitude into 10 buckets.
bucketized_longitude = tf.feature_column.bucketized_column(
    longitude, boundaries=get_quantile_based_boundaries(
        train_x["longitude"].apply(lambda x: x[0]), 10))

# Divide latitude into 10 buckets.
bucketized_latitude = tf.feature_column.bucketized_column(
    latitude, boundaries=get_quantile_based_boundaries(
        train_x["latitude"].apply(lambda x: x[0]), 10))

# Divide housing_median_age into 7 buckets.
bucketized_housing_median_age = tf.feature_column.bucketized_column(
    housing_median_age, boundaries=get_quantile_based_boundaries(
        train_x["housing_median_age"].apply(lambda x: x[0]), 7))

# Divide households into 7 buckets.
bucketized_households = tf.feature_column.bucketized_column(
    households, boundaries=get_quantile_based_boundaries(
        train_x["households"].apply(lambda x: x[0]), 7))

# Divide median_income into 7 buckets.
bucketized_median_income = tf.feature_column.bucketized_column(
    median_income, boundaries=get_quantile_based_boundaries(
        train_x["median_income"].apply(lambda x: x[0]), 7))

# Feature cross: longitude x latitude, hashed into 10 * 10 buckets.
# The keys are passed as a list (not a set) so their order is the same on
# every run.
long_x_lat = tf.feature_column.crossed_column(
    [bucketized_longitude, bucketized_latitude], hash_bucket_size=10*10)

# crossed feature -> one-hot (indicator column)
embed_long_x_lat = tf.feature_column.indicator_column(long_x_lat)

In [24]:
# One entry per source feature; each is the training column converted to a
# plain Python list of the one-element lists stored in train_x.
features = {
    name: train_x[name].tolist()
    for name in ('longitude', 'latitude', 'housing_median_age',
                 'households', 'median_income')
}

# Columns fed to the input layer; the one-hot width each contributes is noted
# on the right.
feature_columns = {
    bucketized_longitude,           # 10
    bucketized_latitude,            # 10
    bucketized_housing_median_age,  # 7
    bucketized_households,          # 7
    bucketized_median_income,       # 7
    embed_long_x_lat,               # 100
}  # total: 141

In [30]:
# Dense input tensor built from the training features and feature columns.
inputs = tf.feature_column.input_layer(
    features=features,
    feature_columns=feature_columns)

# Placeholder for the regression targets. Its shape follows train_y
# (currently (12000, 1)) instead of repeating the split size by hand.
labels = tf.placeholder(tf.float32, train_y.shape)

In [69]:
# Linear model: one weight per input-layer column plus a scalar bias.
# The weight count is read from the input layer's static width (141 with the
# current feature columns) rather than being hard-coded.
W = tf.Variable(
        tf.truncated_normal([int(inputs.shape[-1]), 1], stddev=0.1))
b = tf.Variable(tf.zeros([1]))

predictions = tf.matmul(inputs, W) + b

# Alternative using the layers API (kept for reference, not executed):
# predictions = tf.layers.dense(inputs, 1,
#                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

# loss & optimizer: minimise the root of the mean squared error with plain
# gradient descent.
rmse = tf.sqrt(
        tf.losses.mean_squared_error(labels=labels, predictions=predictions))
train_op = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(rmse)

In [81]:
num_epoch = 300

with tf.Session() as sess:
    # Variables (W, b) must be initialised before the first training step.
    sess.run(tf.global_variables_initializer())

    for epoch in range(num_epoch):
        # One full-batch gradient step; also fetch the loss for reporting.
        _, loss = sess.run([train_op, rmse], feed_dict={labels: train_y})

        # Report every 30th epoch.
        if not (epoch + 1) % 30:
            print('Epoch: {:03d}, RMSE: {:.4f}'.format(epoch + 1, loss))

    # Pull the learned parameters out before the session closes.
    weights, bias = sess.run([W, b])

Epoch: 030, RMSE: 192.0529
Epoch: 060, RMSE: 160.3850
Epoch: 090, RMSE: 135.5469
Epoch: 120, RMSE: 118.3384
Epoch: 150, RMSE: 107.7163
Epoch: 180, RMSE: 101.3966
Epoch: 210, RMSE: 97.3458
Epoch: 240, RMSE: 94.3782
Epoch: 270, RMSE: 91.9437
Epoch: 300, RMSE: 89.8138


In [82]:
# One learned weight per input-layer column.
weights.shape

(141, 1)

In [83]:
# Learned bias of the linear model (a single-element array).
bias

array([117.82721], dtype=float32)