# Classify structured data with feature columns

https://www.tensorflow.org/tutorials/structured_data/feature_columns

## Setup

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [3]:
# Setup memory to fix critical issue
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [4]:
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Use Pandas to create a dataframe

In [6]:
import pathlib
import csv
import sys

dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')
dataframe = pd.read_csv(csv_file)

In [7]:
dataframe.columns

Index(['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize',
       'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'Description',
       'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

In [8]:
for clm in dataframe.columns:
    print(dataframe[clm].max())

Dog
255
Yorkshire Terrier Yorkie
Male
Yellow
Yellow
Small
Short
Yes
Yes
Serious Injury
2000


TypeError: '>=' not supported between instances of 'str' and 'float'

In [None]:
dataframe.head()

In [None]:
dataframe.dtypes

In [None]:
dataframe['Age']=dataframe['Age'].astype(np.int32)
dataframe['Fee']=dataframe['Fee'].astype(np.int32)
dataframe['PhotoAmt']=dataframe['PhotoAmt'].astype(np.int32)
dataframe['AdoptionSpeed']=dataframe['AdoptionSpeed'].astype(np.int32)

In [9]:
dataframe.dtypes

Type             object
Age               int64
Breed1           object
Gender           object
Color1           object
Color2           object
MaturitySize     object
FurLength        object
Vaccinated       object
Sterilized       object
Health           object
Fee               int64
Description      object
PhotoAmt          int64
AdoptionSpeed     int64
dtype: object

In [10]:
my_array=np.array([[1,2,3],[1,5,6],[1,5,7]])

df = pd.DataFrame(my_array, columns = ['Column_A','Column_B','Column_C'])

In [11]:
np.where(df['Column_A']==1, 0, 1)

array([0, 0, 0])

### Create target variable

In [12]:
# In the original dataset "4" indicates the pet was not adopted.
dataframe['target'] = np.where(dataframe['AdoptionSpeed']==4, 0, 1)

# Drop un-used columns.
dataframe = dataframe.drop(columns=['AdoptionSpeed', 'Description'])

### Split the dataframe into train, validation, and test

In [13]:
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

7383 train examples
1846 validation examples
2308 test examples


### Create an input pipeline using tf.data

In [14]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size = len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [15]:
batch_size = 5
train_ds = df_to_dataset(train, batch_size = batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

### Understand the input pipeline

In [16]:
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['Age'])
    print('A batch of targets:', label_batch)

Every feature: ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
A batch of ages: tf.Tensor([ 2  4  1  2 84], shape=(5,), dtype=int64)
A batch of targets: tf.Tensor([1 1 1 0 1], shape=(5,), dtype=int32)


### Demonstrate several types of feature columns

In [17]:
example_batch = next(iter(train_ds))[0]

In [18]:
next(iter(train_ds))[0]

{'Type': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Dog', b'Cat', b'Cat', b'Dog', b'Dog'], dtype=object)>,
 'Age': <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 1,  2,  2,  1, 12], dtype=int64)>,
 'Breed1': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'Mixed Breed', b'Domestic Short Hair', b'Domestic Medium Hair',
        b'Mixed Breed', b'Mixed Breed'], dtype=object)>,
 'Gender': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Female', b'Male', b'Male', b'Male', b'Female'], dtype=object)>,
 'Color1': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Black', b'Black', b'Black', b'Black', b'Black'], dtype=object)>,
 'Color2': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Brown', b'White', b'White', b'Brown', b'No Color'], dtype=object)>,
 'MaturitySize': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Medium', b'Medium', b'Small', b'Small', b'Medium'], dtype=object)>,
 'FurLength': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Long', b'

In [19]:
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

### Numeric columns

In [20]:
photo_count = feature_column.numeric_column('PhotoAmt')

### Bucketized columns

In [21]:
age = feature_column.numeric_column('Age')
print(age)
age_buckets = feature_column.bucketized_column(age, boundaries=[1, 3, 5])
print(demo(age_buckets))

NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)
[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]
None


### Categorical columns

In [22]:
animal_type = feature_column.categorical_column_with_vocabulary_list('Type', ['Cat', 'Dog'])
animal_type_one_hot = feature_column.indicator_column(animal_type)
demo(animal_type_one_hot)

[[1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]


In [23]:
animal_type.dtype

tf.string

### Embedding columns

ATTENTION: it requires conda env in windows to avoid crash

In [24]:
breed1 = feature_column.categorical_column_with_vocabulary_list('Breed1', dataframe.Breed1.unique())
breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
demo(breed1_embedding)

[[ 0.4344645   0.22324537  0.22143054 -0.07429672 -0.4814776  -0.5911334
  -0.13444236  0.12345546]
 [-0.01660107  0.09167581 -0.3108774   0.15297621 -0.47395968 -0.2548532
   0.24260052  0.3221438 ]
 [-0.21155676 -0.21912909  0.6234966   0.2604484   0.05350488  0.01931564
   0.20494556  0.11478563]
 [-0.01660107  0.09167581 -0.3108774   0.15297621 -0.47395968 -0.2548532
   0.24260052  0.3221438 ]
 [-0.01660107  0.09167581 -0.3108774   0.15297621 -0.47395968 -0.2548532
   0.24260052  0.3221438 ]]


### Hashed feature columns

In [25]:
breed1_hashed = feature_column.categorical_column_with_hash_bucket('Breed1', hash_bucket_size = 10)
demo(feature_column.indicator_column(breed1_hashed))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


### Crossed feature columns

In [26]:
# OverflowError: Python int too large to convert to C long
crossed_feature = feature_column.crossed_column([age_buckets, animal_type], hash_bucket_size=1)
layers.DenseFeatures(feature_column.indicator_column(crossed_feature))

<tensorflow.python.keras.feature_column.dense_features_v2.DenseFeatures at 0x15629328550>

### Choose which columns to use

In [33]:
feature_columns = []

for header in ['PhotoAmt', 'Fee', 'Age']:
    feature_columns.append(feature_column.numeric_column(header))

In [34]:
# bucketized cols
age = feature_column.numeric_column('Age')
age_buckets = feature_column.bucketized_column(age, boundaries=[1,2,3,4,5])
feature_columns.append(age_buckets)

In [35]:
# indicator_columns
indicator_column_names = ['Type', 'Color1', 'Color2', 'Gender', 'MaturitySize',
                          'FurLength', 'Vaccinated', 'Sterilized', 'Health']
for col_name in indicator_column_names:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, dataframe[col_name].unique())
    indicator_column = feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)

In [36]:
# embedding columns
breed1 = feature_column.categorical_column_with_vocabulary_list(
      'Breed1', dataframe.Breed1.unique())
breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
feature_columns.append(breed1_embedding)

In [37]:
# crossed columns
age_type_feature = feature_column.crossed_column([age_buckets, animal_type], hash_bucket_size=100)
feature_columns.append(feature_column.indicator_column(age_type_feature))

### Create a feature layer

In [38]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [39]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

## Create and train model

In [40]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

Epoch 1/10
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x15629318c70>

In [41]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.7409012317657471
