Code Attribution: https://towardsdatascience.com/building-a-one-hot-encoding-layer-with-tensorflow-f907d686bf39

In [1]:
import numpy as np
import pandas as pd 

In [2]:
from sklearn.preprocessing import OneHotEncoder

In [3]:
import tensorflow as tf 
from tensorflow.keras import layers, models

In [4]:
# First create a default OneHotEncoder to use as a reference:
# build a simple dataframe, then encode it with the sklearn one hot encoder.
colors_df = pd.DataFrame(data=[['red'],['blue'],['green'],['blue']], columns=['color'])
print('No One Hot Encoding: ')
print(colors_df.head())

# scikit-learn renamed the `sparse` keyword to `sparse_output` and later
# removed the old name. Estimator constructors reject unknown keywords with
# a TypeError, so try the new spelling first and fall back to the old one.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder.fit(colors_df)

# The dense array is kept under its own name so that `ref_encoded_df` only
# ever refers to a DataFrame.
ref_encoded = one_hot_encoder.transform(colors_df)

# The ref encoded dataframe will serve as the reference.
# `categories_` holds one array per fitted column; index the single column
# so the labels become a flat Index instead of a one-level MultiIndex.
ref_encoded_df = pd.DataFrame(data=ref_encoded, columns=one_hot_encoder.categories_[0])
print('One Hot Encoding: ')
ref_encoded_df.head()

No One Hot Encoding: 
   color
0    red
1   blue
2  green
3   blue
One Hot Encoding: 


Unnamed: 0,blue,green,red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0


In [5]:
#This will serve as a custom layer for One Hot Encoding Categorical Features
# adapt() fits the layer to the categorical inputs
# call() invokes the layer and One Hot Encodes the inputs based on the vocabulary learned during the call to adapt()
# get_config() returns a configuration dict that represents the state of the layer. This includes the vocabulary (e.g. ['red','green','blue']), the depth (or the number of unique categories), and the minimum index.
class OneHotEncodingLayer(layers.experimental.preprocessing.PreprocessingLayer):
    """One-hot encode a single categorical string feature inside a Keras model.

    A ``TextVectorization`` sub-layer (one token per sample) maps each input
    string to an integer index. ``call()`` shifts that index by ``minimum``
    and expands it with ``tf.one_hot`` into a vector of length ``depth``.

    Args:
        vocabulary: optional sequence of tokens loaded into the vectorizer,
            e.g. the ``'vocabulary'`` entry of a previous ``get_config()``.
        depth: length of each one-hot vector. ``adapt()`` sets it to the
            length of the vectorizer's vocabulary.
        minimum: smallest index the vectorizer assigns to a vocabulary entry;
            it is subtracted from every index before one-hot encoding.
        **kwargs: standard Keras layer arguments (``name``, ``dtype``, ...)
            forwarded to the base class.

    NOTE(review): ``depth`` is ``len(get_vocabulary())``. Whether that list
    contains padding / out-of-vocabulary placeholder entries appears to
    depend on the TensorFlow version -- TODO confirm, because such entries
    widen the encoding beyond the number of real categories.
    """

    def __init__(self, vocabulary=None, depth=None, minimum=None, **kwargs):
        # Forward name/dtype/trainable so the layer can be rebuilt from the
        # dictionary returned by get_config().
        super().__init__(**kwargs)
        self.vectorization = layers.experimental.preprocessing.TextVectorization(output_sequence_length=1)

        # Explicit None/length check so array-like vocabularies are accepted
        # (plain truthiness is ambiguous for NumPy arrays).
        if vocabulary is not None and len(vocabulary) > 0:
            self.vectorization.set_vocabulary(vocabulary)
        self.depth = depth
        self.minimum = minimum

    def adapt(self, data):
        """Learn the vocabulary from ``data`` and derive ``depth`` and ``minimum``."""
        self.vectorization.adapt(data)
        vocab = self.vectorization.get_vocabulary()
        self.depth = len(vocab)
        # Run every vocabulary entry back through the vectorizer to find the
        # smallest index it hands out.
        indices = [row[0] for row in self.vectorization([[token] for token in vocab]).numpy()]
        # Store a plain Python int so get_config() stays JSON-serializable.
        self.minimum = int(min(indices))

    def call(self, inputs):
        """Return the one-hot encoding of ``inputs``, shaped ``(-1, depth)``.

        Raises:
            RuntimeError: if ``depth`` or ``minimum`` has not been set, i.e.
                the layer was neither adapted nor configured explicitly.
        """
        if self.depth is None or self.minimum is None:
            raise RuntimeError(
                'OneHotEncodingLayer has no depth/minimum; call adapt() or '
                'pass vocabulary, depth and minimum to the constructor first.'
            )
        # Invoke the sub-layer through __call__ rather than .call() so Keras
        # performs its usual per-call bookkeeping.
        vectorized = self.vectorization(inputs)
        subtracted = tf.subtract(vectorized, tf.constant([self.minimum], dtype=tf.int64))
        encoded = tf.one_hot(subtracted, self.depth)
        # Collapse the length-1 sequence axis with a plain op; instantiating
        # a Reshape layer here would create a new layer object on every call.
        return tf.reshape(encoded, (-1, self.depth))

    def get_config(self):
        """Return the base layer config plus vocabulary, depth and minimum."""
        config = super().get_config()
        config.update({
            'vocabulary': self.vectorization.get_vocabulary(),
            'depth': self.depth,
            'minimum': self.minimum,
        })
        return config

In [6]:
# Build the one-hot encoding as part of a neural network.

colors_df = pd.DataFrame({'id': [1, 2, 3, 4], 'color': ['red', 'blue', 'green', 'blue']})

# String input carrying one color per sample. The encoding layer learns its
# vocabulary from the color column before it is wired to that input.
categorical_input = layers.Input(shape=(1,), dtype=tf.string)
one_hot_layer = OneHotEncodingLayer()
one_hot_layer.adapt(colors_df['color'].to_numpy())
encoded = one_hot_layer(categorical_input)

# Numeric input for the id column, passed through unchanged.
numeric_input = layers.Input(shape=(1,), dtype=tf.float32)

# Final tensor: the numeric feature followed by the one-hot columns.
concat = layers.concatenate([numeric_input, encoded])

In [8]:
# Wrap the two inputs and the concatenated output in a model; compile() is
# called with no arguments because the model is only used for predict().
model_inputs = [numeric_input, categorical_input]
model = models.Model(inputs=model_inputs, outputs=[concat])
model.compile()

# Feed the columns in the same order as the model inputs: id first, then color.
feature_columns = [colors_df['id'], colors_df['color']]
NN_encoded_predicted = model.predict(feature_columns)
print(NN_encoded_predicted)

[[1. 0. 0. 0. 1. 0.]
 [2. 0. 0. 1. 0. 0.]
 [3. 0. 0. 0. 0. 1.]
 [4. 0. 0. 1. 0. 0.]]


In [9]:
# Show the sklearn reference encoding followed by the network's output.
# In the recorded output the network result carries the id column plus five
# one-hot columns, versus three columns in the reference.
for encoding in (ref_encoded_df, NN_encoded_predicted):
    print(encoding)

  blue green  red
0  0.0   0.0  1.0
1  1.0   0.0  0.0
2  0.0   1.0  0.0
3  1.0   0.0  0.0
[[1. 0. 0. 0. 1. 0.]
 [2. 0. 0. 1. 0. 0.]
 [3. 0. 0. 0. 0. 1.]
 [4. 0. 0. 1. 0. 0.]]
