Categoryencoding (#16)

* needs linting * started category encoding layer * fixed kernal mistake * CategoryEncoding layer definition and export finished. Needs unit testing * CategoryEncoding layer definition and export finished. Needs unit testing * unit tests for category encoding * more unit testing * passing all unit tests for category encoding * done with category encoding, working on linting * passing linter
CodeSmithDSMLProjects · Sep 24, 2022 · 9f29a29 · 9f29a29
1 parent aa5338f
commit 9f29a29
Show file tree

Hide file tree

Showing 7 changed files with 446 additions and 4 deletions.
diff --git a/tfjs-layers/src/exports_layers.ts b/tfjs-layers/src/exports_layers.ts
@@ -25,7 +25,7 @@ import {AveragePooling1D, AveragePooling2D, AveragePooling3D, GlobalAveragePooli
 import {GRU, GRUCell, GRUCellLayerArgs, GRULayerArgs, LSTM, LSTMCell, LSTMCellLayerArgs, LSTMLayerArgs, RNN, RNNCell, RNNLayerArgs, SimpleRNN, SimpleRNNCell, SimpleRNNCellLayerArgs, SimpleRNNLayerArgs, StackedRNNCells, StackedRNNCellsArgs} from './layers/recurrent';
 import {Bidirectional, BidirectionalLayerArgs, TimeDistributed, WrapperLayerArgs} from './layers/wrappers';
 import { Rescaling, RescalingArgs } from './layers/preprocessing/image_preprocessing';
-
+import { CategoryEncoding, CategoryEncodingArgs } from './layers/preprocessing/category_encoding';
 // TODO(cais): Add doc string to all the public static functions in this
 //   class; include exectuable JavaScript code snippets where applicable
 //   (b/74074458).
@@ -1729,3 +1729,44 @@ export function masking(args?: MaskingArgs) {
 export function rescaling(args?: RescalingArgs) {
   return new Rescaling(args);
 }
+
+/**
+ * A preprocessing layer which encodes integer features.
+ *
+ * This layer provides options for condensing data into a categorical encoding
+ * when the total number of tokens is known in advance. It accepts integer
+ * values as inputs, and it outputs a dense or sparse representation of those
+ * inputs.
+ *
+ * Arguments:
+ *
+ * numTokens: The total number of tokens the layer should support. All
+ *  inputs to the layer must be integers in the range `0 <= value <
+ *  numTokens`, or an error will be thrown.
+ *
+ * outputMode: Specification for the output of the layer.
+ *  Defaults to `"multiHot"`. Values can be `"oneHot"`, `"multiHot"` or
+ *  `"count"`, configuring the layer as follows:
+ *
+ *    oneHot: Encodes each individual element in the input into an
+ *      array of `numTokens` size, containing a 1 at the element index. If
+ *      the last dimension is size 1, will encode on that dimension. If the
+ *      last dimension is not size 1, will append a new dimension for the
+ *      encoded output.
+ *
+ *    multiHot: Encodes each sample in the input into a single array
+ *     of `numTokens` size, containing a 1 for each vocabulary term
+ *     present in the sample. Treats the last dimension as the sample
+ *     dimension: if the input shape is `(..., sample_length)`, the output
+ *     shape will be `(..., numTokens)`.
+ *
+ *    count: Like `"multiHot"`, but the int array contains a count of
+ *     the number of times the token at that index appeared in the sample.
+ *
+ *  For all output modes, currently only output up to rank 2 is supported.
+ *
+ * @doc {heading: 'Layers', subheading: 'CategoryEncoding', namespace: 'layers'}
+ */
+export function categoryEncoding(args: CategoryEncodingArgs) {
+  return new CategoryEncoding(args);
+}
diff --git a/tfjs-layers/src/layers/preprocessing/category_encoding.ts b/tfjs-layers/src/layers/preprocessing/category_encoding.ts
@@ -0,0 +1,114 @@
+/**
+ * @license
+ * Copyright 2022 CodeSmith LLC
+ *
+ * Use of this source code is governed by an MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT.
+ * =============================================================================
+ */
+
+import { LayerArgs, Layer } from '../../engine/topology';
+import { serialization, Tensor, tidy, Tensor1D, Tensor2D} from '@tensorflow/tfjs-core';
+import { greater, greaterEqual, max, min} from '@tensorflow/tfjs-core';
+import { Shape } from '../../keras_format/common';
+import { getExactlyOneShape, getExactlyOneTensor } from '../../utils/types_utils';
+import { Kwargs } from '../../types';
+import { ValueError } from '../../errors';
+import * as K from '../../backend/tfjs_backend';
+import * as utils from './preprocessing_utils';
+
+export declare interface CategoryEncodingArgs extends LayerArgs {
+  numTokens: number;  // Token count; the layer passes it on as encoding depth.
+  outputMode?: string;  // Encoding mode; layer uses utils.multiHot if unset.
+ }
+
+/**
+ * Preprocessing layer that encodes integer category indices into vectors of
+ * length `numTokens` according to `outputMode`.
+ *
+ * The encoding itself is delegated to `utils.encodeCategoricalInputs`; this
+ * class casts and range-checks the inputs, validates the optional
+ * `countWeights` kwarg, and reports the resulting output shape.
+ */
+export class CategoryEncoding extends Layer {
+  /** @nocollapse */
+  static className = 'CategoryEncoding';
+  private readonly numTokens: number;
+  private readonly outputMode: string;
+
+  /**
+   * @param args Layer arguments. `args.outputMode` falls back to
+   *     `utils.multiHot` when it is missing or empty.
+   */
+  constructor(args: CategoryEncodingArgs) {
+    super(args);
+    this.numTokens = args.numTokens;
+    this.outputMode = args.outputMode ? args.outputMode : utils.multiHot;
+  }
+
+  /**
+   * Returns the layer config: `numTokens` and `outputMode` merged with the
+   * base layer config (base entries win on key collisions).
+   */
+  getConfig(): serialization.ConfigDict {
+    const config: serialization.ConfigDict = {
+      'numTokens': this.numTokens,
+      'outputMode': this.outputMode,
+    };
+
+    const baseConfig = super.getConfig();
+    Object.assign(config, baseConfig);
+    return config;
+  }
+
+  /**
+   * Computes the output shape for a given input shape.
+   *
+   * - Missing or empty (scalar) shape: `[numTokens]`.
+   * - `oneHot` with a last dimension other than 1: a new trailing dimension
+   *   of size `numTokens` is appended.
+   * - Otherwise the last dimension is replaced by `numTokens`.
+   *
+   * The incoming shape array is never mutated; a new array is returned.
+   */
+  computeOutputShape(inputShape: Shape|Shape[]): Shape|Shape[] {
+    const shape = getExactlyOneShape(inputShape);
+
+    if (shape == null || shape.length === 0) {
+      return [this.numTokens];
+    }
+
+    // JS arrays have no negative indexing (`shape[-1]` is always undefined),
+    // so the last axis has to be addressed explicitly.
+    const lastAxis = shape.length - 1;
+
+    if (this.outputMode === utils.oneHot && shape[lastAxis] !== 1) {
+      return shape.concat([this.numTokens]);
+    }
+
+    return shape.slice(0, lastAxis).concat([this.numTokens]);
+  }
+
+  /**
+   * Encodes `inputs` according to `outputMode`.
+   *
+   * @param inputs Exactly one tensor of category indices; it is cast to
+   *     `int32` when it has another dtype.
+   * @param kwargs May carry `countWeights` (a rank-1 or rank-2 tensor), which
+   *     is only accepted when `outputMode` is `utils.count`.
+   * @returns The result of `utils.encodeCategoricalInputs`.
+   * @throws ValueError if `countWeights` is given for a non-count mode, or if
+   *     any input value lies outside `0 <= value < numTokens`.
+   */
+  call(inputs: Tensor|Tensor[], kwargs: Kwargs): Tensor[]|Tensor {
+    return tidy(() => {
+      let input = getExactlyOneTensor(inputs);
+      if (input.dtype !== 'int32') {
+        input = K.cast(input, 'int32');
+      }
+
+      // Tolerate a missing kwargs object when call() is invoked directly.
+      const rawCountWeights =
+          kwargs == null ? undefined : kwargs['countWeights'];
+      let countWeights: Tensor1D|Tensor2D;
+
+      if ((typeof rawCountWeights) !== 'undefined') {
+        if (this.outputMode !== utils.count) {
+          throw new ValueError(
+            `countWeights is not used when outputMode !== count.
+             Received countWeights=${kwargs['countWeights']}`);
+        }
+        const countWeightsRanked = getExactlyOneTensor(rawCountWeights);
+
+        // Only rank-1 and rank-2 weights are forwarded; any other rank
+        // leaves countWeights undefined, as before.
+        if (countWeightsRanked.rank === 1) {
+          countWeights = countWeightsRanked as Tensor1D;
+        } else if (countWeightsRanked.rank === 2) {
+          countWeights = countWeightsRanked as Tensor2D;
+        }
+      }
+
+      const depth = this.numTokens;
+      const maxValue = max(input);
+      const minValue = min(input);
+
+      // Valid indices satisfy 0 <= value < numTokens.
+      const maxBelowDepth = greater(depth, maxValue).bufferSync().get(0);
+      const minNonNegative = greaterEqual(minValue, 0).bufferSync().get(0);
+
+      if (!(maxBelowDepth && minNonNegative)) {
+        // NOTE(review): the enforced range is 0 <= value < numTokens; the
+        // message text is kept verbatim because the unit tests match on it.
+        throw new ValueError(
+        `Input values must be between 0 < values <= numTokens`);
+      }
+
+      return utils.encodeCategoricalInputs(
+          input, this.outputMode, depth, countWeights);
+    });
+  }
+}
+
+serialization.registerClass(CategoryEncoding);
diff --git a/tfjs-layers/src/layers/preprocessing/category_encoding_test.ts b/tfjs-layers/src/layers/preprocessing/category_encoding_test.ts
@@ -0,0 +1,141 @@
+import { describeMathCPUAndGPU, expectTensorsClose} from '../../utils/test_utils';
+import { Tensor, tensor} from '@tensorflow/tfjs-core';
+import { CategoryEncoding } from './category_encoding';
+import * as utils from './preprocessing_utils';
+
+// Unit tests for the CategoryEncoding layer: one case per output mode
+// (count, oneHot, multiHot) and input rank (0, 1, 2), followed by the
+// error paths (input rank > 2 and out-of-range input values).
+describeMathCPUAndGPU('Layer Output', () => {
+
+  it('Calculates correct output for Count outputMode rank 0', () => {
+    const categoryData = tensor(0);
+    const expectedOutput = tensor([1,0,0,0]);
+    const numTokens = 4;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                outputMode: utils.count});
+    const computedOutput = encodingLayer.
+                          apply(categoryData) as Tensor;
+
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output for Count outputMode rank 1 (weights)', () => {
+    const categoryData = tensor([1, 2, 3, 3, 0]);
+    const weightData = tensor([1, 2, 3, 1, 7]);
+    const numTokens = 6;
+    // Each slot accumulates the weights of the inputs that map to it,
+    // e.g. token 3 appears with weights 3 and 1, giving 4.
+    const expectedOutput = tensor([7, 1, 2, 4, 0, 0]);
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                            outputMode: utils.count});
+
+    const computedOutput = encodingLayer.apply(categoryData,
+                      {countWeights: weightData}) as Tensor;
+
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output for Count outputMode rank 2', () => {
+    const categoryData   = tensor([[1, 2, 3, 1], [0, 3, 1, 0]]);
+    const expectedOutput = tensor([[0, 2, 1, 1, 0, 0], [2, 1, 0, 1, 0, 0]]);
+    const numTokens = 6;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                            outputMode: utils.count});
+    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output for oneHot outputMode rank 0', () => {
+    const categoryData = tensor(3);
+    const expectedOutput = tensor([0, 0, 0, 1]);
+    const numTokens = 4;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                          outputMode: utils.oneHot});
+    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output and shape for oneHot outputMode rank 1', () => {
+    const categoryData   = tensor([3, 2, 0, 1]);
+    const expectedOutput = tensor([[0, 0, 0, 1],
+                                   [0, 0, 1, 0],
+                                   [1, 0, 0, 0],
+                                   [0, 1, 0, 0]]);
+    const numTokens = 4;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                          outputMode: utils.oneHot});
+    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output and shape for oneHot outputMode rank 2', () => {
+    const categoryData   = tensor([[3], [2], [0], [1]]);
+    const expectedOutput = tensor([[0, 0, 0, 1],
+                                   [0, 0, 1, 0],
+                                   [1, 0, 0, 0],
+                                   [0, 1, 0, 0]]);
+    const numTokens = 4;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                          outputMode: utils.oneHot});
+    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output for multiHot outputMode rank 0', () => {
+    const categoryData = tensor(3);
+    const expectedOutput = tensor([0, 0, 0, 1, 0, 0]);
+    const numTokens = 6;
+    // This case previously built the layer with utils.oneHot, so the
+    // multiHot rank-0 path named in the title was never exercised.
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                outputMode: utils.multiHot});
+    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output for multiHot outputMode rank 1', () => {
+    const categoryData   = tensor([3, 2, 0, 1]);
+    const expectedOutput = tensor([1, 1, 1, 1, 0, 0]);
+    const numTokens = 6;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                        outputMode: utils.multiHot});
+    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Calculates correct output for multiHot outputMode rank 2', () => {
+    const categoryData   = tensor([[0, 1], [0, 0], [1, 2], [3, 1]]);
+    const expectedOutput = tensor([[1, 1, 0, 0],
+                                   [1, 0, 0, 0],
+                                   [0, 1, 1, 0],
+                                   [0, 1, 0, 1]]);
+    const numTokens = 4;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                        outputMode: utils.multiHot});
+    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
+    expectTensorsClose(computedOutput, expectedOutput);
+  });
+
+  it('Raises Value Error if input Tensor has Rank > 2', () =>{
+    const categoryData = tensor([[[1], [2]], [[3], [4]]]);
+    const numTokens = 6;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                        outputMode: utils.multiHot});
+    // The expected text (including its line breaks) must match the error
+    // message exactly, so the template literal is left untouched.
+    expect(() => encodingLayer.apply(categoryData))
+    .toThrowError(`When outputMode is not 'int', maximum output rank is 2
+    Received outputMode ${utils.multiHot} and input shape ${categoryData.shape}
+    which would result in output rank ${categoryData.rank}.`);
+  });
+
+  it('Raises Value Error if max input value !<= numTokens', () => {
+    const categoryData   = tensor([7, 2, 0, 1]);
+    const numTokens = 3;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                        outputMode: utils.multiHot});
+    expect(() => encodingLayer.apply(categoryData))
+    .toThrowError(`Input values must be between 0 < values <= numTokens`);
+  });
+
+  it('Raises Value Error if min input value < 0', () => {
+    const categoryData   = tensor([7, 2, -1, 1]);
+    const numTokens = 3;
+    const encodingLayer = new CategoryEncoding({numTokens,
+                                        outputMode: utils.multiHot});
+    expect(() => encodingLayer.apply(categoryData))
+    .toThrowError(`Input values must be between 0 < values <= numTokens`);
+  });
+});
diff --git a/tfjs-layers/src/layers/preprocessing/image_preprocessing.ts b/tfjs-layers/src/layers/preprocessing/image_preprocessing.ts
@@ -24,6 +24,7 @@ export declare interface RescalingArgs extends LayerArgs {
  *
  * This rescales images by a scaling and offset factor
  */
+
 export class Rescaling extends Layer {
   /** @nocollapse */
   static className = 'Rescaling';

diff --git a/tfjs-layers/src/layers/preprocessing/image_preprocessing_test.ts b/tfjs-layers/src/layers/preprocessing/image_preprocessing_test.ts
@@ -31,16 +31,16 @@ describeMathCPUAndGPU('Rescaling Layer', () => {
     const expectedOutputTensor = add(mul(intTensor, scale), offset);
     const scalingLayer = new Rescaling({scale, offset});
     const outputTensor = scalingLayer.apply(intTensor) as Tensor;
-    expect(outputTensor.dtype).toBe('float32'); 
-    expectTensorsClose(outputTensor, expectedOutputTensor); 
+    expect(outputTensor.dtype).toBe('float32');
+    expectTensorsClose(outputTensor, expectedOutputTensor);
   });
 
   it('Config holds correct name', () => {
     const scale = 1.0 / 127.5;
     const offset = -1.0;
     const scalingLayer = new Rescaling({scale, offset, name: 'Rescaling'});
     const config = scalingLayer.getConfig();
-    expect(config.name).toEqual('Rescaling'); 
+    expect(config.name).toEqual('Rescaling');
   });
 
 });