Skip to content

Commit

Permalink
Categoryencoding (#16)
Browse files Browse the repository at this point in the history
* needs linting

* started category encoding layer

* fixed kernel mistake

* CategoryEncoding layer definition and export finished. Needs unit testing

* CategoryEncoding layer definition and export finished. Needs unit testing

* unit tests for category encoding

* more unit testing

* passing all unit tests for category encoding

* done with category encoding, working on linting

* passing linter
  • Loading branch information
AdamLang96 committed Sep 24, 2022
1 parent aa5338f commit 9f29a29
Show file tree
Hide file tree
Showing 7 changed files with 446 additions and 4 deletions.
43 changes: 42 additions & 1 deletion tfjs-layers/src/exports_layers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import {AveragePooling1D, AveragePooling2D, AveragePooling3D, GlobalAveragePooli
import {GRU, GRUCell, GRUCellLayerArgs, GRULayerArgs, LSTM, LSTMCell, LSTMCellLayerArgs, LSTMLayerArgs, RNN, RNNCell, RNNLayerArgs, SimpleRNN, SimpleRNNCell, SimpleRNNCellLayerArgs, SimpleRNNLayerArgs, StackedRNNCells, StackedRNNCellsArgs} from './layers/recurrent';
import {Bidirectional, BidirectionalLayerArgs, TimeDistributed, WrapperLayerArgs} from './layers/wrappers';
import { Rescaling, RescalingArgs } from './layers/preprocessing/image_preprocessing';

import { CategoryEncoding, CategoryEncodingArgs } from './layers/preprocessing/category_encoding';
// TODO(cais): Add doc string to all the public static functions in this
// class; include executable JavaScript code snippets where applicable
// (b/74074458).
Expand Down Expand Up @@ -1729,3 +1729,44 @@ export function masking(args?: MaskingArgs) {
export function rescaling(args?: RescalingArgs) {
  // Thin factory: forwards the (optional) args straight to the Rescaling
  // layer constructor.
  const layer = new Rescaling(args);
  return layer;
}

/**
 * A preprocessing layer which encodes integer features.
 *
 * This layer provides options for condensing data into a categorical encoding
 * when the total number of tokens is known in advance. It accepts integer
 * values as inputs, and it outputs a dense or sparse representation of those
 * inputs.
 *
 * Arguments:
 *
 * numTokens: The total number of tokens the layer should support. All
 *   inputs to the layer must be integers in the range
 *   `0 <= value < numTokens`, or an error will be thrown.
 *
 * outputMode: Specification for the output of the layer.
 *   Defaults to `"multiHot"`. Values can be `"oneHot"`, `"multiHot"` or
 *   `"count"`, configuring the layer as follows:
 *
 *     oneHot: Encodes each individual element in the input into an
 *       array of `numTokens` size, containing a 1 at the element index. If
 *       the last dimension is size 1, will encode on that dimension. If the
 *       last dimension is not size 1, will append a new dimension for the
 *       encoded output.
 *
 *     multiHot: Encodes each sample in the input into a single array
 *       of `numTokens` size, containing a 1 for each vocabulary term
 *       present in the sample. Treats the last dimension as the sample
 *       dimension: if the input shape is `(..., sampleLength)`, the output
 *       shape will be `(..., numTokens)`.
 *
 *     count: Like "multiHot", but the int array contains a count of
 *       the number of times the token at that index appeared in the sample.
 *
 * For all output modes, currently only output up to rank 2 is supported.
 *
 * @doc {heading: 'Layers', subheading: 'CategoryEncoding', namespace: 'layers'}
 */
export function categoryEncoding(args: CategoryEncodingArgs) {
  const layer = new CategoryEncoding(args);
  return layer;
}
114 changes: 114 additions & 0 deletions tfjs-layers/src/layers/preprocessing/category_encoding.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/**
* @license
* Copyright 2022 CodeSmith LLC
*
* Use of this source code is governed by an MIT-style
* license that can be found in the LICENSE file or at
* https://opensource.org/licenses/MIT.
* =============================================================================
*/

import { LayerArgs, Layer } from '../../engine/topology';
import { serialization, Tensor, tidy, Tensor1D, Tensor2D} from '@tensorflow/tfjs-core';
import { greater, greaterEqual, max, min} from '@tensorflow/tfjs-core';
import { Shape } from '../../keras_format/common';
import { getExactlyOneShape, getExactlyOneTensor } from '../../utils/types_utils';
import { Kwargs } from '../../types';
import { ValueError } from '../../errors';
import * as K from '../../backend/tfjs_backend';
import * as utils from './preprocessing_utils';

/**
 * Constructor arguments for the `CategoryEncoding` layer, in addition to the
 * base `LayerArgs`.
 */
export declare interface CategoryEncodingArgs extends LayerArgs {
/** Number of tokens the layer supports; used as the encoding depth. */
numTokens: number;
/**
 * Encoding mode. When omitted (or falsy) the layer falls back to
 * `utils.multiHot`.
 */
outputMode?: string;
}

/**
 * Preprocessing layer that turns integer category indices into a
 * `numTokens`-wide encoding. The encoding itself is delegated to
 * `utils.encodeCategoricalInputs`; this class validates the inputs and
 * reports the resulting shape.
 */
export class CategoryEncoding extends Layer {
  /** @nocollapse */
  static className = 'CategoryEncoding';
  private readonly numTokens: number;
  private readonly outputMode: string;

  /**
   * @param args Layer configuration. `outputMode` falls back to
   *     `utils.multiHot` when it is not provided.
   */
  constructor(args: CategoryEncodingArgs) {
    super(args);
    this.numTokens = args.numTokens;

    if(args.outputMode) {
      this.outputMode = args.outputMode;
    } else {
      this.outputMode = utils.multiHot;
    }
  }

  /**
   * Returns the layer configuration: `numTokens` and `outputMode` merged with
   * the base layer config (base entries win on key collisions).
   */
  getConfig(): serialization.ConfigDict {
    const config: serialization.ConfigDict = {
      'numTokens': this.numTokens,
      'outputMode': this.outputMode,
    };

    const baseConfig = super.getConfig();
    Object.assign(config, baseConfig);
    return config;
  }

  /**
   * Computes the output shape for a given input shape.
   *
   * - No shape, or a rank-0 shape: `[numTokens]`.
   * - oneHot mode with a last dimension other than 1: the input shape with
   *   `numTokens` appended.
   * - Otherwise: the input shape with its last dimension replaced by
   *   `numTokens`.
   *
   * The shape passed in is never mutated; a new array is returned.
   */
  computeOutputShape(inputShape: Shape|Shape[]): Shape|Shape[] {
    inputShape = getExactlyOneShape(inputShape);

    if(inputShape == null || inputShape.length === 0) {
      return [this.numTokens];
    }

    // JavaScript arrays do not support negative indices (`shape[-1]` is
    // always `undefined`), so the last axis is addressed explicitly.
    const lastAxis = inputShape.length - 1;
    const outputShape = inputShape.slice();

    if(this.outputMode === utils.oneHot && inputShape[lastAxis] !== 1) {
      outputShape.push(this.numTokens);
      return outputShape;
    }

    outputShape[lastAxis] = this.numTokens;
    return outputShape;
  }

  /**
   * Encodes `inputs` according to `outputMode`.
   *
   * @param inputs Exactly one tensor of category indices; cast to int32 if it
   *     has another dtype.
   * @param kwargs May carry `countWeights`, which is only accepted when
   *     `outputMode` is `utils.count`. Only rank-1 and rank-2 weights are
   *     forwarded to the encoder; other ranks are ignored.
   * @returns The result of `utils.encodeCategoricalInputs`.
   * @throws ValueError if `countWeights` is supplied for a non-count mode, or
   *     if any input value lies outside `0 <= value < numTokens`.
   */
  call(inputs: Tensor|Tensor[], kwargs: Kwargs): Tensor[]|Tensor {
    return tidy(() => {

      inputs = getExactlyOneTensor(inputs);
      if(inputs.dtype !== 'int32') {
        inputs = K.cast(inputs, 'int32');
      }

      let countWeights: Tensor1D|Tensor2D;

      if((typeof kwargs['countWeights']) !== 'undefined') {

        if(this.outputMode !== utils.count) {
          throw new ValueError(
            `countWeights is not used when outputMode !== count.
Received countWeights=${kwargs['countWeights']}`);
        }
        const countWeightsRanked = getExactlyOneTensor(kwargs['countWeights']);

        if(countWeightsRanked.rank === 1) {
          countWeights = countWeightsRanked as Tensor1D;
        } else if(countWeightsRanked.rank === 2) {
          countWeights = countWeightsRanked as Tensor2D;
        }
      }

      const depth = this.numTokens;
      const maxValue = max(inputs);
      const minValue = min(inputs);

      // Synchronously read back the two scalar comparisons:
      // max(inputs) < depth and min(inputs) >= 0.
      const maxBelowDepth = greater(depth, maxValue).bufferSync().get(0);
      const minNonNegative = greaterEqual(minValue, 0).bufferSync().get(0);

      if(!(maxBelowDepth && minNonNegative)) {
        // NOTE: the enforced range is 0 <= value < numTokens. The message
        // text is kept unchanged because callers and tests match on it.
        throw new ValueError(
          `Input values must be between 0 < values <= numTokens`);
      }

      return utils.encodeCategoricalInputs(inputs,
        this.outputMode, depth, countWeights);
    });
  }
}

serialization.registerClass(CategoryEncoding);
141 changes: 141 additions & 0 deletions tfjs-layers/src/layers/preprocessing/category_encoding_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import { describeMathCPUAndGPU, expectTensorsClose} from '../../utils/test_utils';
import { Tensor, tensor} from '@tensorflow/tfjs-core';
import { CategoryEncoding } from './category_encoding';
import * as utils from './preprocessing_utils';

// Output, shape and validation tests for the CategoryEncoding layer, covering
// the count, oneHot and multiHot output modes for inputs of rank 0, 1 and 2.
describeMathCPUAndGPU('Layer Output', () => {

  it('Calculates correct output for Count outputMode rank 0', () => {
    const categoryData = tensor(0);
    const expectedOutput = tensor([1,0,0,0]);
    const numTokens = 4;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.count});
    const computedOutput = encodingLayer.
      apply(categoryData) as Tensor;

    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output for Count outputMode rank 1 (weights)', () => {
    const categoryData = tensor([1, 2, 3, 3, 0]);
    const weightData = tensor([1, 2, 3, 1, 7]);
    const numTokens = 6;
    // Each weight is accumulated into the slot of its category index,
    // e.g. index 3 appears twice with weights 3 and 1 -> 4.
    const expectedOutput = tensor([7, 1, 2, 4, 0, 0]);
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.count});

    const computedOutput = encodingLayer.apply(categoryData,
      {countWeights: weightData}) as Tensor;

    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output for Count outputMode rank 2', () => {
    const categoryData = tensor([[1, 2, 3, 1], [0, 3, 1, 0]]);
    const expectedOutput = tensor([[0, 2, 1, 1, 0, 0], [2, 1, 0, 1, 0, 0]]);
    const numTokens = 6;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.count});
    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output for oneHot outputMode rank 0', () => {
    const categoryData = tensor(3);
    const expectedOutput = tensor([0, 0, 0, 1]);
    const numTokens = 4;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.oneHot});
    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output and shape for oneHot outputMode rank 1', () => {
    const categoryData = tensor([3, 2, 0, 1]);
    const expectedOutput = tensor([[0, 0, 0, 1],
      [0, 0, 1, 0],
      [1, 0, 0, 0],
      [0, 1, 0, 0]]);
    const numTokens = 4;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.oneHot});
    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output and shape for oneHot outputMode rank 2', () => {
    const categoryData = tensor([[3], [2], [0], [1]]);
    const expectedOutput = tensor([[0, 0, 0, 1],
      [0, 0, 1, 0],
      [1, 0, 0, 0],
      [0, 1, 0, 0]]);
    const numTokens = 4;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.oneHot});
    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output for multiHot outputMode rank 0', () => {
    const categoryData = tensor(3);
    const expectedOutput = tensor([0, 0, 0, 1, 0, 0]);
    const numTokens = 6;
    // Must use multiHot here: this spec previously built the layer with
    // oneHot, leaving the multiHot rank-0 path untested.
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.multiHot});
    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output for multiHot outputMode rank 1', () => {
    const categoryData = tensor([3, 2, 0, 1]);
    const expectedOutput = tensor([1, 1, 1, 1, 0, 0]);
    const numTokens = 6;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.multiHot});
    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Calculates correct output for multiHot outputMode rank 2', () => {
    const categoryData = tensor([[0, 1], [0, 0], [1, 2], [3, 1]]);
    const expectedOutput = tensor([[1, 1, 0, 0],
      [1, 0, 0, 0],
      [0, 1, 1, 0],
      [0, 1, 0, 1]]);
    const numTokens = 4;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.multiHot});
    const computedOutput = encodingLayer.apply(categoryData) as Tensor;
    expectTensorsClose(computedOutput, expectedOutput);
  });

  it('Raises Value Error if input Tensor has Rank > 2', () =>{
    const categoryData = tensor([[[1], [2]], [[3], [4]]]);
    const numTokens = 6;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.multiHot});
    expect(() => encodingLayer.apply(categoryData))
      .toThrowError(`When outputMode is not 'int', maximum output rank is 2
Received outputMode ${utils.multiHot} and input shape ${categoryData.shape}
which would result in output rank ${categoryData.rank}.`);
  });

  it('Raises Value Error if max input value !<= numTokens', () => {
    const categoryData = tensor([7, 2, 0, 1]);
    const numTokens = 3;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.multiHot});
    expect(() => encodingLayer.apply(categoryData))
      .toThrowError(`Input values must be between 0 < values <= numTokens`);
  });

  it('Raises Value Error if min input value < 0', () => {
    // Every value is below numTokens, so only the lower-bound check can
    // trigger the error.
    const categoryData = tensor([2, 1, -1, 1]);
    const numTokens = 3;
    const encodingLayer = new CategoryEncoding({numTokens,
      outputMode: utils.multiHot});
    expect(() => encodingLayer.apply(categoryData))
      .toThrowError(`Input values must be between 0 < values <= numTokens`);
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export declare interface RescalingArgs extends LayerArgs {
*
* This rescales images by a scaling and offset factor
*/

export class Rescaling extends Layer {
/** @nocollapse */
static className = 'Rescaling';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,16 @@ describeMathCPUAndGPU('Rescaling Layer', () => {
const expectedOutputTensor = add(mul(intTensor, scale), offset);
const scalingLayer = new Rescaling({scale, offset});
const outputTensor = scalingLayer.apply(intTensor) as Tensor;
expect(outputTensor.dtype).toBe('float32');
expectTensorsClose(outputTensor, expectedOutputTensor);
expect(outputTensor.dtype).toBe('float32');
expectTensorsClose(outputTensor, expectedOutputTensor);
});

it('Config holds correct name', () => {
const scale = 1.0 / 127.5;
const offset = -1.0;
const scalingLayer = new Rescaling({scale, offset, name: 'Rescaling'});
const config = scalingLayer.getConfig();
expect(config.name).toEqual('Rescaling');
expect(config.name).toEqual('Rescaling');
});

});

0 comments on commit 9f29a29

Please sign in to comment.