From 69b0872fd25c2f4a796c5d4b9b0f2d4aa890af0d Mon Sep 17 00:00:00 2001 From: FoxxMD Date: Thu, 26 Oct 2023 15:24:54 -0400 Subject: [PATCH] docs: Add new features and fix import usage --- README.md | 39 ++++++++++++++++++++--- dist/commonjs/atomic.d.ts | 12 +++++++ dist/commonjs/index.d.ts | 1 + dist/commonjs/index.js | 40 ++++++++++++++++++++++-- dist/commonjs/index.js.map | 2 +- dist/commonjs/normalization/index.js | 4 +-- dist/commonjs/normalization/index.js.map | 2 +- dist/esm/atomic.d.ts | 12 +++++++ dist/esm/index.d.ts | 1 + dist/esm/index.js | 37 ++++++++++++++++++++-- dist/esm/index.js.map | 2 +- dist/esm/normalization/index.js | 4 +-- dist/esm/normalization/index.js.map | 2 +- 13 files changed, 140 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index e5c20db..462fc5c 100644 --- a/README.md +++ b/README.md @@ -51,12 +51,14 @@ Pass a list of `ComparisonStrategy` objects using `{strategies: []}` to define w The average of the scores from all passed strategies is returned as `highScore` (and `highScoreWeighted`) from `stringSameness()` -When no strategies are explicitly passed a default set of strategies is used, found in `@foxxmd/string-sameness/strategies`: +When no strategies are explicitly passed a default set of strategies is used, which can be imported with `import {defaultStrategies} from '@foxxmd/string-sameness';`: * [Dice's Coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) in [`diceSimilarities.ts`](/src/matchingStrategies/diceSimilarity.ts) * [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) in [`cosineSimilarities.ts`](/src/matchingStrategies/cosineSimilarity.ts) * [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) in [`levenSimilarities.ts`](/src/matchingStrategies/levenSimilarity.ts) +Strategies can be accessed individually using `import {strategies} from '@foxxmd/string-sameness';` + ### Bring Your Own Strategy Use your own strategy by creating an object that conforms to 
`ComparisonStrategy`: @@ -107,12 +109,14 @@ const result = stringSameness('This is one sentence', 'This is another sentence' Pass a list of functions using `{transforms: []}` to transform the strings before comparison. When not explicitly provided a default set of functions is applied to normalize the strings (to remove trivial differences): +* normalize unicode (e.g. convert Ö => O) * convert to lowercase * trim (remove whitespace at beginning/end) * remove non-alphanumeric characters (punctuation and newlines) * replace any instances of 2 or more consecutive whitespace with 1 whitespace -This default set of functions is exported as `defaultStrCompareTransformFuncs`. +* The default set of transformer functions can be imported with `import {strDefaultTransforms} from '@foxxmd/string-sameness';` +* All built-in transformers can be imported with `import {transforms} from '@foxxmd/string-sameness';` Example of supplying your own transform functions: @@ -128,17 +132,42 @@ const myFuncs = [ const result = stringSameness('This is one sentence', 'This is another sentence', {transforms: myFuncs}); ``` +## Token Re-ordering + +If token (word) order in the strings is not important, you can choose to have string-sameness attempt to re-order all words before comparing sameness. This makes comparison scores much closer to "absolute sameness in all characters within string". 
EX: + +* `this is correct order` +* `order correct this is` + +Scores 60 **without** reordering + +Scores 100 **with** reordering + +Behavior caveats: + +* The **second** string argument is reordered to match the **first** string argument +* If the second string is longer than the first, then any non-matched words are concatenated to the end of the re-ordered string in the same order they were found + +To use: + +```js +import {stringSameness} from '@foxxmd/string-sameness'; + +const res = stringSameness(strA, strB, {reorder: true}); +``` + ## Factory For convenience, a factory function is also provided: ```ts -import {createStringSameness} from "@foxxmd/string-sameness"; -import {levenStrategy} from "@foxxmd/string-sameness/strategies"; +import {createStringSameness, strategies} from "@foxxmd/string-sameness"; import {myTransforms, myStrats} from './util'; +const {levenStrategy} = strategies; + // sets the default object to used with the third argument for `stringSameness` -const myCompare = createStringSameness({transforms: myTransforms, strategies: myStrats}); +const myCompare = createStringSameness({transforms: myTransforms, strategies: [levenStrategy, ...myStrats]}); // uses myTransforms and myStrats const plainResult = myCompare('This is one sentence', 'This is another sentence'); diff --git a/dist/commonjs/atomic.d.ts b/dist/commonjs/atomic.d.ts index deb10e9..4586faa 100644 --- a/dist/commonjs/atomic.d.ts +++ b/dist/commonjs/atomic.d.ts @@ -1,6 +1,18 @@ export interface StringComparisonOptions { + /** + * An array of transformations to apply to each string before comparing similarity + * */ transforms?: StringTransformFunc[]; + /** + * An array of strategies used to score similarity. All strategies scores are combined for an average high score. 
+ * */ strategies?: ComparisonStrategy[]; + /** + * Reorder second string so its token match order of first string as closely as possible + * + * Useful when only the differences in content are important, but not the order of the content + * */ + reorder?: boolean; } export interface StringSamenessResult { strategies: { diff --git a/dist/commonjs/index.d.ts b/dist/commonjs/index.d.ts index a9ac7c2..480e7c1 100644 --- a/dist/commonjs/index.d.ts +++ b/dist/commonjs/index.d.ts @@ -2,6 +2,7 @@ import { ComparisonStrategyResult, StringComparisonOptions, StringSamenessResult import { strDefaultTransforms, transforms } from "./normalization/index.js"; declare const defaultStrategies: import("./atomic.js").ComparisonStrategy[]; declare const stringSameness: (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; +export declare const reorderStr: (cleanA: string, cleanB: string, options?: StringComparisonOptions) => string; declare const createStringSameness: (defaults: StringComparisonOptions) => (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; declare const strategies: { diceStrategy: import("./atomic.js").ComparisonStrategy; diff --git a/dist/commonjs/index.js b/dist/commonjs/index.js index bcaffef..67ad753 100644 --- a/dist/commonjs/index.js +++ b/dist/commonjs/index.js @@ -1,6 +1,6 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -exports.strDefaultTransforms = exports.defaultStrCompareTransformFuncs = exports.transforms = exports.strategies = exports.defaultStrategies = exports.createStringSameness = exports.stringSameness = void 0; +exports.strDefaultTransforms = exports.defaultStrCompareTransformFuncs = exports.transforms = exports.strategies = exports.defaultStrategies = exports.createStringSameness = exports.stringSameness = exports.reorderStr = void 0; const index_js_1 = require("./matchingStrategies/index.js"); const index_js_2 = 
require("./normalization/index.js"); Object.defineProperty(exports, "strDefaultTransforms", { enumerable: true, get: function () { return index_js_2.strDefaultTransforms; } }); @@ -17,10 +17,16 @@ const defaultStrategies = [ ]; exports.defaultStrategies = defaultStrategies; const stringSameness = (valA, valB, options) => { - const { transforms = index_js_2.strDefaultTransforms, strategies = defaultStrategies, } = options || {}; + const { transforms = index_js_2.strDefaultTransforms, strategies = defaultStrategies, reorder = false, } = options || {}; const cleanA = transforms.reduce((acc, curr) => curr(acc), valA); - const cleanB = transforms.reduce((acc, curr) => curr(acc), valB); + let cleanB = transforms.reduce((acc, curr) => curr(acc), valB); const shortest = cleanA.length > cleanB.length ? cleanB : cleanA; + if (reorder) { + // we want to ignore order of tokens as much as possible (user does not care about differences in word order, just absolute differences in characters overall) + // so we will reorder cleanB so its tokens match the order or tokens in cleanA as closely as possible + // before we run strategies + cleanB = (0, exports.reorderStr)(cleanA, cleanB); + } const stratResults = []; for (const strat of strategies) { if (strat.isValid !== undefined && !strat.isValid(cleanA, cleanB)) { @@ -54,6 +60,34 @@ const stringSameness = (valA, valB, options) => { }; }; exports.stringSameness = stringSameness; +const reorderStr = (cleanA, cleanB, options) => { + // to do the reordering we will use stringSameness with the provided strats to match against each token in cleanA and choose the closest token in cleanB + // and add the end concat any remaining tokens from cleanB to the reordered string + const aTokens = cleanA.split(' '); + const bTokens = cleanB.split(' '); + const orderedCandidateTokens = aTokens.reduce((acc, curr) => { + let highScore = 0; + let highIndex = 0; + let index = 0; + for (const token of acc.remaining) { + const result = stringSameness(curr, 
token, { ...options, reorder: false }); + if (result.highScore > highScore) { + highScore = result.highScore; + highIndex = index; + } + index++; + } + const splicedRemaining = [...acc.remaining]; + if (highIndex <= splicedRemaining.length - 1) { + splicedRemaining.splice(highIndex, 1); + } + const ordered = highIndex <= acc.remaining.length - 1 ? acc.ordered.concat(acc.remaining[highIndex]) : acc.ordered; + return { ordered: ordered, remaining: splicedRemaining }; + }, { ordered: [], remaining: bTokens }); + const allOrderedCandidateTokens = orderedCandidateTokens.ordered.concat(orderedCandidateTokens.remaining); + return allOrderedCandidateTokens.join(' '); +}; +exports.reorderStr = reorderStr; const createStringSameness = (defaults) => { return (valA, valB, options = {}) => stringSameness(valA, valB, { ...defaults, ...options }); }; diff --git a/dist/commonjs/index.js.map b/dist/commonjs/index.js.map index c217079..e75075e 100644 --- a/dist/commonjs/index.js.map +++ b/dist/commonjs/index.js.map @@ -1 +1 @@ 
-{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;AAAA,4DAAoH;AAQpH,uDAA0E;AAqFtE,qGArFI,+BAAoB,OAqFJ;AAFpB,2FAnF0B,qBAAU,OAmF1B;AAjFd,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,uBAAY;IACZ,wBAAa;IACb,yBAAc;CACjB,CAAA;AAqEG,8CAAiB;AAnErB,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,+BAAoB,EACjC,UAAU,GAAG,iBAAiB,GACjC,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAEjE,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAA
C,CAAA;AAmBG,wCAAc;AAjBlB,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAgBG,oDAAoB;AAdxB,MAAM,UAAU,GAAG;IACf,YAAY,EAAZ,uBAAY;IACZ,aAAa,EAAb,wBAAa;IACb,cAAc,EAAd,yBAAc;IACd,wBAAwB,EAAxB,mCAAwB;CAC3B,CAAC;AAWE,gCAAU;AATd,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,+BAAoB,CAAC;AAUzD,0EAA+B"} \ No newline at end of file +{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;AAAA,4DAAoH;AAQpH,uDAA0E;AA2HtE,qGA3HI,+BAAoB,OA2HJ;AAFpB,2FAzH0B,qBAAU,OAyH1B;AAvHd,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,uBAAY;IACZ,wBAAa;IACb,yBAAc;CACjB,CAAA;AA2GG,8CAAiB;AAzGrB,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,+BAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,OAAO,GAAG,KAAK,GAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAE/D,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,IAAI,OAAO,EAAE;QACT,8JAA8J;QAC9J,qGAAqG;QACrG,2BAA2B;QAC3B,MAAM,GAAG,IAAA,kBAAU,EAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KACvC;IAED,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAA
M,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAAC,CAAA;AAiDG,wCAAc;AA/CX,MAAM,UAAU,GAAG,CAAC,MAAc,EAAE,MAAc,EAAE,OAAiC,EAAU,EAAE;IACpG,wJAAwJ;IACxJ,kFAAkF;IAClF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,sBAAsB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAA+C,EAAE,IAAI,EAAE,EAAE;QACpG,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,SAAS,GAAuB,CAAC,CAAC;QACtC,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,SAAS,EAAE;YAC/B,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,EAAE,EAAC,GAAG,OAAO,EAAE,OAAO,EAAE,KAAK,EAAC,CAAC,CAAC;YACzE,IAAI,MAAM,CAAC,SAAS,GAAG,SAAS,EAAE;gBAC9B,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;gBAC7B,SAAS,GAAG,KAAK,CAAC;aACrB;YACD,KAAK,EAAE,CAAC;SACX;QAED,MAAM,gBAAgB,GAAG,CAAC,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC;QAC5C,IAAG,SAAS,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE;YACzC,gBAAgB,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;SACzC;QACD,MAAM,OAAO,GAAG,SAAS,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC;QAEnH,OAAO,EAAC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,gBAAgB,EAAC,CAAC;IAC3D,CAAC,EAAE,EAAC,OAAO,EAAE,EAAE,EAAE,SAAS,EAAE,OAAO,EAAC,CAAC,CAAC;IACtC,MAAM,yBAAyB,GAAG,sBAAsB,CAAC,OAAO,CAAC,MAAM,CAAC,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAC1G,OAAO,yBAAyB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC,CAAA;AA5BY,QAAA,UAAU,cA4BtB;AAED,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,E
AAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAgBG,oDAAoB;AAdxB,MAAM,UAAU,GAAG;IACf,YAAY,EAAZ,uBAAY;IACZ,aAAa,EAAb,wBAAa;IACb,cAAc,EAAd,yBAAc;IACd,wBAAwB,EAAxB,mCAAwB;CAC3B,CAAC;AAWE,gCAAU;AATd,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,+BAAoB,CAAC;AAUzD,0EAA+B"} \ No newline at end of file diff --git a/dist/commonjs/normalization/index.js b/dist/commonjs/normalization/index.js index 73080ed..cd77c4a 100644 --- a/dist/commonjs/normalization/index.js +++ b/dist/commonjs/normalization/index.js @@ -3,7 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true }); exports.strDefaultTransforms = exports.transforms = exports.replaceMultiWhitespace = exports.removeWhitespace = exports.removePunctuation = exports.replaceUnicode = exports.trim = exports.lowercase = void 0; const PUNCTUATION_REGEX = new RegExp(/[^\w\s]|_/g); const WHITESPACE_REGEX = new RegExp(/\s/g); -const MULTI_WHITESPACE_REGEX = new RegExp(/\s{2,}|\n/g); +const MULTI_WHITESPACE_REGEX = new RegExp(/\s{2,}/g); const lowercase = (str) => str.toLocaleLowerCase(); exports.lowercase = lowercase; const trim = (str) => str.trim(); @@ -14,7 +14,7 @@ const removePunctuation = (str) => str.replace(PUNCTUATION_REGEX, ''); exports.removePunctuation = removePunctuation; const removeWhitespace = (str) => str.replace(WHITESPACE_REGEX, ''); exports.removeWhitespace = removeWhitespace; -const replaceMultiWhitespace = (str) => str.replace(MULTI_WHITESPACE_REGEX, ''); +const replaceMultiWhitespace = (str) => str.replace(MULTI_WHITESPACE_REGEX, ' '); exports.replaceMultiWhitespace = replaceMultiWhitespace; const transforms = { lowercase, diff --git a/dist/commonjs/normalization/index.js.map b/dist/commonjs/normalization/index.js.map index 84f643e..f19e49f 100644 --- a/dist/commonjs/normalization/index.js.map +++ b/dist/commonjs/normalization/index.js.map @@ -1 +1 @@ 
-{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":";;;AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACnD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AAExD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AAyB5E,8BAAS;AAxBb,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AAyB1D,oBAAI;AAxBR,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AAyB9G,wCAAc;AAxBlB,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAyB/F,8CAAiB;AAxBrB,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AAyB7F,4CAAgB;AAxBpB,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,EAAE,CAAC,CAAC;AAyBzG,wDAAsB;AAvB1B,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAiBG,gCAAU;AAfd,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAUE,oDAAoB"} \ No newline at end of file 
+{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":";;;AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACnD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,SAAS,CAAC,CAAC;AAErD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AAyB5E,8BAAS;AAxBb,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AAyB1D,oBAAI;AAxBR,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AAyB9G,wCAAc;AAxBlB,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAyB/F,8CAAiB;AAxBrB,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AAyB7F,4CAAgB;AAxBpB,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,GAAG,CAAC,CAAC;AAyB1G,wDAAsB;AAvB1B,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAiBG,gCAAU;AAfd,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAUE,oDAAoB"} \ No newline at end of file diff --git a/dist/esm/atomic.d.ts b/dist/esm/atomic.d.ts index deb10e9..4586faa 100644 --- a/dist/esm/atomic.d.ts +++ b/dist/esm/atomic.d.ts @@ -1,6 +1,18 @@ export interface StringComparisonOptions { + /** + * An array of transformations to apply to each string before comparing similarity + * */ transforms?: StringTransformFunc[]; + /** + * An array of strategies used to score similarity. All strategies scores are combined for an average high score. 
+ * */ strategies?: ComparisonStrategy[]; + /** + * Reorder second string so its token match order of first string as closely as possible + * + * Useful when only the differences in content are important, but not the order of the content + * */ + reorder?: boolean; } export interface StringSamenessResult { strategies: { diff --git a/dist/esm/index.d.ts b/dist/esm/index.d.ts index a9ac7c2..480e7c1 100644 --- a/dist/esm/index.d.ts +++ b/dist/esm/index.d.ts @@ -2,6 +2,7 @@ import { ComparisonStrategyResult, StringComparisonOptions, StringSamenessResult import { strDefaultTransforms, transforms } from "./normalization/index.js"; declare const defaultStrategies: import("./atomic.js").ComparisonStrategy[]; declare const stringSameness: (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; +export declare const reorderStr: (cleanA: string, cleanB: string, options?: StringComparisonOptions) => string; declare const createStringSameness: (defaults: StringComparisonOptions) => (valA: string, valB: string, options?: StringComparisonOptions) => StringSamenessResult; declare const strategies: { diceStrategy: import("./atomic.js").ComparisonStrategy; diff --git a/dist/esm/index.js b/dist/esm/index.js index 9740d73..65ee2e7 100644 --- a/dist/esm/index.js +++ b/dist/esm/index.js @@ -11,10 +11,16 @@ const defaultStrategies = [ cosineStrategy ]; const stringSameness = (valA, valB, options) => { - const { transforms = strDefaultTransforms, strategies = defaultStrategies, } = options || {}; + const { transforms = strDefaultTransforms, strategies = defaultStrategies, reorder = false, } = options || {}; const cleanA = transforms.reduce((acc, curr) => curr(acc), valA); - const cleanB = transforms.reduce((acc, curr) => curr(acc), valB); + let cleanB = transforms.reduce((acc, curr) => curr(acc), valB); const shortest = cleanA.length > cleanB.length ? 
cleanB : cleanA; + if (reorder) { + // we want to ignore order of tokens as much as possible (user does not care about differences in word order, just absolute differences in characters overall) + // so we will reorder cleanB so its tokens match the order or tokens in cleanA as closely as possible + // before we run strategies + cleanB = reorderStr(cleanA, cleanB); + } const stratResults = []; for (const strat of strategies) { if (strat.isValid !== undefined && !strat.isValid(cleanA, cleanB)) { @@ -47,6 +53,33 @@ const stringSameness = (valA, valB, options) => { highScoreWeighted, }; }; +export const reorderStr = (cleanA, cleanB, options) => { + // to do the reordering we will use stringSameness with the provided strats to match against each token in cleanA and choose the closest token in cleanB + // and add the end concat any remaining tokens from cleanB to the reordered string + const aTokens = cleanA.split(' '); + const bTokens = cleanB.split(' '); + const orderedCandidateTokens = aTokens.reduce((acc, curr) => { + let highScore = 0; + let highIndex = 0; + let index = 0; + for (const token of acc.remaining) { + const result = stringSameness(curr, token, { ...options, reorder: false }); + if (result.highScore > highScore) { + highScore = result.highScore; + highIndex = index; + } + index++; + } + const splicedRemaining = [...acc.remaining]; + if (highIndex <= splicedRemaining.length - 1) { + splicedRemaining.splice(highIndex, 1); + } + const ordered = highIndex <= acc.remaining.length - 1 ? 
acc.ordered.concat(acc.remaining[highIndex]) : acc.ordered; + return { ordered: ordered, remaining: splicedRemaining }; + }, { ordered: [], remaining: bTokens }); + const allOrderedCandidateTokens = orderedCandidateTokens.ordered.concat(orderedCandidateTokens.remaining); + return allOrderedCandidateTokens.join(' '); +}; const createStringSameness = (defaults) => { return (valA, valB, options = {}) => stringSameness(valA, valB, { ...defaults, ...options }); }; diff --git a/dist/esm/index.js.map b/dist/esm/index.js.map index c57cb68..eabec69 100644 --- a/dist/esm/index.js.map +++ b/dist/esm/index.js.map @@ -1 +1 @@ -{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAE,aAAa,EAAE,YAAY,EAAE,wBAAwB,EAAC,MAAM,+BAA+B,CAAC;AAQpH,OAAO,EAAC,oBAAoB,EAAE,UAAU,EAAC,MAAM,0BAA0B,CAAC;AAE1E,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,YAAY;IACZ,aAAa;IACb,cAAc;CACjB,CAAA;AAED,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,oBAAoB,EACjC,UAAU,GAAG,iBAAiB,GACjC,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAEjE,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;I
AED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAAC,CAAA;AAED,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAED,MAAM,UAAU,GAAG;IACf,YAAY;IACZ,aAAa;IACb,cAAc;IACd,wBAAwB;CAC3B,CAAC;AAEF,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,oBAAoB,CAAC;AAE7D,OAAO,EAGH,cAAc,EACd,oBAAoB,EACpB,iBAAiB,EACjB,UAAU,EACV,UAAU,EACV,+BAA+B,EAC/B,oBAAoB,EAGvB,CAAA"} \ No newline at end of file 
+{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,cAAc,EAAE,aAAa,EAAE,YAAY,EAAE,wBAAwB,EAAC,MAAM,+BAA+B,CAAC;AAQpH,OAAO,EAAC,oBAAoB,EAAE,UAAU,EAAC,MAAM,0BAA0B,CAAC;AAE1E,MAAM,oBAAoB,GAAG,CAAC,MAAc,EAAE,EAAE;IAC5C,oBAAoB;IACpB,4BAA4B;IAC5B,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;AACzC,CAAC,CAAA;AAED,MAAM,iBAAiB,GAAG;IACtB,YAAY;IACZ,aAAa;IACb,cAAc;CACjB,CAAA;AAED,MAAM,cAAc,GAAG,CAAC,IAAY,EAAE,IAAY,EAAE,OAAiC,EAAwB,EAAE;IAE3G,MAAM,EACF,UAAU,GAAG,oBAAoB,EACjC,UAAU,GAAG,iBAAiB,EAC9B,OAAO,GAAG,KAAK,GAClB,GAAG,OAAO,IAAI,EAAE,CAAC;IAElB,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IACjE,IAAI,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,IAAI,CAAC,CAAC;IAE/D,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;IAEjE,IAAI,OAAO,EAAE;QACT,8JAA8J;QAC9J,qGAAqG;QACrG,2BAA2B;QAC3B,MAAM,GAAG,UAAU,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KACvC;IAED,MAAM,YAAY,GAA0C,EAAE,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE;QAC5B,IAAI,KAAK,CAAC,OAAO,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE;YAC/D,SAAS;SACZ;QACD,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3C,MAAM,MAAM,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAC,KAAK,EAAE,GAAG,EAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC5D,YAAY,CAAC,IAAI,CAAC;YACd,GAAG,MAAM;YACT,IAAI,EAAE,KAAK,CAAC,IAAI;SACnB,CAAC,CAAC;KACN;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,oBAAoB,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IAE1D,qBAAqB;IACrB,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC;IAChG,kCAAkC;IAClC,MAAM,iBAAiB,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;IAChE,MAAM,QAAQ,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,GAAgD,EAAE,IAAI,EAAE,EAAE;QAC5F,MAAM,EAAC,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAC,GAAG,IAAI,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC,IAAI,CA
AC,GAAG;YACb,GAAG,IAAI;YACP,KAAK;SACR,CAAC;QACF,OAAO,GAAG,CAAC;IACf,CAAC,EAAE,EAAE,CAAC,CAAC;IACP,OAAO;QACH,UAAU,EAAE,QAAQ;QACpB,SAAS;QACT,iBAAiB;KACpB,CAAA;AACL,CAAC,CAAA;AAED,MAAM,CAAC,MAAM,UAAU,GAAG,CAAC,MAAc,EAAE,MAAc,EAAE,OAAiC,EAAU,EAAE;IACpG,wJAAwJ;IACxJ,kFAAkF;IAClF,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAClC,MAAM,sBAAsB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAA+C,EAAE,IAAI,EAAE,EAAE;QACpG,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,SAAS,GAAuB,CAAC,CAAC;QACtC,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,MAAM,KAAK,IAAI,GAAG,CAAC,SAAS,EAAE;YAC/B,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,KAAK,EAAE,EAAC,GAAG,OAAO,EAAE,OAAO,EAAE,KAAK,EAAC,CAAC,CAAC;YACzE,IAAI,MAAM,CAAC,SAAS,GAAG,SAAS,EAAE;gBAC9B,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;gBAC7B,SAAS,GAAG,KAAK,CAAC;aACrB;YACD,KAAK,EAAE,CAAC;SACX;QAED,MAAM,gBAAgB,GAAG,CAAC,GAAG,GAAG,CAAC,SAAS,CAAC,CAAC;QAC5C,IAAG,SAAS,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE;YACzC,gBAAgB,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC;SACzC;QACD,MAAM,OAAO,GAAG,SAAS,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC;QAEnH,OAAO,EAAC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,gBAAgB,EAAC,CAAC;IAC3D,CAAC,EAAE,EAAC,OAAO,EAAE,EAAE,EAAE,SAAS,EAAE,OAAO,EAAC,CAAC,CAAC;IACtC,MAAM,yBAAyB,GAAG,sBAAsB,CAAC,OAAO,CAAC,MAAM,CAAC,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAC1G,OAAO,yBAAyB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC/C,CAAC,CAAA;AAED,MAAM,oBAAoB,GAAG,CAAC,QAAiC,EAAE,EAAE;IAC/D,OAAO,CAAC,IAAY,EAAE,IAAY,EAAE,UAAmC,EAAE,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAC,GAAG,QAAQ,EAAE,GAAG,OAAO,EAAC,CAAC,CAAC;AACxI,CAAC,CAAA;AAED,MAAM,UAAU,GAAG;IACf,YAAY;IACZ,aAAa;IACb,cAAc;IACd,wBAAwB;CAC3B,CAAC;AAEF,4BAA4B;AAC5B,MAAM,+BAA+B,GAAG,oBAAoB,CAAC;AAE7D,OAAO,EAGH,cAAc,EACd,oBAAoB,EACpB,iBAAiB,EACjB,UAAU,EACV,UAAU,EACV,+BAA+B,EAC/B,oBAAoB,EAGvB,CAAA"} \ No newline at end of file diff --git a/dist/esm/normalization/index.js b/dist/esm/normalization/index.js index 
8029f5d..1d95387 100644 --- a/dist/esm/normalization/index.js +++ b/dist/esm/normalization/index.js @@ -1,12 +1,12 @@ const PUNCTUATION_REGEX = new RegExp(/[^\w\s]|_/g); const WHITESPACE_REGEX = new RegExp(/\s/g); -const MULTI_WHITESPACE_REGEX = new RegExp(/\s{2,}|\n/g); +const MULTI_WHITESPACE_REGEX = new RegExp(/\s{2,}/g); const lowercase = (str) => str.toLocaleLowerCase(); const trim = (str) => str.trim(); const replaceUnicode = (str) => str.normalize('NFD').replace(/[\u0300-\u036f]/g, ""); const removePunctuation = (str) => str.replace(PUNCTUATION_REGEX, ''); const removeWhitespace = (str) => str.replace(WHITESPACE_REGEX, ''); -const replaceMultiWhitespace = (str) => str.replace(MULTI_WHITESPACE_REGEX, ''); +const replaceMultiWhitespace = (str) => str.replace(MULTI_WHITESPACE_REGEX, ' '); const transforms = { lowercase, trim, diff --git a/dist/esm/normalization/index.js.map b/dist/esm/normalization/index.js.map index 68fe96f..522b8b1 100644 --- a/dist/esm/normalization/index.js.map +++ b/dist/esm/normalization/index.js.map @@ -1 +1 @@ 
-{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":"AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACnD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AAExD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AAChF,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AAC9D,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AAClH,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AACnG,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACjG,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,EAAE,CAAC,CAAC;AAE7G,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAED,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAEF,OAAO,EACH,SAAS,EACT,IAAI,EACJ,cAAc,EACd,iBAAiB,EACjB,gBAAgB,EAChB,sBAAsB,EACtB,UAAU,EACV,oBAAoB,EACvB,CAAA"} \ No newline at end of file 
+{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/normalization/index.ts"],"names":[],"mappings":"AAEA,MAAM,iBAAiB,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,CAAC;AACnD,MAAM,gBAAgB,GAAG,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;AAC3C,MAAM,sBAAsB,GAAG,IAAI,MAAM,CAAC,SAAS,CAAC,CAAC;AAErD,MAAM,SAAS,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;AAChF,MAAM,IAAI,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;AAC9D,MAAM,cAAc,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;AAClH,MAAM,iBAAiB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AACnG,MAAM,gBAAgB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;AACjG,MAAM,sBAAsB,GAAwB,CAAC,GAAW,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,sBAAsB,EAAE,GAAG,CAAC,CAAC;AAE9G,MAAM,UAAU,GAAG;IACf,SAAS;IACT,IAAI;IACJ,sBAAsB;IACtB,cAAc;IACd,gBAAgB;IAChB,iBAAiB;CACpB,CAAA;AAED,MAAM,oBAAoB,GAA0B;IAChD,cAAc;IACd,iBAAiB;IACjB,IAAI;IACJ,sBAAsB;IACtB,SAAS;CACZ,CAAC;AAEF,OAAO,EACH,SAAS,EACT,IAAI,EACJ,cAAc,EACd,iBAAiB,EACjB,gBAAgB,EAChB,sBAAsB,EACtB,UAAU,EACV,oBAAoB,EACvB,CAAA"} \ No newline at end of file