diff --git a/examples/parse-spanish-doc.js b/examples/parse-spanish-doc.js index 5a80280..7763ea9 100644 --- a/examples/parse-spanish-doc.js +++ b/examples/parse-spanish-doc.js @@ -1,17 +1,16 @@ -import CoreNLP from '../src'; +// NOTE: run with babel-node +import CoreNLP, { Properties, Pipeline } from '../src'; -// Simple (https://stanfordnlp.github.io/CoreNLP/simple.html) - -const props = new CoreNLP.Properties(); +const props = new Properties(); props.setProperty('annotators', 'tokenize,ssplit,pos,lemma,ner,parse'); -const pipeline = new CoreNLP.Pipeline(props, 'Spanish'); +const pipeline = new Pipeline(props, 'Spanish'); const doc = new CoreNLP.simple.Document('Jorge quiere cinco empanadas de queso y carne.'); pipeline.annotate(doc) - .then((doc) => { + .then(doc => { const sent = doc.sentence(0); console.log('parse', sent.parse()); - console.log(CoreNLP.util.Tree.fromSentence(sent, pipeline.getService()).dump()); + console.log(CoreNLP.util.Tree.fromSentence(sent).dump()); }) .catch(err => { console.log('err', err); diff --git a/examples/parse-spanish-sent.js b/examples/parse-spanish-sent.js index 7a113a8..d514ae2 100644 --- a/examples/parse-spanish-sent.js +++ b/examples/parse-spanish-sent.js @@ -1,16 +1,15 @@ -import CoreNLP from '../src'; +// NOTE: run with babel-node +import CoreNLP, { Properties, Pipeline } from '../src'; -// Simple (https://stanfordnlp.github.io/CoreNLP/simple.html) - -const props = new CoreNLP.Properties(); +const props = new Properties(); props.setProperty('annotators', 'tokenize,ssplit,pos,lemma,ner,parse'); -const pipeline = new CoreNLP.Pipeline(props, 'Spanish'); +const pipeline = new Pipeline(props, 'Spanish'); const sent = new CoreNLP.simple.Sentence('Jorge quiere cinco empanadas de queso y carne.'); pipeline.annotate(sent) - .then((sent) => { + .then(sent => { console.log('parse', sent.parse()); - console.log(CoreNLP.util.Tree.fromSentence(sent, pipeline.getService()).dump()); + 
console.log(CoreNLP.util.Tree.fromSentence(sent).dump()); }) .catch(err => { console.log('err', err); diff --git a/examples/parse.js b/examples/parse.js index 9c81cc4..55ee980 100644 --- a/examples/parse.js +++ b/examples/parse.js @@ -1,14 +1,13 @@ -import CoreNLP from '../src'; +// NOTE: run with babel-node +import CoreNLP, { Properties, Pipeline } from '../src'; -// Simple (https://stanfordnlp.github.io/CoreNLP/simple.html) - -const props = new CoreNLP.Properties(); +const props = new Properties(); props.setProperty('annotators', 'tokenize,ssplit,pos,lemma,ner,parse'); -const pipeline = new CoreNLP.Pipeline(props, 'English'); +const pipeline = new Pipeline(props, 'English'); const sent = new CoreNLP.simple.Sentence('The little dog runs so fast.'); pipeline.annotate(sent) - .then((sent) => { + .then(sent => { console.log('parse', sent.parse()); console.log(CoreNLP.util.Tree.fromSentence(sent).dump()); }) diff --git a/examples/regexner.js b/examples/regexner.js index 0940266..78bf840 100644 --- a/examples/regexner.js +++ b/examples/regexner.js @@ -1,31 +1,34 @@ -import CoreNLP from '../src'; +// NOTE: run with babel-node +import path from 'path'; +import CoreNLP, { Properties, Pipeline } from '../src'; // https://stanfordnlp.github.io/CoreNLP/regexner.html -const RegexNERAnnotator = new CoreNLP.simple.Annotator('regexner', { - validpospattern: `(([ner:PERSON]*) /es/ /una/ /buena/ /persona/)`, - //ignorecase: null, - //mapping: null, - //mapping.header: null, - //mapping.field.<fieldname>: null, - //commonWords: null, - //backgroundSymbol: null, - //posmatchtype: null, - //validpospattern: null, - //noDefaultOverwriteLabels: null, - verbose: true, - }, [ - CoreNLP.simple.annotator.NERClassifierCombiner, - ]); -const props = new CoreNLP.Properties(); -props.setProperty('annotators', 'tokenize,ssplit,pos,lemma,ner,parse'); -const sent = new CoreNLP.simple.Sentence('Juan Carlos es una buena persona'); -const pipeline = new CoreNLP.Pipeline(props, 'Spanish'); +const 
props = new Properties(); +props.setProperty('annotators', 'tokenize,ssplit,regexner'); +// IMPORTANT: when using ConnectorServer, this option needs to be set +// from serverProperties or per-language properties file, because otherwise gets overriden and doesn't work +// - the `regexner.tag` file is provided along with this example +props.setProperty('regexner.mapping', path.resolve('./regexner.tag')); +props.setProperty('regexner.ignorecase', true); +const sent = new CoreNLP.simple.Sentence('Me encantan las empanadas de carne picante'); +const pipeline = new Pipeline(props, 'Spanish'); pipeline.annotate(sent) - .then((sent) => { - console.log('parse', sent); + .then(sent => { + console.log('NER tags', sent.tokens().map(t => t.ner())); }) .catch(err => { console.log('err', err); }); + +/* +OUTPUT: +NER tags [ undefined, + undefined, + undefined, + undefined, + undefined, + 'INGREDIENT', + 'CONDIMENT' ] +*/ diff --git a/examples/regexner.tag b/examples/regexner.tag new file mode 100644 index 0000000..ea5aa67 --- /dev/null +++ b/examples/regexner.tag @@ -0,0 +1,69 @@ +quiero ORDER +tomates? INGREDIENT +toma tes? INGREDIENT +lechugas? INGREDIENT +pollos? INGREDIENT +pancetas? INGREDIENT +pepinos? INGREDIENT +carnes? INGREDIENT +cebollas? INGREDIENT +morrón|morrones INGREDIENT +anchoa INGREDIENT +roquefort INGREDIENT +quesos? INGREDIENT +provolone INGREDIENT +aceitunas? INGREDIENT +hielo INGREDIENT +azucar INGREDIENT +condimentos? CONDIMENT +oregano CONDIMENT +mayonesa CONDIMENT +k(e|é)tchup CONDIMENT 2.0 +mostaza CONDIMENT +barbacoa CONDIMENT +salsa golf CONDIMENT +sal CONDIMENT +pimienta CONDIMENT +vinagre CONDIMENT +aceite CONDIMENT +picante CONDIMENT +centavos? CURRENCY +menú MEAL +comida MEAL +papas? MEAL +papitas? MEAL +sánguche MEAL +hamburguesas? MEAL +ensaladas? MEAL +empanadas? MEAL +pizzas? MEAL +gaseosas? DRINK +jugos? DRINK +birras? 
DRINK +agua DRINK +pomelo DRINK +naranja DRINK +fanta DRINK +manaos DRINK +coca DRINK +coca cola DRINK +coca light DRINK +cono DESSERT +helado DESSERT +vainilla DESSERT +dulce de leche DESSERT +chocolate DESSERT +cine PLACE +deja ORDER_ADD +agrega ORDER_ADD +con ORDER_ADD +pone ORDER_ADD +ponele ORDER_ADD +pongas ORDER_ADD +saca ORDER_REMOVE +quita ORDER_REMOVE +sin ORDER_REMOVE +cambia ORDER_REPLACE +reemplaza ORDER_REPLACE +agranda ORDER_MUTATE +achica ORDER_MUTATE diff --git a/examples/semgrex.js b/examples/semgrex.js new file mode 100644 index 0000000..7c1d847 --- /dev/null +++ b/examples/semgrex.js @@ -0,0 +1,37 @@ +// NOTE: run with babel-node +import path from 'path'; +import CoreNLP, { Properties, Pipeline } from '../src'; + +const props = new Properties(); +props.setProperty('annotators', 'tokenize,ssplit,depparse,regexner'); +// IMPORTANT: when using ConnectorServer, this option needs to be set +// from serverProperties or per-language properties file, because otherwise gets overriden and doesn't work +// - the `regexner.tag` file is provided along with this example +props.setProperty('regexner.mapping', path.resolve('./regexner.tag')); +props.setProperty('regexner.ignorecase', true); +const expression = new CoreNLP.simple.Expression('Me encantan las empanadas de carne picante', '{ner:INGREDIENT}'); +const pipeline = new Pipeline(props, 'Spanish'); + +pipeline.annotateSemgrex(expression) + .then(data => { + console.log('semgrex data', JSON.stringify(data, null, '\t')); + }) + .catch(err => { + console.log('err', err); + }); + +/* +OUTPUT: +semgrex data { + "text": "Me encantan las empanadas de carne picante", + "sentences": [ + [ + { + "text": "carne", + "begin": 5, + "end": 6 + } + ] + ] +} +*/ diff --git a/scripts/corenlp-download b/scripts/corenlp-download index 7b01897..1238398 100755 --- a/scripts/corenlp-download +++ b/scripts/corenlp-download @@ -4,8 +4,8 @@ mkdir `dirname $0`/../corenlp; \ pushd `dirname $0`/../corenlp; \ -wget 
https://nlp.stanford.edu/software/stanford-spanish-corenlp-2017-06-09-models.jar; \ -wget https://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip; \ +curl -O https://nlp.stanford.edu/software/stanford-spanish-corenlp-2017-06-09-models.jar; \ +curl -O https://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip; \ unzip stanford-corenlp-full-2017-06-09.zip && rm stanford-corenlp-full-2017-06-09.zip; \ mv stanford-spanish-corenlp-2017-06-09-models.jar stanford-corenlp-full-2017-06-09; \ popd diff --git a/src/connector/connector-cli.js b/src/connector/connector-cli.js index 570f705..1b3cd3c 100644 --- a/src/connector/connector-cli.js +++ b/src/connector/connector-cli.js @@ -9,6 +9,13 @@ const config = { }; export default class ConnectorCli { + /** + * Create a ConnectorCli + * @param {Object} config + * @param {string} config.classPath - The path to the Jar files to be included + * @param {string} config.mainClass - The name of the Java class that represents the main program + * @param {string} config.props - The path to the properties file (for example, language specific) + */ constructor({ classPath = config.classPath, mainClass = config.mainClass, diff --git a/src/connector/connector-server.js b/src/connector/connector-server.js index 30b3f78..1663f8f 100644 --- a/src/connector/connector-server.js +++ b/src/connector/connector-server.js @@ -5,14 +5,26 @@ const config = { }; export default class ConnectorServer { + /** + * Create a ConnectorServer + * @param {Object} config + * @param {string} config.dsn - The StanfordCoreNLPServer dsn (example: 'http://localhost:9000') + */ constructor({ dsn = config.dsn }) { this.dsn = dsn; } /** + * @param {Object} config + * @param {Array.<string>} config.annotators - The list of annotators that defines the pipeline + * @param {string} config.text - The text to run the pipeline against + * @param {Object} config.options - Additional options (properties) for the pipeline + * @param {string} config.language 
- Language full name in CamelCase (eg. Spanish) + * @param {(''|'tokensregex'|'semgrex'|'tregex')} [utility] - Name of the utility to use + * NOTE: most of the utilities receive properties, these should be passed via the options param * @returns {Promise.<Object>} */ - get({ annotators, text, options, language }) { + get({ annotators, text, options, language, utility = '' }) { const properties = { annotators: annotators.join(), ...options, @@ -23,15 +35,21 @@ export default class ConnectorServer { let queryString = `pipelineLanguage=${language}&properties=${JSON.stringify(properties)}`; /** - * @todo - * Refactor this different case as a strategy not dependant on the connector necessarily. - * The conenctor should support extensibility to special cases like `tokensregex`. + * @description + * The connector should support extensibility to special tools: + * - For example, Semgrex is a utility that runs in a separate url Handler + * in StanfordCoreNLPServer + * This url is /semgrex, and apart from the normal options, it expects the + * query-string `pattern` as a must. This `pattern` option is taken here from + * the options object, from the key `semgrex.pattern`. 
*/ - if (annotators.indexOf('regexner') > -1) { + if (utility) { // https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#query-tokensregex-tokensregex - baseUrl += '/tokensregex'; - queryString += `&pattern=${encodeURI(properties['regexner.validpospattern'])}`; - delete properties['regexner.validpospattern']; + baseUrl += `/${utility}`; + queryString += `&${Object.keys(options) + .filter(opt => opt.indexOf(`${utility}.`) === 0) + .map(opt => `${opt.replace(`${utility}.`, '')}=${encodeURI(options[opt])}`) + .join('&')}`; } const rpOpts = { diff --git a/src/index.js b/src/index.js index e8dc2bd..a1d51be 100644 --- a/src/index.js +++ b/src/index.js @@ -1,6 +1,7 @@ import Document from './simple/document'; import Sentence from './simple/sentence'; import Token from './simple/token'; +import Expression from './simple/expression'; import Annotable from './simple/annotable'; import Annotator from './simple/annotator'; import TokenizerAnnotator from './simple/annotator/tokenize'; @@ -38,6 +39,7 @@ export default { Document, Sentence, Token, + Expression, // namespace for default annotators // Predefined annotators @see {@link https://stanfordnlp.github.io/CoreNLP/annotators.html} annotator: { diff --git a/src/pipeline.js b/src/pipeline.js index a974bcd..f127347 100644 --- a/src/pipeline.js +++ b/src/pipeline.js @@ -10,6 +10,12 @@ import depparse from './simple/annotator/depparse'; import relation from './simple/annotator/relation'; import regexner from './simple/annotator/regexner'; +import { + TokensRegexAnnotator, + SemgrexAnnotator, + TregexAnnotator, +} from './simple/expression'; + const ANNOTATORS_BY_KEY = { tokenize, ssplit, @@ -73,6 +79,55 @@ export default class Pipeline { return annotable; } + async annotateTokensRegex(annotable) { + annotable.fromJson(await this._service.getTokensRegexData( + annotable.text(), + annotable.pattern(), + this._getAnnotatorsKeys(), + this._getAnnotatrosOptions())); + + 
annotable.setLanguageISO(LANGUAGE_TO_ISO2[this._language]); + annotable.addAnnotator(TokensRegexAnnotator); + + return annotable; + } + + async annotateSemgrex(annotable) { + annotable.fromJson(await this._service.getSemgrexData( + annotable.text(), + annotable.pattern(), + this._getAnnotatorsKeys(), + this._getAnnotatrosOptions())); + + annotable.setLanguageISO(LANGUAGE_TO_ISO2[this._language]); + annotable.addAnnotator(SemgrexAnnotator); + + return annotable; + } + + async annotateTregex(annotable) { + annotable.fromJson(await this._service.getTregexData( + annotable.text(), + annotable.pattern(), + this._getAnnotatorsKeys(), + this._getAnnotatrosOptions())); + + annotable.setLanguageISO(LANGUAGE_TO_ISO2[this._language]); + annotable.addAnnotator(TregexAnnotator); + + return annotable; + } + + async _semgrex(text, pattern) { + const data = await this._service.getSemgrexData( + text, + pattern, + this._getAnnotatorsKeys(), + this._getAnnotatrosOptions()); + + return data; + } + /** * @private * @returns {Aray.<string>} annotators - those set for this pipeline diff --git a/src/service.js b/src/service.js index bd88363..050d0d3 100644 --- a/src/service.js +++ b/src/service.js @@ -20,6 +20,45 @@ export default class Service { }); } + getTokensRegexData(text, pattern, annotators, options = {}) { + return this._connector.get({ + annotators, + text, + options: { + ...options, + 'tokensregex.pattern': pattern, + }, + language: this._language.toLowerCase(), + utility: 'tokensregex', + }); + } + + getSemgrexData(text, pattern, annotators, options = {}) { + return this._connector.get({ + annotators, + text, + options: { + ...options, + 'semgrex.pattern': pattern, + }, + language: this._language.toLowerCase(), + utility: 'semgrex', + }); + } + + getTregexData(text, pattern, annotators, options = {}) { + return this._connector.get({ + annotators, + text, + options: { + ...options, + 'tregex.pattern': pattern, + }, + language: this._language.toLowerCase(), + utility: 'tregex', 
+ }); + } + static getTokenPosInfo(pos, languageISO) { try { // eslint-disable-next-line global-require, import/no-dynamic-require diff --git a/src/simple/expression.js b/src/simple/expression.js new file mode 100644 index 0000000..1a1d34a --- /dev/null +++ b/src/simple/expression.js @@ -0,0 +1,140 @@ +import Annotable from './annotable'; +import Annotator from './annotator'; + +/** + * A Match of either `TokensRegex`, `Semrgex` or `Tregex`. + * @typedef Match + * @property {number} begin - word begin position, starting from zero + * @property {number} end - word end position, starting from zero (no match ends at 0) + * @property {string} text - matched text + * @property {string} $[label] - any label, as defined in the expression pattern + */ +export class Match { + /** + * Returns the text for the given labeled word + * @param {string} labelName - The labeed name + * @returns {string} text - The text that matched that label + */ + label(labelName) { + return this._data[`_${labelName}`]; + } + + /** + * Update an instance of Expression with data provided by a JSON + * @param {ExpressionJSON} data - The expression data, as returned by CoreNLP API service + * @returns {Expression} expression - The current expression instance + */ + fromJson(data) { + this._data = data; + return this; + } + + toJSON() { + return { ...this._data }; + } + + /** + * Get an instance of Match from a given JSON + * @param {MatchJSON} data - The match data, as returned by CoreNLP API service + * @returns {Match} match - A new Match instance + */ + static fromJson(data) { + const instance = new this(); + return instance.fromJson(data); + } +} + +export class TokensRegexAnnotator extends Annotator { } +export class SemgrexAnnotator extends Annotator { } +export class TregexAnnotator extends Annotator { } + +/** + * The CoreNLP API JSON structure representing an expression + * This expression structure can be found as the output of `TokensRegex`, + * `Semrgex` and `Tregex`. 
+ * @typedef ExpressionJSON + * @property {number} index + * @property {Array.<Array.<Match>>} sentences + */ + +/** + * Class representing an Expression. + * @extends Annotable + */ +export default class Expression extends Annotable { + /** + * Create an Expression + * @param {string} text + * @param {string} pattern - Either `TokensRegex`, `Semgrex` or `Tregex` valid pattern + */ + constructor(text, pattern) { + super(text); + this._pattern = pattern; + this._sentences = []; + } + + /** + * Get a string representation + * @return {string} expression + */ + toString() { + return this._text || this._sentences.map(sent => sent.toString()).join('. '); + } + + /** + * Get the pattern + * @returns {string} pattern - The expression pattern + */ + pattern() { + return this._pattern; + } + + /** + * Get a list of sentences + * @returns {Array.<Sentence>} sentences - The expression sentences + */ + sentences() { + return this._sentences; + } + + /** + * Get the sentence for a given index + * @param {number} index - The position of the sentence to get + * @returns {Sentence} sentence - The expression sentence at the given index + */ + sentence(index) { + return this.sentences()[index]; + } + + /** + * Update an instance of Expression with data provided by a JSON + * @param {ExpressionJSON} data - The expression data, as returned by CoreNLP API service + * @returns {Expression} expression - The current expression instance + */ + fromJson(data) { + if (data.sentences) { + this._sentences = data.sentences.map(sent => + Object.keys(sent).map(matchIndex => + (matchIndex !== 'length' ? 
Match.fromJson(sent[matchIndex]) : false)) + .filter(Boolean)); + } + return this; + } + + toJSON() { + return { + text: this._text, + sentences: this._sentences, + }; + } + + /** + * Get an instance of Expression from a given JSON + * @param {ExpressionJSON} data - The expression data, as returned by CoreNLP API service + * @returns {Expression} expression - A new Expression instance + */ + static fromJson(data) { + const instance = new this(); + return instance.fromJson(data); + } +}