Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

Commit

Permalink
feat: add sentence validator for catalan (#606)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmigual authored Feb 19, 2022
1 parent 18fa4b2 commit 1096aef
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
2 changes: 2 additions & 0 deletions server/lib/validation/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const defaultValidator = require('./languages/default');
const bas = require('./languages/bas');
const ca = require('./languages/ca');
const ckb = require('./languages/ckb');
const en = require('./languages/en');
const eo = require('./languages/eo');
Expand All @@ -16,6 +17,7 @@ const yue = require('./languages/yue');

const VALIDATORS = {
bas,
ca,
ckb,
en,
eo,
Expand Down
38 changes: 38 additions & 0 deletions server/lib/validation/languages/ca.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
const tokenizeWords = require('talisman/tokenizers/words/gersam');

// Minimum number of words that qualify as a sentence.
const MIN_WORDS = 1;

// Maximum number of words allowed per sentence, to keep recordings at a manageable duration.
const MAX_WORDS = 14;

// Uppercase letters used in Catalan: ASCII A-Z plus the accented vowels and Ç.
// Kept as a plain string so it can be reused inside several character classes.
const UPPERCASE = 'A-ZÀÈÉÍÏÒÓÚÜÇ';

/**
 * Validation rules for Catalan sentences.
 *
 * Each rule carries a user-facing `error` message (in Catalan) and exactly one
 * check: either `fn`, a predicate that returns true for an invalid sentence,
 * or `regex`, a pattern describing invalid content.
 * NOTE(review): the consumer is not visible here — presumably a sentence is
 * rejected when `fn` returns true or `regex` matches; confirm against the
 * validation index.
 */
const INVALIDATIONS = [{
  // Word count must be within [MIN_WORDS, MAX_WORDS], both ends included.
  fn: (sentence) => {
    const words = tokenizeWords('ca', sentence);
    return words.length < MIN_WORDS || words.length > MAX_WORDS;
  },
  error: `El nombre de paraules ha de ser entre ${MIN_WORDS} i ${MAX_WORDS} (inclòs)`,
}, {
  // Digits are not allowed.
  regex: /[0-9]+/,
  error: 'La frase no pot contenir nombres',
}, {
  // Sentence-ending punctuation followed by more text could mean multiple
  // sentences per line.
  regex: /[?!.].+/,
  error: 'La frase no pot contenir signes de puntuació al mig',
}, {
  // Symbols not allowed; when extending this list, add them to the regex too:
  // < > + * \ # @ ^ “ ” ‘ ’ ( ) [ ] / { }
  // Runs of two or more whitespace characters or exclamation marks are also rejected.
  regex: /[<>+*\\#@^“”‘’(){}[\]/]|\s{2,}|!{2,}/,
  error: 'La frase no pot contenir símbols o múltiples espais o exclamacions',
}, {
  // Any words consisting of uppercase letters, or uppercase letters with periods
  // in between, are considered abbreviations or acronyms. Accented capitals are
  // included so that words such as "ÉS" are caught as well.
  // This currently also matches fooBAR, but we most probably don't want that either
  // as users wouldn't know how to pronounce the uppercase letters.
  regex: new RegExp(`[${UPPERCASE}]{2,}|[${UPPERCASE}]+\\.*[${UPPERCASE}]+`),
  error: 'La frase no pot contenir abreviacions o acrònims',
}];

// Public interface of the Catalan validator: only the rule list is exported.
module.exports = { INVALIDATIONS };

0 comments on commit 1096aef

Please sign in to comment.