Skip to content

Commit

Permalink
textproc/py-tokenizers: add port: Fast state-of-the-art tokenizers op…
Browse files Browse the repository at this point in the history
…timized for research and production

Provides an implementation of today's most used tokenizers, with a
focus on performance and versatility.

Main features:
- Train new vocabularies and tokenize, using today's most used
  tokenizers.
- Extremely fast (both training and tokenization), thanks to the Rust
  implementation. Takes less than 20 seconds to tokenize a GB of text
  on a server's CPU.
- Easy to use, but also extremely versatile.
- Designed for research and production.
- Normalization comes with alignments tracking. It's always possible
  to get the part of the original sentence that corresponds to a given
  token.
- Does all the pre-processing: Truncate, Pad, add the special tokens
  your model needs.

WWW: https://github.com/huggingface/tokenizers
  • Loading branch information
tagattie committed Feb 12, 2024
1 parent 54cee7b commit e3dfc2f
Show file tree
Hide file tree
Showing 5 changed files with 496 additions and 0 deletions.
1 change: 1 addition & 0 deletions textproc/Makefile
Expand Up @@ -1618,6 +1618,7 @@
SUBDIR += py-tiktoken
SUBDIR += py-tinycss
SUBDIR += py-tinycss2
SUBDIR += py-tokenizers
SUBDIR += py-toml
SUBDIR += py-tomli
SUBDIR += py-tomli-w
Expand Down
29 changes: 29 additions & 0 deletions textproc/py-tokenizers/Makefile
@@ -0,0 +1,29 @@
PORTNAME= tokenizers
DISTVERSION= 0.15.1
CATEGORIES= textproc python
MASTER_SITES= PYPI
PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}
# Name the PyPI source archive explicitly.
# NOTE(review): presumably required because USES=cargo appends the crate
# archives to DISTFILES, so the framework default would no longer apply --
# confirm against Mk/Uses/cargo.mk.
DISTFILES= ${PORTNAME}-${PORTVERSION}${EXTRACT_SUFX}

MAINTAINER= tagattie@FreeBSD.org
COMMENT= Fast state-of-the-art tokenizers optimized for research and production
WWW= https://github.com/huggingface/tokenizers

LICENSE= APACHE20

# maturin is needed at build time only and is pinned to the 1.x series;
# huggingface-hub is needed at run time and is pinned to >=0.16.4, <1.0.
BUILD_DEPENDS= ${PYTHON_PKGNAMEPREFIX}maturin>=1.0<2.0:devel/py-maturin@${PY_FLAVOR}
RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}huggingface-hub>=0.16.4<1.0:misc/py-huggingface-hub@${PY_FLAVOR}

USES= cargo python
USE_PYTHON= autoplist pep517

# The Cargo manifest and lock file are taken from bindings/python inside the
# work source tree rather than from its top level.
CARGO_CARGOTOML=${WRKSRC}/bindings/python/Cargo.toml
CARGO_CARGOLOCK=${WRKSRC}/bindings/python/Cargo.lock
# Switch off the cargo framework's own build, install and test steps.
# NOTE(review): presumably the PEP 517 build (maturin) invokes cargo itself,
# leaving USES=cargo to provide only the crates -- confirm.
CARGO_BUILD= no
CARGO_INSTALL= no
CARGO_TEST= no

# After installation into the staging area, run ${STRIP_CMD} on every regular
# file named *.so found under the staged Python site-packages directory.
# NOTE(review): presumably this targets the Rust-built extension module, which
# is installed unstripped -- confirm.
# The recipe line must begin with a TAB so that make treats it as a shell
# command belonging to this target.
post-install:
	@${FIND} ${STAGEDIR}${PYTHON_SITELIBDIR} -type f -name '*.so' -exec ${STRIP_CMD} {} ';'

.include <bsd.port.mk>
149 changes: 149 additions & 0 deletions textproc/py-tokenizers/Makefile.crates
@@ -0,0 +1,149 @@
# Rust crate dependencies, one "name-version" entry per line, assigned to
# CARGO_CRATES.
# NOTE(review): presumably generated from Cargo.lock (e.g. via
# `make cargo-crates`); regenerate rather than hand-edit -- confirm.
CARGO_CRATES=	aho-corasick-1.1.2 \
		anstream-0.6.5 \
		anstyle-1.0.4 \
		anstyle-parse-0.2.3 \
		anstyle-query-1.0.2 \
		anstyle-wincon-3.0.2 \
		autocfg-1.1.0 \
		base64-0.13.1 \
		bitflags-1.3.2 \
		bitflags-2.4.1 \
		cc-1.0.83 \
		cfg-if-1.0.0 \
		clap-4.4.11 \
		clap_builder-4.4.11 \
		clap_derive-4.4.7 \
		clap_lex-0.6.0 \
		colorchoice-1.0.0 \
		console-0.15.7 \
		crossbeam-deque-0.8.4 \
		crossbeam-epoch-0.9.16 \
		crossbeam-utils-0.8.17 \
		darling-0.14.4 \
		darling_core-0.14.4 \
		darling_macro-0.14.4 \
		derive_builder-0.12.0 \
		derive_builder_core-0.12.0 \
		derive_builder_macro-0.12.0 \
		either-1.9.0 \
		encode_unicode-0.3.6 \
		env_logger-0.10.1 \
		errno-0.3.8 \
		esaxx-rs-0.1.10 \
		fastrand-2.0.1 \
		fnv-1.0.7 \
		getrandom-0.2.11 \
		heck-0.4.1 \
		hermit-abi-0.3.3 \
		humantime-2.1.0 \
		ident_case-1.0.1 \
		indicatif-0.17.7 \
		indoc-2.0.4 \
		instant-0.1.12 \
		is-terminal-0.4.9 \
		itertools-0.11.0 \
		itoa-1.0.10 \
		lazy_static-1.4.0 \
		libc-0.2.151 \
		linux-raw-sys-0.4.12 \
		lock_api-0.4.11 \
		log-0.4.20 \
		macro_rules_attribute-0.2.0 \
		macro_rules_attribute-proc_macro-0.2.0 \
		matrixmultiply-0.3.8 \
		memchr-2.6.4 \
		memoffset-0.9.0 \
		minimal-lexical-0.2.1 \
		monostate-0.1.10 \
		monostate-impl-0.1.10 \
		ndarray-0.15.6 \
		nom-7.1.3 \
		num-complex-0.4.4 \
		num-integer-0.1.45 \
		num-traits-0.2.17 \
		number_prefix-0.4.0 \
		numpy-0.20.0 \
		once_cell-1.19.0 \
		onig-6.4.0 \
		onig_sys-69.8.1 \
		parking_lot-0.12.1 \
		parking_lot_core-0.9.9 \
		paste-1.0.14 \
		pkg-config-0.3.27 \
		portable-atomic-1.6.0 \
		ppv-lite86-0.2.17 \
		proc-macro2-1.0.70 \
		pyo3-0.20.2 \
		pyo3-build-config-0.20.2 \
		pyo3-ffi-0.20.2 \
		pyo3-macros-0.20.2 \
		pyo3-macros-backend-0.20.2 \
		quote-1.0.33 \
		rand-0.8.5 \
		rand_chacha-0.3.1 \
		rand_core-0.6.4 \
		rawpointer-0.2.1 \
		rayon-1.8.0 \
		rayon-cond-0.3.0 \
		rayon-core-1.12.0 \
		redox_syscall-0.4.1 \
		regex-1.10.2 \
		regex-automata-0.4.3 \
		regex-syntax-0.7.5 \
		regex-syntax-0.8.2 \
		rustc-hash-1.1.0 \
		rustix-0.38.28 \
		ryu-1.0.16 \
		scopeguard-1.2.0 \
		serde-1.0.193 \
		serde_derive-1.0.193 \
		serde_json-1.0.108 \
		smallvec-1.11.2 \
		spm_precompiled-0.1.4 \
		strsim-0.10.0 \
		syn-1.0.109 \
		syn-2.0.41 \
		target-lexicon-0.12.12 \
		tempfile-3.8.1 \
		termcolor-1.4.0 \
		thiserror-1.0.51 \
		thiserror-impl-1.0.51 \
		unicode-ident-1.0.12 \
		unicode-normalization-alignments-0.1.12 \
		unicode-segmentation-1.10.1 \
		unicode-width-0.1.11 \
		unicode_categories-0.1.1 \
		unindent-0.2.3 \
		utf8parse-0.2.1 \
		wasi-0.11.0+wasi-snapshot-preview1 \
		winapi-0.3.9 \
		winapi-i686-pc-windows-gnu-0.4.0 \
		winapi-util-0.1.6 \
		winapi-x86_64-pc-windows-gnu-0.4.0 \
		windows-sys-0.45.0 \
		windows-sys-0.48.0 \
		windows-sys-0.52.0 \
		windows-targets-0.42.2 \
		windows-targets-0.48.5 \
		windows-targets-0.52.0 \
		windows_aarch64_gnullvm-0.42.2 \
		windows_aarch64_gnullvm-0.48.5 \
		windows_aarch64_gnullvm-0.52.0 \
		windows_aarch64_msvc-0.42.2 \
		windows_aarch64_msvc-0.48.5 \
		windows_aarch64_msvc-0.52.0 \
		windows_i686_gnu-0.42.2 \
		windows_i686_gnu-0.48.5 \
		windows_i686_gnu-0.52.0 \
		windows_i686_msvc-0.42.2 \
		windows_i686_msvc-0.48.5 \
		windows_i686_msvc-0.52.0 \
		windows_x86_64_gnu-0.42.2 \
		windows_x86_64_gnu-0.48.5 \
		windows_x86_64_gnu-0.52.0 \
		windows_x86_64_gnullvm-0.42.2 \
		windows_x86_64_gnullvm-0.48.5 \
		windows_x86_64_gnullvm-0.52.0 \
		windows_x86_64_msvc-0.42.2 \
		windows_x86_64_msvc-0.48.5 \
		windows_x86_64_msvc-0.52.0

0 comments on commit e3dfc2f

Please sign in to comment.