Skip to content

Commit

Permalink
Implement gpt2 (BPE) GGUF tokenizer conversion (#397)
Browse files Browse the repository at this point in the history
* Implement gpt2 gguf tokenizer

* Fix unk tok calculation

* Remove normalizer

* Update gguf tokenizer

* Allow adding unk token when found

* Add unk token to builder if provided.

* Improve add_special_tokens

* Use tokenizerx builder

* Add useful comment

Co-authored-by: Brennan Kinney <5098581+polarathene@users.noreply.github.com>

* Bump version to 0.1.16 (#404)

* Bump version to 0.1.17

* Fix version bump

* Add and update template READMEs (#405)

* Add readmes

* Fix typos

* Improve Rust docs (#406)

* Expose phi3v loader and remove unused deps (#408)

* Support format for mixtral where experts are in one tensor (#355)

* Normal loading metadata for vision models (#409)

* Phi 3 vision ISQ support (#410)

* ISQ support for phi3v

* Document it

* Remove causal masks cache (#412)

* Fix: use new slice_assign (#415)

* Use new slice_assign

* Fix dead image links

* Fix Phi-3 GGUF (#414)

* Fix kv head usage

* Fix rope weights

* Clippy

* Work on the gpt2 conversion

* Add comment

* Add some tests

* Update readme

---------

Co-authored-by: Brennan Kinney <5098581+polarathene@users.noreply.github.com>
  • Loading branch information
EricLBuehler and polarathene committed Jun 10, 2024
1 parent 37c6726 commit 46b0364
Show file tree
Hide file tree
Showing 2 changed files with 219 additions and 18 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,8 @@ The following tokenizer model types are currently supported. If you would like o
please consider using the method demonstrated in examples below, where the tokenizer is sourced from Hugging Face.

**Supported GGUF tokenizer types**
- `llama`
- `llama` (sentencepiece)
- `gpt2` (BPE)

## Run with the CLI

Expand Down
234 changes: 217 additions & 17 deletions mistralrs-core/src/gguf/gguf_tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
use std::sync::atomic::Ordering;
// https://github.com/huggingface/transformers/blob/8685b3c5d2dd2550527773d2a02499495a759e31/src/transformers/convert_slow_tokenizer.py

use std::{collections::HashMap, sync::atomic::Ordering};

use anyhow::Result;
use candle_core::quantized::gguf_file::Content;
use itertools::Itertools;
use tokenizers::{
decoders::{self, byte_fallback::ByteFallback, fuse::Fuse, strip::Strip},
models::unigram::Unigram,
decoders::{
self, byte_fallback::ByteFallback, byte_level::ByteLevel, fuse::Fuse, strip::Strip,
},
models::{bpe::BpeBuilder, unigram::Unigram},
normalizers::{self, Prepend, Replace},
pre_tokenizers,
processors::{
self,
template::{self, TemplateProcessing},
},
AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
};
use tracing::info;
Expand All @@ -29,6 +39,7 @@ struct PropsGGUF {
unk: Option<u32>,
eos: u32,
bos: u32,
add_bos_token: Option<bool>,
}

impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
Expand All @@ -47,12 +58,19 @@ impl TryFrom<ContentMetadata<'_>> for PropsGGUF {
unk: c.get_value("unknown_token_id").ok(),
eos: c.get_value("eos_token_id")?,
bos: c.get_value("bos_token_id")?,
add_bos_token: c.get_value("add_bos_token").ok(),
};

Ok(props)
}
}

struct AddedTokensCollection {
bos: String,
eos: String,
unk: Option<String>,
}

pub fn convert_gguf_to_hf_tokenizer(content: &Content) -> Result<GgufTokenizerConversion> {
let metadata = ContentMetadata {
path_prefix: "tokenizer.ggml",
Expand All @@ -62,6 +80,7 @@ pub fn convert_gguf_to_hf_tokenizer(content: &Content) -> Result<GgufTokenizerCo

let (tokenizer, kind, special_tokens) = match props.model.as_str() {
"llama" | "replit" => unigram_tokenizer(&props)?,
"gpt2" => bpe_tokenizer(&props)?,
other => {
anyhow::bail!("Tokenizer model `{other}` not supported.");
}
Expand All @@ -79,26 +98,55 @@ pub fn convert_gguf_to_hf_tokenizer(content: &Content) -> Result<GgufTokenizerCo
info!("Tokenizer: {tokenizer:?}");
}

let [bos_str, eos_str, unk_str] = special_tokens
.try_into()
.or_else(|_| anyhow::bail!("Tokenizer is missing required special tokens"))?;
let AddedTokensCollection { bos, eos, unk } = special_tokens;

Ok(GgufTokenizerConversion {
tokenizer,
bos: Some(bos_str),
eos: Some(eos_str),
unk: Some(unk_str),
bos: Some(bos),
eos: Some(eos),
unk,
})
}

// TODO: Add support for additional tokenizer models: BPE, WordPiece, WordLevel
// TODO: Add support for additional tokenizer models: WordPiece, WordLevel
// https://docs.rs/tokenizers/latest/tokenizers/models/enum.ModelWrapper.html
#[derive(Debug)]
enum TokenizerKind {
Unigram,
Bpe,
}

/// Add the special tokens and return their string representations
fn add_special_tokens(
p: &PropsGGUF,
tokenizer: &mut Tokenizer,
bos: u32,
eos: u32,
unk: Option<u32>,
) -> AddedTokensCollection {
// Add special tokens (bos, eos, unk):
let mut special_tokens: [Option<String>; 3] = Default::default();

// A little bit awkward here since eos/bos are assumed not options so we need to handle an Option
for (i, token_id) in [Some(bos), Some(eos), unk].into_iter().enumerate() {
if let Some(token_id) = token_id {
let token = p.tokens[token_id as usize].as_str();
special_tokens[i] = Some(token.to_string());
tokenizer.add_special_tokens(&[AddedToken::from(token.to_string(), true)]);
}
}

// Destructure array of options:
let [bos_str, eos_str, unk_str] = special_tokens;
// Would need to unwrap bos/eos here, or change the struct types
AddedTokensCollection {
bos: bos_str.unwrap(),
eos: eos_str.unwrap(),
unk: unk_str,
}
}

fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, Vec<String>)> {
fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
let PropsGGUF { unk, eos, bos, .. } = *p;
// Unigram (SentencePiece) default UNK is 0
let unk = unk.unwrap_or(0);
Expand Down Expand Up @@ -140,15 +188,84 @@ fn unigram_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, Vec<Str
.build()?;

// Add special tokens (bos, eos, unk):
let mut special_tokens = Vec::<String>::new();
for token_id in [bos, eos, unk] {
let token = p.tokens[token_id as usize].as_str();
let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, Some(unk));

Ok((tokenizer, TokenizerKind::Unigram, special_tokens))
}

special_tokens.push(token.to_owned());
tokenizer.add_special_tokens(&[AddedToken::from(token.to_owned(), true)]);
fn bpe_tokenizer(p: &PropsGGUF) -> Result<(Tokenizer, TokenizerKind, AddedTokensCollection)> {
// BPE merges have each string item as a space-delimited pair:
// https://github.com/EricLBuehler/mistral.rs/pull/397#discussion_r1631988370
let merges = p
.merges
.as_ref()
.ok_or(anyhow::Error::msg("BPE tokenizer must include merges"))?
.iter()
.map(|merge| {
let split: (&str, &str) = merge
.splitn(2, ' ')
.collect_tuple()
.expect("Failed to convert split into 2-tuple");
(split.0.to_string(), split.1.to_string())
})
.collect::<Vec<_>>();

let mut vocab = HashMap::new();
for (i, token) in p.tokens.iter().enumerate() {
#[allow(clippy::cast_possible_truncation)]
vocab.insert(token.clone(), i as u32);
}

Ok((tokenizer, TokenizerKind::Unigram, special_tokens))
let PropsGGUF {
eos,
bos,
unk,
add_bos_token,
..
} = *p;

let mut bpe = BpeBuilder::new().vocab_and_merges(vocab, merges);
if let Some(unk) = unk {
bpe = bpe.unk_token(p.tokens[unk as usize].to_string());
};

let bpe = bpe.build().map_err(anyhow::Error::msg)?;

let mut tokenizer = TokenizerX::try_builder()
.with_model(bpe)
.with_decoder(Decoder::ByteLevel(true, true, true))
.build()?;
tokenizer.with_pre_tokenizer(pre_tokenizers::byte_level::ByteLevel::new(
false, true, true,
));
if add_bos_token.is_some_and(|x| x) {
let mut special_toks = HashMap::new();
special_toks.insert(
p.tokens[bos as usize].clone(),
template::SpecialToken::new(
p.tokens[bos as usize].clone(),
vec![bos],
vec![p.tokens[bos as usize].clone()],
)
.unwrap(),
);
tokenizer.with_post_processor(
TemplateProcessing::builder()
.try_single(format!("{}:0 $A:0", p.tokens[bos as usize]))
.unwrap()
.try_pair(format!("{}:0 $A:0 $B:1", p.tokens[bos as usize]))
.unwrap()
.special_tokens(special_toks)
.build()
.unwrap(),
);
} else {
tokenizer.with_post_processor(processors::byte_level::ByteLevel::new(true, false, true));
}

let special_tokens = add_special_tokens(p, &mut tokenizer, bos, eos, unk);

Ok((tokenizer, TokenizerKind::Bpe, special_tokens))
}

// This is a workaround to have a better builder API.
Expand Down Expand Up @@ -187,6 +304,7 @@ enum Decoder<'a> {
Replace(&'a str, &'a str),
Strip(char, usize, usize),
Sequence(Vec<Self>),
ByteLevel(bool, bool, bool),
}

// Convert into upstream type wrapped enum variants:
Expand All @@ -209,6 +327,9 @@ impl TryFrom<Decoder<'_>> for DecoderWrapper {

decoders::sequence::Sequence::new(seq).into()
}
Decoder::ByteLevel(add_prefix_space, trim_offsets, use_regex) => {
ByteLevel::new(add_prefix_space, trim_offsets, use_regex).into()
}
};

Ok(value)
Expand Down Expand Up @@ -285,6 +406,24 @@ mod tests {
.map_err(anyhow::Error::msg)
.map(|res| res.tokenizer)
}
TokenizerType::Gpt2 => {
let api = ApiBuilder::new().with_progress(true).build().unwrap();
let api = api.repo(Repo::with_revision(
"QuantFactory/Meta-Llama-3-8B-Instruct-GGUF".to_string(),
RepoType::Model,
"main".to_string(),
));

let filename = api.get("Meta-Llama-3-8B-Instruct.Q2_K.gguf").unwrap();
let mut file = std::fs::File::open(&filename)?;
convert_gguf_to_hf_tokenizer(
&Content::read(&mut file)
.map_err(|e| e.with_path(filename))
.map_err(anyhow::Error::msg)?,
)
.map_err(anyhow::Error::msg)
.map(|res| res.tokenizer)
}
other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"),
}
}
Expand All @@ -302,6 +441,17 @@ mod tests {
let tokenizer_filename = api.get("tokenizer.json").unwrap();
Ok(Tokenizer::from_file(tokenizer_filename).unwrap())
}
TokenizerType::Gpt2 => {
let api = ApiBuilder::new().with_progress(true).build().unwrap();
let api = api.repo(Repo::with_revision(
"EricB/mistralrs_tests".to_string(),
RepoType::Model,
"main".to_string(),
));

let tokenizer_filename = api.get("tokenizer_gpt2.json").unwrap();
Ok(Tokenizer::from_file(tokenizer_filename).unwrap())
}
other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"),
}
}
Expand Down Expand Up @@ -362,6 +512,31 @@ mod tests {
Ok(())
}

#[test]
fn test_encode_gpt2() -> Result<()> {
let passage = get_test_passage();
let hf_tokenizer = get_hf_tokenizer(TokenizerType::Gpt2)?;
let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Gpt2)?;

// Without adding special tokens
let hf_decoded = codec_roundtrip(&hf_tokenizer, passage.as_str(), false)?;
let gguf_decoded = codec_roundtrip(&gguf_tokenizer, passage.as_str(), false)?;
assert_eq!(hf_decoded, gguf_decoded);
assert_eq!(passage, gguf_decoded);

// With special tokens added
// SKIPPED:
// - Bugged the GGUF tokenizer does not prepend `<s> `
// - Due to HF tokenizer using BPE (tokenizer.json) while GGUF tokenizer uses Unigram (metadata)?
/*
let hf_decoded = codec_roundtrip(&hf_tokenizer, passage.as_str(), true)?;
let gguf_decoded = codec_roundtrip(&gguf_tokenizer, passage.as_str(), true)?;
assert_eq!(hf_decoded, gguf_decoded);
*/

Ok(())
}

#[test]
fn test_decode_llama() -> Result<()> {
use rand::seq::SliceRandom;
Expand All @@ -386,4 +561,29 @@ mod tests {

Ok(())
}

#[test]
fn test_decode_gpt2() -> Result<()> {
use rand::seq::SliceRandom;
use rand::thread_rng;

let hf_tokenizer = get_hf_tokenizer(TokenizerType::Gpt2)?;
let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Gpt2)?;

#[allow(clippy::cast_possible_truncation)]
let mut tokens = (0..hf_tokenizer.get_vocab_size(false) as u32).collect::<Vec<_>>();
tokens.shuffle(&mut thread_rng());

// Without skipping special tokens
let hf_decoded = decode(&hf_tokenizer, &tokens, false)?;
let gguf_decoded = decode(&gguf_tokenizer, &tokens, false)?;
assert_eq!(hf_decoded, gguf_decoded);

// With skipping special tokens
let hf_decoded = decode(&hf_tokenizer, &tokens, true)?;
let gguf_decoded = decode(&gguf_tokenizer, &tokens, true)?;
assert_eq!(hf_decoded, gguf_decoded);

Ok(())
}
}

0 comments on commit 46b0364

Please sign in to comment.