Skip to content

Commit

Permalink
+Phi2 pre tokenizer template
Browse files Browse the repository at this point in the history
  • Loading branch information
CrispStrobe committed May 11, 2024
1 parent e53a9fb commit b1dc90e
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4435,14 +4435,20 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else if (
tokenizer_pre == "phi2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PHI2;
} else if (
tokenizer_pre == "olmo") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else if (
tokenizer_pre == "dbrx") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
LLAMA_LOG_WARN("%s: Encountered unknown pre-tokenizer type. Falling back to default. \n", __func__);
LLAMA_LOG_WARN("%s: GENERATION QUALITY MIGHT BE DEGRADED! \n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
// throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
} else {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
Expand Down Expand Up @@ -12278,7 +12284,7 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
word_collection = unicode_regex_split(text, {
"[\r\n]",
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
"\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
"\\s?[!-/:-~!-/:-~‘-‟ -。]+",
"\\s+$",
"[一-龥ࠀ-一가-퟿]+",
Expand Down Expand Up @@ -12333,6 +12339,7 @@ struct llm_tokenizer_bpe {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
});
break;
case LLAMA_VOCAB_PRE_TYPE_PHI2:
default:
// default regex for BPE tokenization pre-processing
word_collection = unicode_regex_split(text, {
Expand Down

0 comments on commit b1dc90e

Please sign in to comment.