##  dataset

 Using a subset of the Cross-lingual TRansfer Evaluation of Multilingual Encoders
(XTREME) benchmark called WikiANN or PAN-X

In [9]:
from datasets import get_dataset_config_names

xtreme_subsets = get_dataset_config_names('xtreme')
print(f"XTREME has {len(xtreme_subsets)} configurations")

text = 'Jeff Dean is a computer scientist at Google in California'

panx_subsets = [s for s in xtreme_subsets if s.startswith('PAN')]
print(panx_subsets)

from datasets import load_dataset

dataset = load_dataset('xtreme', name='PAN-X.zh')
print(dataset)

XTREME has 183 configurations
['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el', 'PAN-X.en', 'PAN-X.es', 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi', 'PAN-X.fr', 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it', 'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', 'PAN-X.ml', 'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', 'PAN-X.pt', 'PAN-X.ru', 'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr', 'PAN-X.ur', 'PAN-X.vi', 'PAN-X.yo', 'PAN-X.zh']


Generating train split: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 319146.57 examples/s]
Generating validation split: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 1477804.24 examples/s]
Generating test split: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 1599536.27 examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})





In [22]:
from collections import defaultdict
from datasets import DatasetDict
import pandas as pd

langs = ['zh', 'fr', 'ja', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]

panx_ch = defaultdict(DatasetDict)
idx = 0
for lang, frac in zip(langs, fracs):
    ds = load_dataset('xtreme', name=f'PAN-X.{lang}')
    if idx == 0:
        print('ds:\n\r')
        print(ds)
    for split in ds:
        panx_ch[lang][split] = (ds[split].shuffle(seed=0).select(range(int(frac * ds[split].num_rows))))

pd.DataFrame({
    lang: [panx_ch[lang]['train'].num_rows] for lang in langs
}, index=["Number of training examples"])

# panx_zh show case
items = panx_ch['zh']['train'][0]
for key, value in items.items():
    print(f"key: {key}, value: {value}")
# datasets features
for k, v in panx_ch['zh']['train'].features.items():
    print(f"key: {k}, value: {v}")

tags = panx_ch['zh']['train'].features['ner_tags'].feature
print(tags)


def create_tag_names(batch):
    return {'ner_tags_str': [tags.int2str(label) for label in batch['ner_tags']]}


panx_zh = panx_ch['zh'].map(create_tag_names)

zh_example = panx_zh['train'][0]
pd.DataFrame([zh_example['tokens'], zh_example['ner_tags_str']], ['Token', 'Tags'])

# count frequency 
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_zh.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            if tag.startswith("B"):
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

ds:

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})
ds:

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})
ds:

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
  

Unnamed: 0,PER,ORG,LOC
train,4899,4815,5437
validation,2398,2487,2603
test,2487,2381,2728


In [24]:
from transformers import AutoTokenizer, AutoConfig

bert_model_name = 'bert-base-cased'
xmlr_model_name = 'xlm-roberta-base'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xmlr_tokenizer = AutoTokenizer.from_pretrained(xmlr_model_name)

text = "李宁是个运动员"
bert_tokens = bert_tokenizer(text).tokens()
print(bert_tokens)
xmlr_tokens = xmlr_tokenizer(text).tokens()
print(xmlr_tokens)


['[CLS]', '李', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[SEP]']
['<s>', '▁李', '宁', '是个', '运动员', '</s>']


## Tokenizer pipeline

分词器工作流程<br><br>
<img src="./imgs/tokenizer_pipeline.png" width="400"/>

**Normalization** 规范化

原始字符串清理，包括去空格、去除重音字符(&eacute; &uuml; &ntilde;)等。针对unicode字符规范化操作只要用于解决相同字符可能有不同表示形式的问题，这会导致计算机认为是不同字符。常见的unicode规范化方案
如NFC、NFD、NFKC、NFKD会将这些不同表示方式替换为标准形式，以确保字符串的一致性。如 &eacute;可以表示为：
* NFC (标准合成形式):  &eacute;(单个字符 U+00E9)
* NFD（标准分解形式）：e + ´（U+0065 + U+0301）

还有一种规范化方式：大写转小写，可以减少词表大小。

**Pretokenization** 预分词 用于对输入文本进行预处理，以便更好地进行后续的分词操作。
1.主要作用
* 提高分词一致性：处理变音符号、标点符号、大小写等问题，使相同语义的文本表现一致。
* 减少分词歧义：将复杂的文本结构转换成更易分词的格式，例如拆分连字符、去除额外的空格等。
* 提升模型效果：减少无关的文本噪音，优化分词策略，使词表更稳定。
2.常见操作
* Unicode 规范化
* 去除或标准化重音符号："résumé" → "resume"（去掉重音符号） "naïve" → "naive"（处理变音符号）
* 标点符号和空格处理："Hello,world!" → "Hello , world !"（添加空格，防止单词与标点符号粘连）/ "Hello world" → "Hello world"（去除多余空格）
* 拆分复合词："e-mail" → "e mail" "I'll" → ["I", "'ll"]

**Tokenizer Model**

拆分字或者词至更小单元，以减少词表大小，减少超出词表token数量。常见的子词分词算法：BPE（），Unigram，WordPiece。<br><br>
1. BPE: Byte pair encoding 基于统计的合并策略，迭代合并最频繁的字符串/字词对，构建词汇表；先把单词拆字符，再逐步合并成高频相邻子词，最终形成稳定子词单元。
2. Unigram： Unigram Language level 基于概率的子词选择策略。删除低概率子词，以找到最优子词集合。
3. WordPiece：类似BPE，基于最大似然估计（MLE）而不是基于频率合并。主要用于谷歌的NLP模型，如Bert。

几种算法对比：<br><br>

| **算法** | **策略** | **训练方式** | **是否可回溯** | **计算复杂度** | **应用** |
| --- | --- | --- | --- | --- | --- |
| **BPE** | 频率最高的子词合并 | 迭代式合并 | ❌ 不可回溯 | ✅ 快速 | GPT-2, SentencePiece |
| **Unigram** | 通过删除低概率子词优化 | 反复计算概率 | ✅ 可回溯 | ❌ 计算复杂 | T5, ALBERT |
| **WordPiece** | 基于最大似然估计（MLE） | 迭代式合并 | ❌ 不可回溯 | ❌ 计算复杂 | BERT, DistilBERT |

**Postprocessing** 后处理

做一些特殊的转换，如给输入序列的token串添加特殊token，如Bert的tokenizer会增加分割服务tokenizer: `[CLS, jack,
spa, rrow, loves, new, york, !, SEP]`

SentencePiece Tokenizer 基于Unigram，将每个输入文本编码成unicode字符，使SentencePiece tokenizer不受重音符号、标点符号的影响，且能够适用于诸如日语等不适用空格字符的语言，同时能够允许它毫无歧义的将token
还原成原本的文本。
```
"".join(xlmr_tokens).replace(u"\u2581", " ")
'<s> Jack Sparrow loves New York!</s>'
```

## Creating a custom model for classification


In [None]:
import torch.nn as nn
import torch
from transformers import XLMRobertaConfig, AutoConfig, AutoTokenizer
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # load model body
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        #set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        #load initial weights
        self.init_weights()
    
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        # use body get encoder representations
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
        # apply classifier to encoder representation
        sequence_output = self.dropout(outputs)
        logits = self.classifier(sequence_output)
        # calculate loss
        loss= None
        if labels is not None:
            loss_fac = nn.CrossEntropyLoss()
            loss = loss_fac(logits.view(-1, self.num_labels), labels.view(-1))
        # return model output
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

xmlr_model_name = 'xlm-roberta-base'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_config = AutoConfig.from_pretrained(xmlr_model_name)
xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xmlr_model_name, config=xlmr_config).to(device))

xmlr_tokenizer = AutoTokenizer.from_pretrained(xmlr_model_name)
input_ids = xmlr_tokenizer.encode(text, return_tensors="pt")
xlmr_tokens = xmlr_tokenizer(text, return_tensors="pt")
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

outputs = xlmr_model(input_ids.to(device)).logits
preds = torch.argmax(outputs, dim=-1)
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of outputs: {outputs.shape}")