<a href="https://colab.research.google.com/github/Gach-omba/ML-Learning/blob/main/multilingualentityrecognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# to find out the datasets subsets available
! pip install datasets



In [13]:
# List every configuration (subset) that the XTREME benchmark exposes.
from datasets import get_dataset_config_names

xtreme_subsets = get_dataset_config_names(path="xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [14]:
# Keep only the configurations whose name begins with "PAN" (the PAN-X subsets).
panx_subsets = list(filter(lambda subset: subset.startswith("PAN"), xtreme_subsets))
# Peek at the first three names to confirm the suffix and naming structure
# used for the individual languages.
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [15]:
# The suffix after "PAN-X." is a two-letter ISO 639-1 language code, so the
# German corpus is requested as "PAN-X.de".
from datasets import load_dataset

load_dataset(path="xtreme", name="PAN-X.de")

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [16]:
# We sample the four main languages according to their spoken proportions.
# shuffle() avoids sampling bias, and select() lets us downsample each corpus
# according to the values in fracs.
from collections import defaultdict
from datasets import DatasetDict
# Language codes and the fraction of each corpus to keep; the two lists are
# aligned by position (langs[i] goes with fracs[i]).
langs, fracs = map(list, zip(
    ("de", 0.629),
    ("fr", 0.229),
    ("it", 0.084),
    ("en", 0.059),
))


In [17]:
# Build the multilingual corpus: one DatasetDict per language code, created
# on first access by the defaultdict.
panx_ch = defaultdict(DatasetDict)
for lang, frac in zip(langs, fracs):
  # Load the monolingual corpus for this language.
  ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
  # Shuffle every split with a fixed seed, then keep the first `frac` share of its rows.
  for split, split_ds in ds.items():
    keep = int(frac * split_ds.num_rows)
    panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(keep))


In [18]:
# Tabulate how many training examples were kept for each language.
import pandas as pd

train_sizes = {lang: panx_ch[lang]["train"].num_rows for lang in langs}
pd.DataFrame([train_sizes], index=["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [19]:
# German is the most common language in the mix, which makes it a good
# starting point for zero-shot cross-lingual transfer.

# Inspect the first German training example, one "field : value" line per field.
element = panx_ch["de"]["train"][0]
for key in element:
  value = element[key]
  print(f"{key} : {value}")

tokens : ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags : [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs : ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [20]:
# The ner_tags column holds integer ids. Print the feature description of each
# column to see how those ids can be decoded.
de_train_features = panx_ch["de"]["train"].features
for key, value in de_train_features.items():
  print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [21]:
# Pull out the inner feature of the ner_tags sequence: the label definition
# for a single tag.
ner_tags_feature = panx_ch['de']['train'].features['ner_tags']
tags = ner_tags_feature.feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [22]:
# Convert the integer ner_tags to their string names.
def create_tag_names(batch):
  """Return a dict with a `ner_tags_str` list: the name of each id in `batch["ner_tags"]`.

  Uses the module-level `tags` object (`tags.int2str`) to do the conversion.
  """
  tag_ids = batch["ner_tags"]
  return {"ner_tags_str": list(map(tags.int2str, tag_ids))}

# Apply the conversion to the German corpus; the result carries the extra column.
panx_de = panx_ch["de"].map(create_tag_names)

Map:   0%|          | 0/12580 [00:00<?, ? examples/s]

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

In [23]:
# Grab the first German training example from the mapped corpus for inspection.
de_example =panx_de["train"][0]

In [24]:
# Show the example's tokens and their string tags as two aligned rows.
pd.DataFrame(
    data=[de_example["tokens"], de_example["ner_tags_str"]],
    index=['Tokens', 'Tags'],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [25]:
# Check for imbalances in the data: for every split, count how often each
# entity type occurs. Only tags that start with "B" are counted, and the type
# is the part after the "-".
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
  for row in dataset["ner_tags_str"]:
    entity_types = [tag.split("-")[1] for tag in row if tag.startswith("B")]
    if entity_types:
      # Touch the defaultdict only when there is something to count, so a
      # split without any "B" tag gets no (empty) Counter entry.
      split2freqs[split].update(entity_types)
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


In [26]:
# The table shows that the entity types (LOC, ORG, PER) are roughly evenly
# distributed within each split, so the validation and test sets should be a
# good measure of how the NER model generalizes.
# Next: compare SentencePiece and WordPiece tokenization.
# Checkpoint names of the two models whose tokenizers are compared.
bert_model_name, xlmr_model_name = "bert-base-cased", "xlm-roberta-base"

In [28]:
from transformers import AutoTokenizer

# Checkpoint names of the two models whose tokenizers are compared.
bert_model_name, xlmr_model_name = "bert-base-cased", "xlm-roberta-base"

# Load the tokenizer for each checkpoint (BERT first, then XLM-R).
bert_tokenizer, xlmr_tokenizer = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (bert_model_name, xlmr_model_name)
)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [29]:
# Tokenize a short sample sentence with both tokenizers and print the
# resulting tokens, one list per line, for comparison.
text = "Jack Sparrow loves New York !"
bert_encoding = bert_tokenizer(text)
xlmr_encoding = xlmr_tokenizer(text)
bert_tokens = bert_encoding.tokens()
xlmr_tokens = xlmr_encoding.tokens()
print(bert_tokens, xlmr_tokens, sep="\n")


['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '▁!', '</s>']
