# Evaluate the encoder

In [4]:
import logging
from dask.distributed import Client, LocalCluster

In [5]:
dask_logger = logging.getLogger("distributed.utils_perf")
dask_logger.setLevel(logging.ERROR)

In [6]:
dask_client = Client(LocalCluster(
    n_workers=20,
    threads_per_worker=1))

In [7]:
dask_client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 10
Total threads: 10,Total memory: 31.16 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36517,Workers: 10
Dashboard: http://127.0.0.1:8787/status,Total threads: 10
Started: Just now,Total memory: 31.16 GiB

0,1
Comm: tcp://127.0.0.1:42505,Total threads: 1
Dashboard: http://127.0.0.1:37641/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:44401,
Local directory: /tmp/dask-worker-space/worker-kmq_zdxz,Local directory: /tmp/dask-worker-space/worker-kmq_zdxz

0,1
Comm: tcp://127.0.0.1:35621,Total threads: 1
Dashboard: http://127.0.0.1:43323/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:44755,
Local directory: /tmp/dask-worker-space/worker-be111zzj,Local directory: /tmp/dask-worker-space/worker-be111zzj

0,1
Comm: tcp://127.0.0.1:46055,Total threads: 1
Dashboard: http://127.0.0.1:38553/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:40535,
Local directory: /tmp/dask-worker-space/worker-iss5orz_,Local directory: /tmp/dask-worker-space/worker-iss5orz_

0,1
Comm: tcp://127.0.0.1:33275,Total threads: 1
Dashboard: http://127.0.0.1:42585/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:32891,
Local directory: /tmp/dask-worker-space/worker-w0zp8mx9,Local directory: /tmp/dask-worker-space/worker-w0zp8mx9

0,1
Comm: tcp://127.0.0.1:36183,Total threads: 1
Dashboard: http://127.0.0.1:35859/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:42697,
Local directory: /tmp/dask-worker-space/worker-ngykbjle,Local directory: /tmp/dask-worker-space/worker-ngykbjle

0,1
Comm: tcp://127.0.0.1:38531,Total threads: 1
Dashboard: http://127.0.0.1:45377/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:43547,
Local directory: /tmp/dask-worker-space/worker-bgwdnxr8,Local directory: /tmp/dask-worker-space/worker-bgwdnxr8

0,1
Comm: tcp://127.0.0.1:44677,Total threads: 1
Dashboard: http://127.0.0.1:44701/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:38177,
Local directory: /tmp/dask-worker-space/worker-vzj_5uz1,Local directory: /tmp/dask-worker-space/worker-vzj_5uz1

0,1
Comm: tcp://127.0.0.1:35023,Total threads: 1
Dashboard: http://127.0.0.1:34757/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:42105,
Local directory: /tmp/dask-worker-space/worker-_cpfs_64,Local directory: /tmp/dask-worker-space/worker-_cpfs_64

0,1
Comm: tcp://127.0.0.1:40439,Total threads: 1
Dashboard: http://127.0.0.1:42351/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:46665,
Local directory: /tmp/dask-worker-space/worker-tvfno4pw,Local directory: /tmp/dask-worker-space/worker-tvfno4pw

0,1
Comm: tcp://127.0.0.1:39015,Total threads: 1
Dashboard: http://127.0.0.1:38795/status,Memory: 3.12 GiB
Nanny: tcp://127.0.0.1:46259,
Local directory: /tmp/dask-worker-space/worker-o0w090fk,Local directory: /tmp/dask-worker-space/worker-o0w090fk


## Unks

In [20]:
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer

from cai_common.data import KangyurLoader, TengyurLoader

In [2]:
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="bod_Tibt"
)

In [4]:
kangyur_df = KangyurLoader().remove_new_lines().dataframe.compute()

In [5]:
tengyur_df = TengyurLoader().remove_new_lines().dataframe.compute()

In [6]:
len(kangyur_df), len(tengyur_df)

(65461, 128630)

In [12]:
text = kangyur_df.text.tolist()
text.extend(tengyur_df.text.tolist())
len(text)

194091

In [14]:
tokenized = [tokenizer.encode(l) for l in tqdm(text)]

  0%|          | 0/194091 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (6539 > 1024). Running this sequence through the model will result in indexing errors


### Total Unk counts

In [17]:
flattened = [t for l in tokenized for t in l]
len(flattened)

87885924

In [26]:
token_counts = pd.Series(flattened).value_counts()

In [28]:
token_counts[tokenizer.unk_token_id] / token_counts.sum()

0.001979452363725504

### Unk densities

In [29]:
unk_counts = [sum([t == tokenizer.unk_token_id for t in l]) for l in tqdm(tokenized)]

  0%|          | 0/194091 [00:00<?, ?it/s]

In [39]:
unk_counts = pd.Series(unk_counts).value_counts().sort_index()
unk_counts

0      161945
1       11327
2        5460
3        3162
4        2171
        ...  
128         1
160         1
161         1
200         1
373         1
Length: 94, dtype: int64

In [43]:
1 - unk_counts[unk_counts > 1].sum() / unk_counts.sum()

8.243555857823104e-05

## Tibetan vocabulary

In [4]:
import unicodedata

from transformers import AutoTokenizer

In [47]:
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="bod_Tibt")

In [41]:
list(tokenizer.vocab.keys())[0]

'▁indflydelse'

In [48]:
def _is_tibetan(word):
    excluded = {"▁", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "฀", "", "", ""}
    return all(['TIBETAN' in unicodedata.name(c) for c in word if not c in excluded])

In [49]:
tibetan = [_is_tibetan(w) for w in list(tokenizer.vocab.keys())]

In [50]:
sum(tibetan)

2951

In [45]:
test_str = "།མུ་སྟེགས་ཚོགས་རྣམས་ཐམས་ཅད་རབ་བཅོམ་སྟེ།"

In [46]:
tokenizer.decode(tokenizer.encode(test_str))

'།མུ་སྟེགས་ཚོགས་རྣམས་ཐམས་ཅད་རབ་བཅོམ་སྟེ།</s>bod_Tibt'