# Load the data

In [1]:
import os

from tqdm.auto import tqdm

In [2]:
# Read the Tibetan (.bo) and English (.en) file of every split into two flat
# lists of lines.
bo_lines, en_lines = [], []
# The data directory is the same for every split, so resolve it once.
data_dir = os.path.join(os.environ["CAI_TEMP_PATH"], "enbo_data")
for dataset_name in tqdm(["train", "valid", "test"]):
    # Decode explicitly instead of relying on the platform's default encoding,
    # which is not UTF-8 everywhere.
    # NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
    with open(os.path.join(data_dir, f"{dataset_name}.bo"), encoding="utf-8") as bo_f:
        # readlines() keeps the trailing "\n" of each line.
        bo_lines.extend(bo_f.readlines())
    with open(os.path.join(data_dir, f"{dataset_name}.en"), encoding="utf-8") as en_f:
        en_lines.extend(en_f.readlines())

  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Number of English lines collected across the train/valid/test files.
len(en_lines)

663164

In [4]:
# Concatenate all Tibetan lines into one string so its characters can be inspected.
# NOTE(review): the lines come from readlines() and already end in "\n", so
# joining with "\n" puts an empty line between entries and inflates the length
# shown below — confirm this is intended.
bo_text = "\n".join(bo_lines)
len(bo_text)

135200821

In [5]:
# Concatenate all English lines into one string so its characters can be inspected.
# NOTE(review): the lines come from readlines() and already end in "\n", so
# joining with "\n" puts an empty line between entries and inflates the length
# shown below — confirm this is intended.
en_text = "\n".join(en_lines)
len(en_text)

160258376

# Explore

In [6]:
import unicodedata

In [7]:
# Distinct characters of the Tibetan text in code-point order; sorted() accepts
# the set directly and returns a list.
bo_unique_chars = sorted(set(bo_text))

In [8]:
# Show every distinct character of the Tibetan text on one line, space-separated.
print(' '.join(bo_unique_chars))


   ་ ། ཀ ཁ ག ང ཅ ཆ ཇ ཉ ཊ ཋ ཌ ཎ ཏ ཐ ད ན པ ཕ བ མ ཙ ཚ ཛ ཝ ཞ ཟ འ ཡ ར ལ ཤ ཥ ས ཧ ཨ ཪ ཱ ི ུ ཱུ ེ ཻ ོ ཽ ཾ ཿ ྀ ྃ ྅ ྐ ྑ ྒ ྔ ྕ ྗ ྙ ྚ ྛ ྜ ྞ ྟ ྠ ྡ ྣ ྤ ྥ ྦ ྨ ྩ ྪ ྫ ྭ ྱ ྲ ླ ྴ ྵ ྶ ྷ


In [9]:
# Look up the Unicode name of the code point U+0F20.
print(unicodedata.name("\u0F20"))

TIBETAN DIGIT ZERO


In [None]:
# Print the Unicode name of every distinct character found in the Tibetan text.
for c in bo_unique_chars:
    # Passing a default makes unicodedata.name return it instead of raising
    # ValueError for characters that have no name (e.g. control characters
    # such as "\n").
    char_name = unicodedata.name(c, None)
    if char_name is not None:
        print(char_name)
    else:
        # Unnamed characters are usually invisible when printed raw, so show
        # the repr and the code point to make them identifiable.
        print(f"No name: {c!r} (U+{ord(c):04X})")

---

In [10]:
# Peek at the first ten English lines.
en_lines[:10]

['"there is another teaching: undertaking the activities of sublime beings. "when he said the letter pa, out came the statement: \'the absolute .\' "in order to know what is possible, i rejoiced in the actions of anyone who practiced without indolence. with my attention unwavering "i rejoiced in virtue, and bearing in mind the deeds of such lords among men, rejoiced.\n',
 'those who reject or denigrate these path and practices, as well as those who have animosity or resentment toward them, will always obtain all kinds of miserable bodies upon their death.\n',
 'then, naked and with loose hair, the blessed one is endowed with hands and feet that are webbed. noble son, this is the power that ensures that bodhisattvas cannot be subdued by the attacks of opponents.\n',
 '8. "noble son, the limitless greatness of the jewels is threefold: strength of intelligence will be foremost in terms of insight. he dedicated the path of the ten virtuous actions practiced by all those beings toward unsur

In [11]:
# Distinct characters of the English text in code-point order; sorted() accepts
# the set directly and returns a list.
unique_chars = sorted(set(en_text))

In [12]:
# Show every distinct character of the English text on one line, space-separated.
print(' '.join(unique_chars))


   " ' , - . 0 1 2 3 4 5 6 7 8 9 : ? a b c d e f g h i j k l m n o p q r s t u v w x y z


In [None]:
# Print the Unicode name of every distinct character found in the English text.
for c in unique_chars:
    # Passing a default makes unicodedata.name return it instead of raising
    # ValueError for characters that have no name (e.g. control characters
    # such as "\n").
    char_name = unicodedata.name(c, None)
    if char_name is not None:
        print(char_name)
    else:
        # Unnamed characters are usually invisible when printed raw, so show
        # the repr and the code point to make them identifiable.
        print(f"No name: {c!r} (U+{ord(c):04X})")

In [14]:
# Select one character of the English inventory by position and report its
# Unicode name.
to_find = unique_chars[20]
print(f"{to_find} - {unicodedata.name(to_find)}")
# Keep the (Tibetan, English) line pairs whose English side contains that
# character; zip pairs the two lists position by position.
found_lines = [
    pair
    for pair in zip(bo_lines, en_lines)
    if to_find in pair[1]
]
len(found_lines)

b - LATIN SMALL LETTER B


590802

In [15]:
# Spell out to_find as a Python unicode escape sequence.
# "\u" takes exactly four hex digits, so a character above U+FFFF needs the
# eight-digit "\U" form; characters up to U+FFFF still yield "\uXXXX".
(r'\U%08x' if ord(to_find) > 0xFFFF else r'\u%04x') % ord(to_find)

'\\u0062'

In [16]:
# Peek at the first twenty (Tibetan, English) pairs that contain the character.
found_lines[:20]

[('གཞན་ཡང་ཆོས་གཅིག་སྟེ། སྐྱེས་བུ་དམ་པའི་ལས་རྣམས་ལ་ཞུགས་པའོ།པ་ཞེས་བརྗོད་པ་དང་དོན་དམ་པའི་སྒྲ་བྱུང་ངོ་།གནས་ནི་ཤེས་པར་བྱ་བའི་ཕྱིར། །གང་དག་གཡེལ་བ་མེད་སྤྱོད་པ། །དེ་ཡི་སྤྱོད་ལ་རྗེས་ཡི་རང་། །ངའི་སེམས་རྣམ་པར་མ་གཡོས་ཤིང་། །དགེ་ལ་རྗེས་སུ་ཡི་རང་སྟེ། །མི་དབང་དེ་ཡི་སྤྱོད་པ་ལ། །བསམ་པས་རྗེས་སུ་ཡི་རང་ནས།\n',
  '"there is another teaching: undertaking the activities of sublime beings. "when he said the letter pa, out came the statement: \'the absolute .\' "in order to know what is possible, i rejoiced in the actions of anyone who practiced without indolence. with my attention unwavering "i rejoiced in virtue, and bearing in mind the deeds of such lords among men, rejoiced.\n'),
 ('གང་དག་ལམ་དེ་དང་། སྒྲུབ་པ་དེའི་ཕྱིར་སྤོང་ཞིང་མི་སྙན་པར་བརྗོད་ལ། དེ་ལ་ཡང་ཀུན་ནས་མནར་སེམས་ཀྱི་སེམས་དང་ཁོང་ཁྲོ་བའི་སེམས་དང་ལྡན་ཞིང་འཆི་བས་དུས་བྱེད་པ་དེ་དག་ནི་ཐམས་ཅད་དུ་ལུས་ངན་པ་ཐམས་ཅད་ཐོབ་པར་འགྱུར་ཏེ\n',
  'those who reject or denigrate these path and practices, as well as those who have animosity or resentment toward them, will a

In [19]:
# Scratch string with one square-bracketed and one parenthesised span, used to
# try out the string operations in the following cells.
x = "asd[asdfqwer]z(x)dc"

In [22]:
# Parity of the number of "d" characters in x (1 means the count is odd).
x.count("d") % 2

1

In [34]:
import re

In [35]:
# Matches from any opening bracket to the nearest following closing bracket.
# The bracket kinds are not paired up, so "(abc]" matches as well.
re_ = re.compile(
    "[([{]"  # one opening bracket: ( [ or {
    ".*?"    # as few characters as possible (no flags, so "." skips newlines)
    "[])}]"  # one closing bracket: ] ) or } -- a "]" placed first in a class is literal
)

In [36]:
# Delete every match of the compiled pattern from x.
re_.sub("", x)

'asdzc'

In [37]:
# Character class matching a single "a", "d" or "q".
re_ = re.compile("[adq]")

In [38]:
# Delete every match of the compiled pattern from x.
re_.sub("", x)

's[sfwer]z(x)c'

In [41]:
# Distinct characters of the literal string, as a set.
# NOTE(review): the TypeError recorded under this cell complains about an `in`
# test with a set on the left, which this expression does not perform — the
# output looks stale; re-run to confirm.
set("aaavdsf")

TypeError: 'in <string>' requires string as left operand, not set