In [1]:
# ! python -m spacy download en_core_web_trf

In [2]:
! rm -rf /tmp/dask-worker-space
! rm -rf /home/eeisenst/workspace/compassionai/garland/notebooks/dask-worker-space

# Dataset preparation

In [3]:
import os
import copy
import spacy
import pickle
import random
import logging

from tqdm.auto import tqdm
from cai_common.data import ParallelTMXLoader
from dask.distributed import Client, LocalCluster

In [4]:
dask_logger = logging.getLogger("distributed.utils_perf")
dask_logger.setLevel(logging.ERROR)

dask_client = Client(LocalCluster(
    n_workers=20,
    threads_per_worker=1
))

In [5]:
parallel_df = ParallelTMXLoader().apply_markup().clean_bad_chars().dataframe.compute()
parallel_df

Unnamed: 0,filename,tohoku,folio,position,tibetan,english
0,Toh_384-Glorious_King_of_Tantras_That_Resolves...,384,F.187.a,1,དཔལ་རྡོ་རྗེ་སེམས་དཔའ་ལ་ཕྱག་འཚལ་ལོ། །,I pay homage to Glorious Vajrasattva!
1,Toh_384-Glorious_King_of_Tantras_That_Resolves...,384,F.187.a,39,འདི་སྐད་བདག་གིས་ཐོས་པའི་དུས་གཅིག་ན། །,Thus have I heard at one time.
2,Toh_384-Glorious_King_of_Tantras_That_Resolves...,384,F.187.a,204,དེ་ནས་བྱང་ཆུབ་སེམས་དཔའ་རྡོ་རྗེ་སྙིང་པོ་ལ་སོགས་...,"Then, the entourage, including bodhisattva Vaj..."
3,Toh_384-Glorious_King_of_Tantras_That_Resolves...,384,F.187.a,322,ཕྱི་ནང་གསང་བའི་མཆོད་པས་མཆོད་ནས་འདི་སྐད་ཅེས་གསོ...,"made outer, inner, and secret offerings, and a..."
4,Toh_384-Glorious_King_of_Tantras_That_Resolves...,384,F.187.a,388,ཀྱེ་ཧོ་བཅོམ་ལྡན་རྡོ་རྗེ་འཛིན། །,O Blessed Vajra Holder!
...,...,...,...,...,...,...
19,Toh_309-The_Sutra_on_Impermanence-v1.tmx,309,F.155.b,650,ནད་མེད་མི་རྟག་ལང་ཚོ་རྟག་མ་ཡིན། །འབྱོར་པ་མི་རྟག...,"“Good health is impermanent, Youth does not la..."
20,Toh_309-The_Sutra_on_Impermanence-v1.tmx,309,F.155.b,757,སྐྱེ་བོ་མི་རྟག་ཉིད་ཀྱིས་ཉེན་གྱུར་ན། །འདོད་པའི་...,"How can beings, afflicted as they are by imper..."
21,Toh_309-The_Sutra_on_Impermanence-v1.tmx,309,F.155.b,858,བཅོམ་ལྡན་འདས་ཀྱིས་དེ་སྐད་ཅེས་བཀའ་སྩལ་ནས། དགེ་ས...,"When the Bhagavān had thus spoken, the monks r..."
22,Toh_309-The_Sutra_on_Impermanence-v1.tmx,309,F.155.b,935,མི་རྟག་པ་ཉིད་ཀྱི་མདོ་རྫོགས་སོ།། །།,This completes “The Sūtra on Impermanence.”


In [6]:
random.seed(42)

In [7]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf")

## Test

In [None]:
i = 0

In [None]:
example = parallel_df.english.iloc[i]
parsed = nlp(example)

print(example)
print()
for token in parsed:
    print(f"{token.text:<20} {token.lemma_:<20} {token.pos_}")

i += 1

## Run

In [8]:
eligible_pos = {'VERB', 'NOUN', 'ADJ', 'PROPN'}
num_positives = 3000
negative_ratio = 3

In [9]:
positive_examples = []

records = random.choices(parallel_df.to_dict(orient="records"), k=num_positives)

for record in tqdm(records):
    parsed = nlp(record['english'])
    for token in filter(lambda token: token.pos_ in eligible_pos, parsed):
        positive_examples.append({
            'source': record['tibetan'],
            'target': token.lemma_,
            'pos': token.pos_,
            'label': 1
        })

  0%|          | 0/3000 [00:00<?, ?it/s]

In [10]:
from pattern.en import NOUN, VERB, ADJECTIVE
from pattern.en.wordnet import synsets

pos_map = {
    'NOUN': NOUN,
    'PROPN': NOUN,
    'VERB': VERB,
    'ADJ': ADJECTIVE
}

In [11]:
by_tag = {tag: [] for tag in eligible_pos}

for example in tqdm(positive_examples):
    synonyms = synsets(example['target'], pos_map[example['pos']])
    if len(synonyms) == 0:
        by_tag[example['pos']].append([example['target'].lower()])
    else:
        all_synonyms = sorted(list(set([synonym.replace('_', ' ') for synonym in synonyms[0].synonyms])))
        by_tag[example['pos']].append(all_synonyms)

  0%|          | 0/20983 [00:00<?, ?it/s]

In [12]:
negative_examples = []

for example in tqdm(positive_examples):
    blacked_out_targets = {example['target']}
    for _ in range(negative_ratio):
        example = copy.deepcopy(example)
        while True:
            target = random.choice(
                random.choice(
                    by_tag[example['pos']]
                )    # First choose a meaning
            )        #     ...then choose a synonym
            if not target in blacked_out_targets:
                break
        example['target'] = target
        example['label'] = 0
        blacked_out_targets.add(target)
        negative_examples.append(example)

  0%|          | 0/20983 [00:00<?, ?it/s]

In [13]:
final_dataset = positive_examples + negative_examples
len(final_dataset)

83932

In [14]:
dataset_fn = os.path.join(os.environ['CAI_TEMP_PATH'], "temp_data/aligner_dataset.pkl")
dataset_fn

'/home/eeisenst/workspace/temp/temp_data/aligner_dataset.pkl'

In [15]:
with open(dataset_fn, 'wb') as f:
    pickle.dump(final_dataset, f)