In [1]:
!pip3 install OpenNMT-py==1.2.0

Defaulting to user installation because normal site-packages is not writeable


In [37]:
import requests

def get_forms(sysID, lang, lemma_tags_set):
    """lemma_tags_set: `[{"lemma": "go", "tags": "V;PST"}, ...]`"""
    
    request_data = {
        "sysID": sysID,
        "lang": lang,
        "data": lemma_tags_set
    }
    response = requests.post('https://test2.kurdinus.com/oracle/GetForms', json=request_data)
    print(response)
    if response.status_code == 200:
        return response.content.decode()
    else:
        return "#FAILED: " + str(response.status_code)

In [38]:
def check_forms(sysID, lang, lemma_form_tags_set):
    """lemma_forms_tags_set: `[{"lemma": "go", "form": "goed", "tags": "V;PST"}, ...]`"""
    
    request_data = {
        "sysID": sysID,
        "lang": lang,
        "data": lemma_form_tags_set
    }
    response = requests.post('https://test2.kurdinus.com/oracle/CheckForms', json=request_data)
    print(response)
    if response.status_code == 200:
        return response.content.decode()
    else:
        return "#FAILED: " + str(response.status_code)

In [78]:
import pandas as pd
import random

file = pd.read_csv('lat.tsv', sep='\t', header=None).values.tolist()

def get_data(rows, function):
    """rows: `["lemma\\t form(if checking data)\\t tags", ...]`
    output: `["lemma\\t form\\t tags", ...]`"""

    lemma_forms_tags_set = []
    for row in rows:
        if row in file: file.remove(row)
        lemma_forms_tags_set.append({"lemma": row[0], "form": row[1], "tags": row[-1]})

    oracle_data = function("fumo_iteration0", "lat", lemma_forms_tags_set).split('\n')[:-1]
    return oracle_data

In [63]:

rows = random.sample(file, 1000)

# split 90-10 into train and dev
oracle_data = get_data(rows, get_forms)
train_data = oracle_data[:int(len(oracle_data)*0.9)]
dev_data = oracle_data[int(len(oracle_data)*0.9):]
print(oracle_data[:5])

<Response [200]>
['olēscō\tolēscēmus\tV;IND;ACT;FUT;1;PL', 'veneror\tvenerābāminī\tV;IND;ACT;PST;IPFV;2;PL', 'haveō\thaventur\tV;IND;PASS;PRS;3;PL', 'maereō\tmaerēbant\tV;IND;ACT;PST;IPFV;3;PL', 'polliceor\tpollicēris\tV;IND;ACT;PRS;2;SG']


In [64]:
def create_data_files(name, data):
    """data: `["lemma\\t form\\t tags", ...]`
    output: `name.src`, `name.tgt` files with the data in this format: `g o # V PST` and `w e n t`"""
    train_src = open(f'data/{name}.src', 'w')
    train_tgt = open(f'data/{name}.tgt', 'w')
    for result in data:
        if type(result) == str: result = result.split('\t')
        if len(result) == 2:
            lemma, tags = result
            word = ''
        else:
            lemma, word, tags = result
        src = ' '.join(list(lemma)) + " # " + ' '.join(tags.split(';'))
        tgt = ' '.join(list(word))
        train_src.write(src + '\n')
        train_tgt.write(tgt + '\n')
    train_src.close()
    train_tgt.close()

In [65]:
create_data_files('train', train_data)
create_data_files('dev', dev_data)

In [67]:
!onmt_preprocess -train_src data/train.src -train_tgt data/train.tgt -valid_src data/dev.src -valid_tgt data/dev.tgt -save_data run/data -overwrite

[2024-05-08 13:26:57,513 INFO] Extracting features...
[2024-05-08 13:26:57,603 INFO]  * number of source features: 0.
[2024-05-08 13:26:57,603 INFO]  * number of target features: 0.
[2024-05-08 13:26:57,603 INFO] Building `Fields` object...
[2024-05-08 13:26:57,604 INFO] Building & saving training data...
[2024-05-08 13:26:57,713 INFO] Building shard 0.
[2024-05-08 13:26:57,760 INFO]  * saving 0th train data shard to run/data.train.0.pt.
[2024-05-08 13:26:58,048 INFO]  * tgt vocab size: 35.
[2024-05-08 13:26:58,049 INFO]  * src vocab size: 47.
[2024-05-08 13:26:58,064 INFO] Building & saving validation data...
[2024-05-08 13:26:58,674 INFO] Building shard 0.
[2024-05-08 13:26:58,676 INFO]  * saving 0th valid data shard to run/data.valid.0.pt.


In [68]:
!onmt_train -data run/data -save_model run/model -encoder_type rnn -rnn_type LSTM -rnn_size 128 -layers 1 -word_vec_size 128 -save_checkpoint_steps 200 -valid_steps 200 -early_stopping 2

In [69]:
%%capture cap2 --no-stderr
!onmt_translate -model run/model_step_800.pt -src data/dev.src -output data/dev.hyp -replace_unk -verbose
with open('pred_scores.log', 'w') as f:
    f.write(cap2.stdout)

In [70]:
!julia evaluate.jl data/dev.tgt data/dev.hyp

Accuracy: 53 / 100   0.53
Character edit distance 1.31
trānstinēbimur -> trānstinēmur
apolactizābuntur -> aptizābuntur
postībimus -> postīmus
dēpraedābimur -> dēpraedēpāmur
heiulābimur -> heilābimur
populāberis -> pulāberis
praeolēbimus -> praexolēmus
oblīvīscuntur -> oblīsciuntur
belligerābiminī -> bellgībiminī
stupēscēmus -> stēscēmus
extollō -> extoll
dēpangō -> dēnsō
patrizāmur -> patrīmur
maerētis -> maerātis
dīvergēs -> dīvēs
aggeniculābāris -> aggēbāris
obsolefīet -> obsolētut
dēmorābimur -> dēmāmur
ēlūctābar -> ēlūcēbar
temnēmur -> temnāmur
māchināmur -> māchimur
cōmissābāmur -> cōssābāmur
postībō -> postīmus
obmordēbitis -> obūbētis
exequuntur -> exexuntur
argūtātur -> argūtar
adgaudent -> adgaudēs
dīlargīris -> dīlargieris
conquereris -> conqueror
sermōcinābāminī -> serminābāminī
superbīmus -> superbimus
expiscābitur -> expiētur
deōsculāminī -> dēsciminī
adfremis -> adfrēs
sortiar -> sortābor
oblīvīscēris -> oblīscēris
nepōtābāris -> nēbāris
petissiminī -> petissāminī
subvolv

In [71]:
import re

def get_sorted(text = None):
    if text==None: 
        with open('pred_scores.log', 'r') as file:
            lines = file.readlines()
    else: lines = text.split('\n')

    data = []
    lemma = ''
    tags = ''
    score = 0
    for i in range(len(lines)):
        if lines[i].startswith('SENT'):
            lemma_tags = re.findall(r'\[\'(.*)\'\]', lines[i])[0]
            lemma, tags = lemma_tags.split('#')
            lemma = lemma.replace("'", '').replace(', ', '')
            tags = tags.replace("'", '').replace(', ', ';')[1:]
        elif lines[i].startswith('PRED SCORE'):
            score = float(lines[i].split(':')[1].strip())
        elif lines[i].startswith('PRED'):
            prediction = lines[i].split(':')[1].strip().replace(' ', '')
            data.append((lemma, prediction, tags, score))

    data.sort(key=lambda x: x[3])
    return data

In [73]:
%%capture cap2 --no-stderr
create_data_files("test", file)
!onmt_translate -model run/model_step_800.pt -src data/test.src -output data/test.hyp -replace_unk -verbose
with open('pred_scores.log', 'w') as f:
    f.write(cap2.stdout)

data = get_sorted()
data = [x[:-1] for x in data]

most_confident = data[-100:]
least_confident = data[:300]
most_confident_results = get_data(most_confident, check_forms)[:-1]
least_confident_results = get_data(least_confident, check_forms)[:-1]

most_confident_incorrect = []
for i, result in enumerate(most_confident_results):
    if result.split('\t')[1] != most_confident[i][1]:
        most_confident_incorrect.append(result)
least_confident_incorrect = []
for i, result in enumerate(least_confident_results):
    if result.split('\t')[1] != least_confident[i][1]:
        least_confident_incorrect.append(result)

incorrect = most_confident_incorrect + least_confident_incorrect
train_data += [x for x in incorrect if x not in data]
create_data_files('train', train_data)

!onmt_preprocess -train_src data/train.src -train_tgt data/train.tgt -valid_src data/dev.src -valid_tgt data/dev.tgt -save_data run/data -overwrite
!onmt_train -data run/data -save_model run/model -encoder_type rnn -rnn_type LSTM -rnn_size 128 -layers 1 -word_vec_size 128 -save_checkpoint_steps 200 -valid_steps 200 -early_stopping 2

In [80]:
len(least_confident), len(least_confident_incorrect)

(300, 281)

In [75]:
%%capture cap2 --no-stderr
create_data_files("test", file)
!onmt_translate -model run/model_step_800.pt -src data/test.src -output data/test.hyp -replace_unk -verbose
with open('pred_scores.log', 'w') as f:
    f.write(cap2.stdout)

data = get_sorted()
data = [x[:-1] for x in data]

most_confident = data[-100:]
least_confident = data[:300]
most_confident_results = get_data(most_confident, check_forms)[:-1]
least_confident_results = get_data(least_confident, check_forms)[:-1]

most_confident_incorrect = []
for i, result in enumerate(most_confident_results):
    if result.split('\t')[1] != most_confident[i][1]:
        most_confident_incorrect.append(result)
least_confident_incorrect = []
for i, result in enumerate(least_confident_results):
    if result.split('\t')[1] != least_confident[i][1]:
        least_confident_incorrect.append(result)

incorrect = most_confident_incorrect + least_confident_incorrect
train_data += [x for x in incorrect if x not in data]
create_data_files('train', train_data)

!onmt_preprocess -train_src data/train.src -train_tgt data/train.tgt -valid_src data/dev.src -valid_tgt data/dev.tgt -save_data run/data -overwrite
!onmt_train -data run/data -save_model run/model -encoder_type rnn -rnn_type LSTM -rnn_size 128 -layers 1 -word_vec_size 128 -save_checkpoint_steps 200 -valid_steps 200 -early_stopping 2

In [None]:
least_confident

In [53]:
%%capture cap2 --no-stderr
create_data_files("test", file)
!onmt_translate -model run/model_step_800.pt -src data/test.src -output data/test.hyp -replace_unk -verbose
with open('pred_scores.log', 'w') as f:
    f.write(cap2.stdout)

data = get_sorted()
data = [x[:-1] for x in data]

most_confident = data[-100:]
least_confident = data[:300]
most_confident_results = get_data(most_confident, check_forms)[:-1]
least_confident_results = get_data(least_confident, check_forms)[:-1]

most_confident_incorrect = []
for i, result in enumerate(most_confident_results):
    if result.split('\t')[1] != most_confident[i][1]:
        most_confident_incorrect.append(result)
least_confident_incorrect = []
for i, result in enumerate(least_confident_results):
    if result.split('\t')[1] != least_confident[i][1]:
        least_confident_incorrect.append(result)

incorrect = most_confident_incorrect + least_confident_incorrect
train_data += [x for x in incorrect if x not in data]
create_data_files('train', train_data)

!onmt_preprocess -train_src data/train.src -train_tgt data/train.tgt -valid_src data/dev.src -valid_tgt data/dev.tgt -save_data run/data -overwrite
!onmt_train -data run/data -save_model run/model -encoder_type rnn -rnn_type LSTM -rnn_size 128 -layers 1 -word_vec_size 128 -save_checkpoint_steps 200 -valid_steps 200 -early_stopping 2

In [54]:
least_confident

[('fēteō', 'fētēmur', 'V;IND;PASS;PRS;1;PL'),
 ('antepolleō', 'antepollēbit', 'V;IND;ACT;FUT;3;SG'),
 ('masturbor', 'mastur', 'V;IND;ACT;PRS;3;SG'),
 ('quadripartiō', 'quadripartipartipābāris', 'V;IND;PASS;PST;IPFV;2;SG'),
 ('masturbor', 'mastur', 'V;IND;ACT;PRS;3;PL'),
 ('ōrdior', 'ōriāris', 'V;IND;ACT;PRS;2;SG'),
 ('auxiliō', 'auciliābāmus', 'V;IND;ACT;PST;IPFV;1;PL'),
 ('recommoneō', 'recommormēbant', 'V;IND;ACT;PST;IPFV;3;PL'),
 ('ōrdior', 'rōiāberis', 'V;IND;ACT;FUT;2;SG'),
 ('ōrdior', 'ōriābimur', 'V;IND;ACT;FUT;1;PL'),
 ('attolerō', 'attābāris', 'V;IND;PASS;PST;IPFV;2;SG'),
 ('antepolleō', 'antepollēbimus', 'V;IND;ACT;FUT;1;PL'),
 ('recommoneō', 'recommorat', 'V;IND;ACT;PRS;3;SG'),
 ('supergredior', 'supergiēmur', 'V;IND;ACT;FUT;1;PL'),
 ('recommoneō', 'recommormēbunt', 'V;IND;ACT;FUT;3;PL'),
 ('diffīdō', 'diffīdētis', 'V;IND;ACT;PRS;2;PL'),
 ('attollō', 'atttābam', 'V;IND;ACT;PST;IPFV;1;SG'),
 ('stinguō', 'stingingētis', 'V;IND;ACT;FUT;2;PL'),
 ('coniūcundor', 'coniūcumābar', '

In [55]:
%%capture cap2 --no-stderr
create_data_files("test", file)
!onmt_translate -model run/model_step_800.pt -src data/test.src -output data/test.hyp -replace_unk -verbose
with open('pred_scores.log', 'w') as f:
    f.write(cap2.stdout)

data = get_sorted()
data = [x[:-1] for x in data]

most_confident = data[-100:]
least_confident = data[:300]
most_confident_results = get_data(most_confident, check_forms)[:-1]
least_confident_results = get_data(least_confident, check_forms)[:-1]

most_confident_incorrect = []
for i, result in enumerate(most_confident_results):
    if result.split('\t')[1] != most_confident[i][1]:
        most_confident_incorrect.append(result)
least_confident_incorrect = []
for i, result in enumerate(least_confident_results):
    if result.split('\t')[1] != least_confident[i][1]:
        least_confident_incorrect.append(result)

incorrect = most_confident_incorrect + least_confident_incorrect
train_data += [x for x in incorrect if x not in data]
create_data_files('train', train_data)

!onmt_preprocess -train_src data/train.src -train_tgt data/train.tgt -valid_src data/dev.src -valid_tgt data/dev.tgt -save_data run/data -overwrite
!onmt_train -data run/data -save_model run/model -encoder_type rnn -rnn_type LSTM -rnn_size 128 -layers 1 -word_vec_size 128 -save_checkpoint_steps 200 -valid_steps 200 -early_stopping 2

In [56]:
least_confident

[('trānspiciō', 'trānsicicicābāmur', 'V;IND;PASS;PST;IPFV;1;PL'),
 ('trānspiciō', 'trānsicicicicicicicicicicābor', 'V;IND;PASS;FUT;1;SG'),
 ('trānspiciō', 'trānsicicicicicicicicābantur', 'V;IND;PASS;PST;IPFV;3;PL'),
 ('trānspiciō', 'trānsicicicicicicicicicābāris', 'V;IND;PASS;PST;IPFV;2;SG'),
 ('trānspiciō', 'trānsicicicicicicicicicābāris', 'V;IND;PASS;PST;IPFV;2;SG'),
 ('trānspiciō', 'trānsicicicicicicicicābātur', 'V;IND;PASS;PST;IPFV;3;SG'),
 ('trānspiciō', 'trānsicicicicicicicicicinābuntur', 'V;IND;PASS;FUT;3;PL'),
 ('trānspiciō', 'trānsicicicicicābimur', 'V;IND;PASS;FUT;1;PL'),
 ('trānspiciō', 'trānsicicicicicicicicicicābitur', 'V;IND;PASS;FUT;3;SG'),
 ('trānspiciō', 'trānsicicicicicicicicicāberis', 'V;IND;PASS;FUT;2;SG'),
 ('auxiliō', 'auniliābit', 'V;IND;ACT;FUT;3;SG'),
 ('trānspiciō', 'trānsicicimur', 'V;IND;PASS;PRS;1;PL'),
 ('trānspiciō', 'trānsicicicicicicicicicābāminī', 'V;IND;PASS;PST;IPFV;2;PL'),
 ('plipiō', 'plipipipiēbam', 'V;IND;ACT;PST;IPFV;1;SG'),
 ('adloquor', 'adloq

In [77]:
import subprocess
# now predict all data in `file`
# with open('pred_scores.log', 'w') as f:
#     f.write('')

# print(len(file)//100)
# for i in range(len(file)//100):
#     create_data_files("test", file[100*i : len(file) if 100*(i+1)>len(file) else 100*(i+1)])
#     command = "onmt_translate -model run/model_step_800.pt -src data/test.src -output data/test.hyp -replace_unk -verbose"
#     process = subprocess.run(command, shell=True)
#     data = process.stdout
#     data = [x[:-1] for x in get_sorted(data)]
#     print(len(data))
#     results += get_data(data, check_forms)
create_data_files("test", file)
command = "onmt_translate -model run/model_step_800.pt -src data/test.src -output data/test.hyp -replace_unk -verbose"
process = subprocess.run(command, shell=True)
data = process.stdout
data = [x[:-1] for x in get_sorted(data)]
results = get_data(data, check_forms)


[2024-05-09 01:37:41,631 INFO] Translating shard 0.
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
[2024-05-09 01:37:42,179 INFO] 
SENT 1: ['i', 'm', 'i', 't', 'o', 'r', '#', 'V', 'IND', 'ACT', 'PRS', '1', 'SG']
PRED 1: i m i t o r
PRED SCORE: -0.0066

[2024-05-09 01:37:42,181 INFO] 
SENT 2: ['i', 'm', 'i', 't', 'o', 'r', '#', 'V', 'IND', 'ACT', 'PRS', '2', 'SG']
PRED 2: i m i t ā r i s
PRED SCORE: -0.0692

[2024-05-09 01:37:42,184 INFO] 
SENT 3: ['i', 'm', 'i', 't', 'o', 'r', '#', 'V', 'IND', 'ACT', 'PRS', '2', 'SG']
PRED 3: i m i t ā r i s
PRED SCORE: -0.0692

[2024-05-09 01:37:42,186 INFO] 
SENT 4: ['i', 'm', 'i', 't', 'o', 'r', '#', 'V', 'IND', 'ACT', 'PRS', '3', 'SG']
PRED 4: i m i t ā t u r
PRED SCORE: -0.7012

[2024-05-09 01:37:42,186 INFO] 
SENT 5: ['i', 'm', 'i', 't', 'o', 'r', '#', 'V', 'IND', 'ACT', 'PRS', '1', 'PL']
PRED 5: i m i t ā m u r
PRED SCORE: -0.0545

[2024-05-09 01:37:42,187 INFO] 
SENT 6: ['i', 'm', 'i', 't', 'o', 'r', '#', 'V', 'IND', 'AC

<Response [200]>


In [58]:
with open('predictions', 'w') as f:
    for row in results:
        f.write(row + '\n')

In [59]:
results

['trānspiciō\ttrānspiciēbāmur\tV;IND;PASS;PST;IPFV;1;PL',
 'trānspiciō\ttrānspiciar\tV;IND;PASS;FUT;1;SG',
 'trānspiciō\ttrānspiciēbantur\tV;IND;PASS;PST;IPFV;3;PL',
 'trānspiciō\ttrānspiciēbāris\tV;IND;PASS;PST;IPFV;2;SG',
 'trānspiciō\ttrānspiciēbāris\tV;IND;PASS;PST;IPFV;2;SG',
 'trānspiciō\ttrānspiciēbātur\tV;IND;PASS;PST;IPFV;3;SG',
 'trānspiciō\ttrānspicientur\tV;IND;PASS;FUT;3;PL',
 'trānspiciō\ttrānspiciēmur\tV;IND;PASS;FUT;1;PL',
 'trānspiciō\ttrānspiciētur\tV;IND;PASS;FUT;3;SG',
 'trānspiciō\ttrānspiciēris\tV;IND;PASS;FUT;2;SG',
 'auxiliō\tauxiliābit\tV;IND;ACT;FUT;3;SG',
 'trānspiciō\ttrānspicimur\tV;IND;PASS;PRS;1;PL',
 'trānspiciō\ttrānspiciēbāminī\tV;IND;PASS;PST;IPFV;2;PL',
 'plipiō\tplipiābam\tV;IND;ACT;PST;IPFV;1;SG',
 'adloquor\tadloquor\tV;IND;ACT;PRS;1;SG',
 'lipiō\tlipiet\tV;IND;ACT;FUT;3;SG',
 'circumvehor\tcircumveheris\tV;IND;ACT;PRS;2;SG',
 'circumvehor\tcircumvehitur\tV;IND;ACT;PRS;3;SG',
 'micciō\tmicciēmus\tV;IND;ACT;FUT;1;PL',
 'cōnsipiō\tcōnsipiēs\tV;IND;A

In [60]:
!curl -X 'POST' \
  'https://test2.kurdinus.com/Oracle/GetAnalysis?sysID=fumo&lang=lat' \
  -H 'accept: */*' \
  -H 'Content-Type: multipart/form-data' \
  -F 'file=@predictions'