In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to the empty string so that libraries imported
# later in this notebook see no CUDA devices.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
from glob import glob

def checkpoint_step(path):
    """Sort key that orders `checkpoint-<step>` paths by their numeric step.

    Returns a `(step, path)` tuple. A path whose text after the last '-' is
    not a decimal number gets step -1, so it sorts first instead of raising.
    """
    suffix = path.rsplit('-', 1)[-1]
    step = int(suffix) if suffix.isdecimal() else -1
    return step, path


# Plain string sorting would put e.g. 'checkpoint-90000' after
# 'checkpoint-190000'. Ordering by the integer step guarantees that
# `checkpoints[-1]` is the highest-step checkpoint.
checkpoints = sorted(
    glob('finetune-t5-small-noisy-bahasa-cased/checkpoint-*'),
    key = checkpoint_step,
)
checkpoints

['finetune-t5-small-noisy-bahasa-cased/checkpoint-150000',
 'finetune-t5-small-noisy-bahasa-cased/checkpoint-160000',
 'finetune-t5-small-noisy-bahasa-cased/checkpoint-170000',
 'finetune-t5-small-noisy-bahasa-cased/checkpoint-180000',
 'finetune-t5-small-noisy-bahasa-cased/checkpoint-190000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer is loaded from the published base model name, while the
# weights come from the last entry of the `checkpoints` list built above.
TOKENIZER_NAME = 'mesolitica/t5-small-standard-bahasa-cased'
last_checkpoint = checkpoints[-1]

tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_NAME)
model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)

In [4]:
# Smoke test: one code-mixed sentence with the Malay -> English task prefix.
prompt = 'terjemah Melayu ke Inggeris: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
# Special tokens are not stripped here, so they appear in the printed text.
translated = tokenizer.decode(outputs[0])
print(translated)

<pad> Hi guys! I noticed yesterday and today many have got these cookies, right? So today I want to share some of our first batch of mortem posts:</s>


In [5]:
# Smoke test: the same code-mixed sentence with the English -> Malay task prefix.
prompt = 'terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
# Special tokens are not stripped here, so they appear in the printed text.
translated = tokenizer.decode(outputs[0])
print(translated)

<pad> Hai kawan-kawan! Saya perhatikan semalam & harini dah ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:</s>


In [6]:
# Informal / code-mixed samples to translate Malay -> English.
strings = [
    'ak tak paham la',
    'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',
    'Jadi haram jadah😀😃🤭',
    'nak gi mana tuu',
    'Macam nak ambil half day',
    'jadi aku tadi bikin ini gengs dan dijual haha salad only k dan haha drinks only k',
    'nanti aku tengok dulu tiket dari Kuala Lumpur pukul berapa ada ya',
    "Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.",
]

# Every sample gets the same task prefix and is generated on its own.
task_prefix = 'terjemah Melayu ke Inggeris: '
for sentence in strings:
    input_ids = tokenizer.encode(task_prefix + sentence, return_tensors = 'pt')
    outputs = model.generate(input_ids, max_length = 100)
    decoded = tokenizer.decode(outputs[0])
    print(decoded)

<pad> I don't understand la</s>
<pad> At 8 o'clock in the market, it's a great place to choose.</s>
<pad> So it's a fucking shit.</s>
<pad> Where do you want to go?</s>
<pad> It's like taking half a day</s>
<pad> I've been making this gengs and sold haha salad only k and haha drinks only k.</s>
<pad> I'll see what tickets from Kuala Lumpur are at.</s>
<pad> Imagine PH and win pru-14. There are many back doorways. Last-last Ismail Sabri goes up. That's why I don't give a fk about politics anymore. I swear I'm fk up.</s>


In [7]:
# Informal / code-mixed samples to translate English -> Malay.
strings = [
    'u ni, talk properly lah',
    "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍",
    'Me after seeing this video: mm dapnya burger benjo extra mayo',
    'power lah even shopback come to edmw riao',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
]

# Every sample gets the same task prefix and is generated on its own.
task_prefix = 'terjemah Inggeris ke Melayu: '
for sentence in strings:
    input_ids = tokenizer.encode(task_prefix + sentence, return_tensors = 'pt')
    outputs = model.generate(input_ids, max_length = 100)
    decoded = tokenizer.decode(outputs[0])
    print(decoded)

<pad> u ni, bercakap lah</s>
<pad> baru sahaja menghadiri majlis perkahwinan sepupu saya. pelik jugak dia buat biasa2 je sebab gaya hidupnya kelihatan mewah. kemudian saya dapati mereka akan berbulan madu 3 minggu. keputusan pintar <unk> </s>
<pad> Saya selepas melihat video ini: saya dapnya burger benjo extra mayo</s>
<pad> power lah even shopback datang edmw riao</s>
<pad> Hai kawan-kawan! Saya perhatikan semalam & harini dah ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:</s>


In [None]:
# Publish the loaded model under the `mesolitica` organization.
# NOTE(review): newer transformers releases may expect 'org/name' as the repo
# id instead of the `organization` keyword — confirm against the installed version.
hub_repo_name = 'finetune-noisy-translation-t5-small-bahasa-cased'
model.push_to_hub(hub_repo_name, organization='mesolitica')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443


In [None]:
# Publish the tokenizer under the same repository name and organization as
# the model, so both can be loaded from one repo id.
hub_repo_name = 'finetune-noisy-translation-t5-small-bahasa-cased'
tokenizer.push_to_hub(hub_repo_name, organization='mesolitica')

In [None]:
!cp -r finetune-t5-tiny-noisy-bahasa-cased/runs finetune-noisy-translation-t5-tiny-bahasa-cased
!cd finetune-noisy-translation-t5-tiny-bahasa-cased && git add . && git commit -m 'add tensorboard' && git push

In [8]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Corpus-level metrics reused by the scoring cells below: BLEU with default
# settings and chrF with word bigrams enabled (word_order = 2).
bleu, chrf = BLEU(), CHRF(word_order = 2)

In [9]:
from unidecode import unidecode
import json

# The test set is stored as one JSON object per non-empty line.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open('test-noisy-shuffled.json', encoding = 'utf-8') as fopen:
    test = [json.loads(line) for line in fopen.read().split('\n') if len(line)]

len(test)

6854

In [10]:
from tqdm import tqdm

# Number of examples sent to `model.generate` per call. The default of 1
# reproduces the original one-example-at-a-time run; larger values batch
# several prompts together.
batch_size = 1

# Hypotheses and their references, split by translation direction.
results_en_ms, filtered_right_en_ms = [], []
results_ms_en, filtered_right_ms_en = [], []
for start in tqdm(range(0, len(test), batch_size)):
    batch = [row['translation'] for row in test[start: start + batch_size]]

    # Encode each "<prefix><src>" prompt on its own, then pad the batch to
    # its longest prompt.
    # NOTE(review): assumes `tokenizer.pad` also returns an attention mask so
    # padded positions are ignored — confirm before raising `batch_size`.
    features = [
        {'input_ids': tokenizer.encode(f"{t['prefix']}{t['src']}", return_tensors = 'pt')[0]}
        for t in batch
    ]
    padded = tokenizer.pad(features, padding = 'longest')
    outputs = model.generate(**padded, max_length = 1000)

    for t, generated in zip(batch, outputs):
        o = tokenizer.decode(generated, skip_special_tokens=True)
        # An empty hypothesis is dropped together with its reference, so it
        # does not take part in the corpus scores computed later.
        if not len(o):
            continue
        # The task prefix tells which direction this example belongs to.
        if 'Inggeris ke Melayu' in t['prefix']:
            results_en_ms.append(o)
            filtered_right_en_ms.append(t['tgt'])
        else:
            results_ms_en.append(o)
            filtered_right_ms_en.append(t['tgt'])

100%|████████████████████████████████████████████████████████████████████████████████████████████| 6854/6854 [25:43<00:00,  4.44it/s]


In [11]:
# Number of scored examples per direction: (English -> Malay, Malay -> English).
tuple(map(len, (results_en_ms, results_ms_en)))

(2937, 3917)

In [12]:
# English -> Malay: corpus BLEU and chrF against a single reference stream.
# `sys` here is the list of hypotheses, not the standard-library module.
sys, refs = results_en_ms, [filtered_right_en_ms]
r = bleu.corpus_score(sys, refs)
(vars(r), chrf.corpus_score(sys, refs))

({'name': 'BLEU',
  'score': 41.15794003172596,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 hyp_len = 63689 ref_len = 64473)',
  'bp': 0.9877656378545313,
  'counts': [45968, 29622, 19968, 13610],
  'totals': [63689, 60752, 57815, 54878],
  'sys_len': 63689,
  'ref_len': 64473,
  'precisions': [72.17572893278275,
   48.758888596260206,
   34.537749718931074,
   24.800466489303545],
  'prec_str': '72.2/48.8/34.5/24.8',
  'ratio': 0.9878398709537326},
 chrF2++ = 65.51)

In [13]:
# Malay -> English: corpus BLEU and chrF against a single reference stream.
# `sys` here is the list of hypotheses, not the standard-library module.
sys, refs = results_ms_en, [filtered_right_ms_en]
r = bleu.corpus_score(sys, refs)
(vars(r), chrf.corpus_score(sys, refs))

({'name': 'BLEU',
  'score': 41.83407099646298,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 hyp_len = 91952 ref_len = 92985)',
  'bp': 0.9888287449603974,
  'counts': [65929, 42830, 29766, 20815],
  'totals': [91952, 88035, 84118, 80201],
  'sys_len': 91952,
  'ref_len': 92985,
  'precisions': [71.6993648860275,
   48.65110467427728,
   35.386005373404025,
   25.95354172641239],
  'prec_str': '71.7/48.7/35.4/26.0',
  'ratio': 0.9888906812926817},
 chrF2++ = 64.52)