In [1]:
import os

# Restrict this kernel to a single GPU. The variable is set before the
# deep-learning imports below so it is already in place when they load.
GPU_INDEX = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_INDEX

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer shared by every cell below.
# NOTE(review): this is the "small" tokenizer repo while the checkpoints loaded
# later are named "base" -- presumably both share one vocabulary; confirm.
TOKENIZER_REPO = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_REPO)

In [3]:
from glob import glob

def checkpoint_step(path):
    """Return the training step encoded in a ``.../checkpoint-<step>`` path.

    Paths whose suffix is not purely numeric get ``-1`` so they sort before
    every real step instead of raising.
    """
    suffix = path.rstrip('/\\').rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Order by numeric step rather than by string: a plain sorted() is
# lexicographic, so 'checkpoint-1000000' would land before 'checkpoint-640000'
# and checkpoints[-1] would no longer be the most recent checkpoint.
checkpoints = sorted(
    glob('finetune-t5-base-standard-bahasa-cased-combined/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-base-standard-bahasa-cased-combined/checkpoint-640000',
 'finetune-t5-base-standard-bahasa-cased-combined/checkpoint-650000',
 'finetune-t5-base-standard-bahasa-cased-combined/checkpoint-660000']

In [4]:
# Load the fine-tuned weights from the last entry of `checkpoints`.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [5]:
# model2 = T5ForConditionalGeneration.from_pretrained('mesolitica/finetune-translation-t5-small-standard-bahasa-cased')

In [6]:
# Malay -> English on a code-switched social-media post.
prompt = 'terjemah Melayu ke Inggeris: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))


<pad> Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:</s>


In [7]:
# Malay -> English on a short question that contains a brand name and "asr".
prompt = 'terjemah Melayu ke Inggeris: mesolitica boleh buat asr tak'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))


<pad> Can you solve it?</s>


In [8]:
# Malay -> English on informal text with abbreviations ("fb", "xleh", "ss").
prompt = 'terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. Kalau xleh aku ss'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))


<pad> Brother, I want to copy it on Facebook.. haha. If you can't, I'll</s>


In [9]:
# "pasar Melayu" (informal) -> standard Malay on a code-switched sentence.
prompt = "terjemah pasar Melayu ke Melayu: Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh."
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))

<pad> Sesungguhnya. Ini bukan pakar, saya tahu. Ia isyarat, bodoh.</s>


In [10]:
# Sample three informal ("pasar Melayu") renderings of a standard-Malay sentence.
# Sampling is stochastic, so the RNGs are seeded first: re-running this cell
# (or the whole notebook) then yields the same three candidates.
from transformers import set_seed

set_seed(42)
input_ids = tokenizer.encode("terjemah Melayu ke pasar Melayu: rakyat memang tak suka awak pun", return_tensors='pt')
outputs = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3,
)
# Decoded without skip_special_tokens, so <pad>/</s> markers (including the
# padding between sequences of different length) appear in the printed list.
print(tokenizer.batch_decode(outputs))

['<pad> rakyat mmg x suka ko pun</s> <pad>', '<pad> org melaka pun tak suka kau</s>', '<pad> rakyat memang x suka kau pun</s> <pad> <pad>']


In [11]:
# Sample three informal ("pasar Melayu") renderings of a formal news sentence.
# Sampling is stochastic, so the RNGs are seeded first: re-running this cell
# (or the whole notebook) then yields the same three candidates.
from transformers import set_seed

set_seed(42)
input_ids = tokenizer.encode("terjemah Melayu ke pasar Melayu: Perbincangan khas itu juga bertujuan bagi Seri Paduka mendapat pandangan Raja-Raja Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat", return_tensors='pt')
outputs = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3,
)
# The last expression is the cell output: the decoded candidates without
# special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Perbincangan khas juga bertujuan untuk Seri Paduka mendapat pandangan Raja2 Melayu bagi membolehkan Baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan Negara serta Rakyat',
 'Perbincangan khas juga bertujuan agar Baginda mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara dan rakyat',
 'Perbincangan khas itu juga bertujuan utk baginda mendapat pandangan Raja2 Melayu utk membolehkan baginda membuat keputusan terbaik demi kepentingan dan kesejahteraan negara dan rakyat']

In [12]:
# Sample three English -> "pasar Melayu" renderings of a cover-letter excerpt.
# Sampling is stochastic, so the RNGs are seeded first: re-running this cell
# (or the whole notebook) then yields the same three candidates.
from transformers import set_seed

set_seed(42)
input_ids = tokenizer.encode('terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team', return_tensors='pt')
outputs = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3,
)
# The last expression is the cell output: the decoded candidates without
# special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang software engineer yang sangat mahir dan berpengalaman dengan kepakaran dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi tambahan yang berharga untuk pasukan anda',
 'Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan kepakaran dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi penambahan yang berharga kepada pasukan anda',
 'Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang software engineer yang sangat mahir dan berpengalaman dengan pengkhususan dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi penambahan berharga kepada pasukan anda']

In [13]:
# Translate each whitespace-separated token of `t` on its own (Malay -> English),
# sending all tokens through the model as one padded batch.
t = 'nous bleh buat asr tak?'
input_ids = []
for word in t.split():
    encoded = tokenizer.encode(f'terjemah Melayu ke Inggeris: {word}', return_tensors='pt')
    # encode() returns a batch of one; keep the single row so pad() can align rows.
    input_ids.append({'input_ids': encoded[0]})
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=50)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['nous', 'can', 'do', 'asr', 'not?']

In [14]:
# Malay -> English on the single token "asr".
prompt = 'terjemah Melayu ke Inggeris: asr'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))


<pad> asr</s>


In [15]:
# Malay -> English on a short question using the shorthand "x".
prompt = 'terjemah Melayu ke Inggeris: ada promo x?'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))


<pad> Is there a promo?</s>


In [16]:
# Malay -> English on a sentence that is cut off mid-word.
prompt = 'terjemah Melayu ke Inggeris: mesolitica bleh b'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))

<pad> mesolitica can b</s>


In [17]:
# Translate each whitespace-separated token of `t` on its own (Malay -> English),
# sending all tokens through the model as one padded batch. `t` is read again
# by the following cell, which translates the whole sentence at once.
t = 'bleh buat asr tak'
input_ids = []
for word in t.split():
    encoded = tokenizer.encode(f'terjemah Melayu ke Inggeris: {word}', return_tensors='pt')
    # encode() returns a batch of one; keep the single row so pad() can align rows.
    input_ids.append({'input_ids': encoded[0]})
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=50)
tokenizer.batch_decode(outputs, skip_special_tokens=True)


['can', 'do', 'asr', 'not']

In [18]:
# Malay -> English on the full sentence held in `t` (assigned in an earlier cell).
prompt = f'terjemah Melayu ke Inggeris: {t}'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# Decoded without skip_special_tokens, so <pad>/</s> markers are printed too.
print(tokenizer.decode(outputs[0]))

<pad> can you make asr or not?</s>


In [19]:
# Batch Malay -> English translation over a set of informal / code-switched
# sentences (slang, emoji, politics).
strings = [
    'ak tak paham la',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
    "Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.",
    'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',
    'Jadi haram jadah😀😃🤭',
    'nak gi mana tuu',
    'Macam nak ambil half day',
    "Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.",
]
input_ids = []
for sentence in strings:
    encoded = tokenizer.encode(f'terjemah Melayu ke Inggeris: {sentence}', return_tensors='pt')
    # encode() returns a batch of one; keep the single row so pad() can align rows.
    input_ids.append({'input_ids': encoded[0]})
# Pad every row to the longest prompt so the batch can be generated in one call.
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=100)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["I don't understand",
 'Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:',
 "Indeed. This doesn't bother experts, I know too. It's a gesture, stupid.",
 "at 8 o'clock at the KK market it's really crowded, he's good at choosing a place.",
 'So haram jadah',
 'where do you want to go?',
 "It's like taking half a day",
 "Imagine PAKATAN HARAPAN and win pru-14. After that, there are all kinds of back doors. Ismail Sabri went up last. That's why I don't give a fk about politics anymore. I swear it's already up."]

In [20]:
# Batch "pasar Melayu" -> standard Malay normalisation over code-switched
# social-media sentences.
strings = [
    'u ni, talk properly lah',
    "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍",
    'Me after seeing this video: mm dapnya burger benjo extra mayo',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
]
input_ids = []
for sentence in strings:
    encoded = tokenizer.encode(f'terjemah pasar Melayu ke Melayu: {sentence}', return_tensors='pt')
    # encode() returns a batch of one; keep the single row so pad() can align rows.
    input_ids.append({'input_ids': encoded[0]})
# Pad every row to the longest prompt so the batch can be generated in one call.
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=100)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Ini awak, cakap betul-betul',
 'baru menghadiri majlis perkahwinan sepupu saya. Peliknya dia hanya mengadakan majlis biasa kerana gaya hidupnya kelihatan mewah. kemudian saya mendapat tahu mereka akan pergi pada bulan madu selama 3 minggu. keputusan yang bijak',
 'Saya selepas melihat video ini: burger benjo extra mayo memang sedap',
 'Hai kawan-kawan! Saya perasan semalam & hari ini ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']

In [21]:
# Upload the model weights to the Hugging Face Hub. The organisation is given
# as the namespace part of the repo id; the separate `organization=` keyword is
# deprecated in Transformers in favour of this form.
model.push_to_hub('mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2')



CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2/commit/569bdfc042a26d6b4f35b7c0ce6cb977d5a799ac', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='569bdfc042a26d6b4f35b7c0ce6cb977d5a799ac', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Upload the tokenizer files to the same Hub repository as the model. The
# organisation is given as the namespace part of the repo id; the separate
# `organization=` keyword is deprecated in Transformers in favour of this form.
tokenizer.push_to_hub('mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2/commit/565abb8debe26ead2a002c5ad8cf37c015f94f42', commit_message='Upload tokenizer', commit_description='', oid='565abb8debe26ead2a002c5ad8cf37c015f94f42', pr_url=None, pr_revision=None, pr_num=None)

In [23]:
# Move the model onto the GPU for the evaluation run below; the result is
# bound to `_` so the notebook does not echo the whole module repr.
_ = model.to('cuda')

In [26]:
from tqdm import tqdm
import json

# Run the model over the held-out set and collect hypotheses / references.
#   filtered_left  -- model output for each example (special tokens stripped)
#   filtered_right -- the reference translation stored under 'tgt'
filtered_left, filtered_right = [], []

# The file is read as one JSON object per line, each holding a 'translation'
# dict with 'prefix', 'src' and 'tgt'. UTF-8 is requested explicitly because the
# data is non-ASCII text and the platform default encoding is not guaranteed.
with open('shuffled-test.json', encoding='utf-8') as fopen:
    for line in tqdm(fopen):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)['translation']
        prompt = f"{data['prefix']}: {data['src']}"
        # One example per step: the tokenizer call yields input_ids plus an
        # attention mask, moved to whichever device the model lives on.
        batch = tokenizer(prompt, return_tensors='pt').to(model.device)
        outputs = model.generate(**batch, max_length=256)
        filtered_left.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        filtered_right.append(data['tgt'])

5000it [12:17,  6.78it/s]


In [27]:
# Hypotheses and references in the layout handed to corpus_score() below:
# `sys` is the flat list of model outputs, `refs` wraps the reference list in
# an outer list.
# NOTE(review): `sys` shadows the standard-library module name in this kernel;
# it is kept because the scoring cell reads it under that name.
sys, refs = filtered_left, [filtered_right]

In [28]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects used for scoring.
# NOTE(review): word_order=2 is presumably the chrF++ variant -- confirm
# against the sacrebleu documentation.
bleu, chrf = BLEU(), CHRF(word_order=2)

In [29]:
# Corpus-level BLEU of the model outputs against the references; the last
# expression shows every field of the score object (score, n-gram precisions,
# brevity penalty, lengths).
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 64.583819005204,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 hyp_len = 112260 ref_len = 107150)',
 'bp': 1.0,
 'counts': [90014, 73084, 61157, 51798],
 'totals': [112260, 107260, 102260, 97281],
 'sys_len': 112260,
 'ref_len': 107150,
 'precisions': [80.1835025832888,
  68.13723662129405,
  59.805398005085074,
  53.2457519967928],
 'prec_str': '80.2/68.1/59.8/53.2',
 'ratio': 1.047690153989734}