In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to '0' before any later cell touches the GPU.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Hub id of the tokenizer used for every encode/decode call in this notebook.
TOKENIZER_NAME = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_NAME)

In [3]:
from glob import glob

def checkpoint_step(path):
    """Return the training step encoded in a `.../checkpoint-<step>` path.

    Paths whose suffix is not a plain integer map to -1 so they sort first
    and are never picked up as the latest checkpoint.
    """
    suffix = path.rstrip('/').rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Order by numeric step, not by string: a plain sorted() would place
# 'checkpoint-999999' after 'checkpoint-1000000', so checkpoints[-1] would
# not be the most recent checkpoint once the step count gains a digit.
checkpoints = sorted(
    glob('finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-*'),
    key=checkpoint_step,
)
checkpoints

['finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-1150000',
 'finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-1160000',
 'finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-1170000']

In [4]:
# Load the weights from the last entry of the checkpoint list built above.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [5]:
# Task prefix + code-switched source sentence, generated and printed with special tokens kept.
prompt = 'terjemah Melayu ke Inggeris: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Hi guys! I noticed yesterday & today many people got this cookies, right? So today I want to share some post mortem of our first batch:</s>


In [6]:
# Short colloquial question under the Malay-to-English prefix.
prompt = 'terjemah Melayu ke Inggeris: mesolitica boleh buat asr tak'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Mesolitica can make asr or not</s>


In [7]:
# Social-media style input with abbreviations ("xleh", "ss") under the Malay-to-English prefix.
prompt = 'terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. Kalau xleh aku ss'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Brother, I want to copy it on fb...hahaha. If I can't ss</s>


In [8]:
# NOTE(review): this cell repeats the previous cell's prompt verbatim; kept as-is.
prompt = 'terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. Kalau xleh aku ss'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Brother, I want to copy it on fb...hahaha. If I can't ss</s>


In [9]:
# Same pipeline with the "pasar Melayu ke Melayu" prefix.
prompt = "terjemah pasar Melayu ke Melayu: Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh."
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Memang betul. Ini tidak perlu pakar, saya juga tahu. Ia adalah isyarat, bodoh.</s>


In [10]:
# Formal English input under the "Inggeris ke pasar Melayu" prefix; generation is capped at 100 tokens.
prompt = 'terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang jurutera perisian yang mahir dan berpengalaman yang mempunyai kepakaran dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi tambahan berharga kepada pasukan anda</s>


In [11]:
# Informal English input under the "Inggeris ke pasar Melayu" prefix.
prompt = "terjemah Inggeris ke pasar Melayu: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain"
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Jadi aku hisap mak aku yang aku sayang malam tu juga. Sedangkan aku tengah makan makanan barat tapi.. tak sangka makanan tu sedap sebab aku terfikir nak settlekan benda yang ada dalam otak aku</s>


In [12]:
t = 'bleh buat asr tak'

# Translate every whitespace-separated token of `t` on its own, as one padded batch.
prompts = [f'terjemah Melayu ke Inggeris: {word}' for word in t.split()]
input_ids = [
    {'input_ids': tokenizer.encode(prompt, return_tensors='pt')[0]}
    for prompt in prompts
]
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length = 50)
tokenizer.batch_decode(outputs, skip_special_tokens=True)


['can', 'do it', 'asr', 'no']

In [13]:
# Whole-sentence translation of `t` (defined in the previous cell), for comparison with the per-word batch.
prompt = f'terjemah Melayu ke Inggeris: {t}'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> can you make an asr?</s>


In [14]:
# Single-token input under the Malay-to-English prefix.
prompt = 'terjemah Melayu ke Inggeris: asr'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)


<pad> asr</s>


In [15]:
# Question ending in the shorthand "x" before the question mark.
prompt = 'terjemah Melayu ke Inggeris: ada promo x?'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Is there a promo?</s>


In [16]:
# Same question as the previous cell without the trailing "x".
prompt = 'terjemah Melayu ke Inggeris: ada promo?'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> is there a promo?</s>


In [17]:
import torch

# Sampling draws from torch's global RNG; pin the seed so re-running this
# cell yields the same three candidates instead of a fresh random draw.
torch.manual_seed(0)

input_ids = tokenizer.encode("terjemah Melayu ke pasar Melayu: Perbincangan khas itu juga bertujuan bagi Seri Paduka mendapat pandangan Raja-Raja Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat", return_tensors = 'pt')

# Sample three candidates with top-k / top-p filtering at temperature 0.7.
outputs = model.generate(
    input_ids,
    max_length = 100,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Perbincangan khas itu juga bertujuan untuk Seri Paduka mendapat pandangan Raja2 Melayu untuk membolehkan baginda membuat keputusan yg terbaik demi kepentingan dan kesejahteraan negara dan rakyat',
 'Perbincangan khas juga bertujuan untuk Seri Paduka dapat pandangan Raja-Raja Melayu bagi membolehkan baginda membuat keputusan terbaik demi kepentingan dan kesejahteraan negara serta rakyat',
 'Perbincangan khas itu juga bertujuan untuk Seri Paduka meraih pandangan Raja-Raja Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat']

In [18]:
strings = [
    'ak tak paham la',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
    "Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.",
    'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',
    'Jadi haram jadah😀😃🤭',
    'nak gi mana tuu',
    'Macam nak ambil half day',
    "Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.",
]

# Prefix every sentence with the task tag, encode each one, then pad to the longest in the batch.
prompts = [f'terjemah Melayu ke Inggeris: {text}' for text in strings]
input_ids = [
    {'input_ids': tokenizer.encode(prompt, return_tensors='pt')[0]}
    for prompt in prompts
]
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length = 100)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["I don't understand",
 'Hi guys! I noticed yesterday & today many people got this cookies, right? So today I want to share some post mortem of our first batch:',
 "That's it. This doesn't need an expert, I know too. It's a gesture, stupid.",
 "at 8 o'clock at the OKAY market, there are many people, he is good at choosing places.",
 'So haram jadah',
 'where do you want to go?',
 "It's like taking half day",
 "Imagine PAKATAN HARAPAN and won pru-14. After that there are all kinds of back doors. Last-last Ismail Sabri went up. That's why I don't give it fk about politics anymore. I swear it's already up."]

In [19]:
strings = [
    'u ni, talk properly lah',
    "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍",
    'Me after seeing this video: mm dapnya burger benjo extra mayo',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
]

# Same batched pipeline as the Malay-to-English cell, using the "pasar Melayu ke Melayu" prefix.
prompts = [f'terjemah pasar Melayu ke Melayu: {text}' for text in strings]
input_ids = [
    {'input_ids': tokenizer.encode(prompt, return_tensors='pt')[0]}
    for prompt in prompts
]
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length = 100)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['awak, cakap betul-betul',
 'baru sahaja menghadiri perkahwinan sepupu saya. Pelik juga dia buat majlis biasa sebab gaya hidup dia nampak mewah. kemudian saya mendapat tahu mereka sedang berbulan madu selama 3 minggu. keputusan yang bijak',
 'Saya selepas melihat video ini: memang sedap burger benjo extra mayo',
 'Hai kawan-kawan! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']

In [20]:
# Upload the loaded checkpoint's weights to the Hugging Face Hub under the
# `mesolitica` organization; the call's return value is shown as the cell output.
# NOTE(review): newer transformers releases reportedly deprecate `organization=`
# in favour of an 'org/name' repo id - confirm against the installed version
# before changing the call.
model.push_to_hub('finetune-noisy-translation-t5-tiny-bahasa-cased-v2', organization='mesolitica')

Upload file pytorch_model.bin:   0%|          | 4.00k/133M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2
   3fd5a71..d0f6650  main -> main



'https://huggingface.co/mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2/commit/d0f6650575b99ba9241a410eb6abad68946f4cb5'

In [21]:
# Upload the tokenizer to the same Hub repository name and organization as the model.
# NOTE(review): newer transformers releases reportedly deprecate `organization=`
# in favour of an 'org/name' repo id - confirm against the installed version
# before changing the call.
tokenizer.push_to_hub('finetune-noisy-translation-t5-tiny-bahasa-cased-v2', organization='mesolitica')

In [22]:
# Move the model to the GPU for the evaluation loop below. Binding the result
# to `_` keeps the cell from echoing the call's return value as output.
_ = model.cuda()

In [24]:
from tqdm import tqdm
import json

# filtered_left collects model hypotheses, filtered_right the matching targets.
filtered_left, filtered_right = [], []

# The test set is read as JSON lines, each holding a 'translation' record with
# 'prefix', 'src' and 'tgt'. Open it explicitly as UTF-8 so decoding does not
# depend on the platform's default locale encoding.
with open('shuffled-test.json', encoding='utf-8') as fopen:
    for line in tqdm(fopen):
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        data = json.loads(line)['translation']
        p = data['prefix']
        src = data['src']
        input_ids = [{'input_ids': tokenizer.encode(f'{p}: {src}', return_tensors = 'pt')[0]}]
        # Move the whole batch to wherever the model lives rather than
        # hard-coding .cuda() tensor by tensor.
        padded = tokenizer.pad(input_ids, padding = 'longest').to(model.device)
        outputs = model.generate(**padded, max_length = 256)
        filtered_left.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        filtered_right.append(data['tgt'])

5000it [09:26,  8.83it/s]


In [25]:
# `sys` holds the model hypotheses; `refs` wraps the targets in an outer list
# (a single reference set). Both are passed to `bleu.corpus_score` further down.
# NOTE(review): `sys` shadows the stdlib module name; renaming it would also
# require updating the scoring cell.
sys, refs = filtered_left, [filtered_right]

In [26]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects: BLEU with its default settings, and CHRF configured with word_order=2.
# Only `bleu` is used by the cells shown in this notebook.
bleu, chrf = BLEU(), CHRF(word_order = 2)

In [28]:
# Corpus-level BLEU of the hypotheses against the references; show every field of the result.
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 60.0009672168891,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 hyp_len = 110970 ref_len = 107150)',
 'bp': 1.0,
 'counts': [86448, 67686, 55157, 45770],
 'totals': [110970, 105970, 100970, 95989],
 'sys_len': 110970,
 'ref_len': 107150,
 'precisions': [77.90213571235469,
  63.87279418703407,
  54.62711696543528,
  47.68254695850566],
 'prec_str': '77.9/63.9/54.6/47.7',
 'ratio': 1.035650956602893}