In [1]:
import os

# Set in the first cell, before any other library is imported in this notebook,
# so that only device index 0 is listed in CUDA_VISIBLE_DEVICES.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [2]:
import json
from tqdm import tqdm

In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, set_seed

# The tokenizer source is fixed here; the model weights are loaded separately
# from a local fine-tuning checkpoint further down.
tokenizer_name = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

In [4]:
# count = []
# with open('shuffled-train.json') as fopen:
#     for l in tqdm(fopen):
#         data = json.loads(l)
#         src = data['translation']['src']
#         tgt = data['translation']['tgt']
#         if 'promo' in src or 'promo' in tgt:
#             count.append(data)
            
# count

In [5]:
from glob import glob


def checkpoint_step(path):
    """Return the integer step encoded in a ``checkpoint-<step>`` path.

    Paths whose suffix after the last ``-`` is not purely numeric yield ``-1``
    so they sort first and are never selected as the latest checkpoint.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Order by numeric step rather than lexicographically: a plain string sort puts
# 'checkpoint-1000000' before 'checkpoint-680000', which would make
# `checkpoints[-1]` point at an older checkpoint. The path is a tie-breaker so
# the order is deterministic.
checkpoints = sorted(
    glob('finetune-t5-small-standard-bahasa-cased-combined/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-small-standard-bahasa-cased-combined/checkpoint-680000',
 'finetune-t5-small-standard-bahasa-cased-combined/checkpoint-690000',
 'finetune-t5-small-standard-bahasa-cased-combined/checkpoint-700000']

In [6]:
# Load the weights from the last entry of the `checkpoints` list.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [7]:
# Malay -> English prefix followed by a code-switched Malay/English sentence.
prompt = 'terjemah Melayu ke Inggeris: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
# decode() is called without skip_special_tokens, so <pad> and </s> appear in the printout.
print(tokenizer.decode(outputs[0]))

<pad> Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:</s>


In [8]:
# Malay -> English on a short informal question.
prompt = 'terjemah Melayu ke Inggeris: mesolitica boleh buat asr tak'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
decoded = tokenizer.decode(outputs[0])  # special tokens are kept
print(decoded)

<pad> mesolitica can make asr or not</s>


In [9]:
# Malay -> English on chat-style text with shorthand ("xleh", "ss").
prompt = 'terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. Kalau xleh aku ss'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
print(tokenizer.decode(outputs[0]))

<pad> Brother, I want to copy it on Facebook...hahaha. If not, I'll ss</s>


In [10]:
# "pasar Melayu ke Melayu" prefix: colloquial Malay in, standard Malay out.
prompt = "terjemah pasar Melayu ke Melayu: Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh."
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
decoded = tokenizer.decode(outputs[0])  # special tokens are kept
print(decoded)

<pad> Memang betul. Ini tidak perlu menjadi pakar, saya juga tahu. Ia adalah isyarat, bodoh.</s>


In [11]:
# "Inggeris ke pasar Melayu" prefix: English in, colloquial Malay requested.
prompt = 'terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
print(tokenizer.decode(outputs[0]))

<pad> Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan pengkhususan dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan berharga kepada pasukan anda</s>


In [12]:
# English -> colloquial Malay on an informal, loosely worded English passage.
prompt = "terjemah Inggeris ke pasar Melayu: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
decoded = tokenizer.decode(outputs[0])  # special tokens are kept
print(decoded)

<pad> Jadi aku wasap mak aku yang aku sayang malam tu juga. Sambil tu aku tengah makan western food tu tapi.. aku tak sangka makanan tu sedap sebab aku tengah fikir nak settlekan benda yang ada dalam otak aku ni</s>


In [13]:
# Translate every whitespace-separated token of `t` on its own, as one padded batch.
t = 'nous bleh buat asr tak?'
input_ids = []
for s in t.split():
    token_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[0]
    input_ids.append({'input_ids': token_ids})
# Pad the per-token prompts to the longest one so they can be generated together.
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=50)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['nous', 'can', 'do', 'asr', 'not?']

In [14]:
# Single-token probe: how the model handles the bare term "asr".
prompt = 'terjemah Melayu ke Inggeris: asr'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
print(tokenizer.decode(outputs[0]))


<pad> asr</s>


In [15]:
# Probe with the shorthand negation "x" appended to the question.
prompt = 'terjemah Melayu ke Inggeris: ada promo x?'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
decoded = tokenizer.decode(outputs[0])  # special tokens are kept
print(decoded)

<pad> Is there a promo?</s>


In [16]:
# Same question as the previous probe, without the trailing "x".
prompt = 'terjemah Melayu ke Inggeris: ada promo?'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
print(tokenizer.decode(outputs[0]))

<pad> is there a promo?</s>


In [17]:
# Probe with an input that stops mid-word ("b").
prompt = 'terjemah Melayu ke Inggeris: mesolitica bleh b'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
decoded = tokenizer.decode(outputs[0])  # special tokens are kept
print(decoded)

<pad> mesolitica can b</s>


In [18]:
# Probe where the source text is already English-like under the Malay -> English prefix.
prompt = 'terjemah Melayu ke Inggeris: got promo?'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
print(tokenizer.decode(outputs[0]))

<pad> got a promo?</s>


In [19]:
# Sampling is stochastic, so seed the RNGs first: re-running this cell in the
# same environment then yields the same three candidates.
# NOTE: output recorded before this seed was added will not match a re-run.
set_seed(42)

prompt = 'terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
# Top-k / nucleus sampling with a lowered temperature; three candidates per prompt.
outputs = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Saya sedang menulis untuk memohon jawatan Senior Software Engineer di [ Kompany]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan ada pengkhususan dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan berharga buat pasukan anda',
 'Saya sedang menulis untuk memohon jawatan Jurutera Software Kanan di [ Kompany ]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan khasisasi dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan yang berharga kepada pasukan anda',
 'Saya menulis untuk memohon jawatan Senior Software Engineer di [ Syarikat ]. Sebagai seorang jurutera perisian yang berkemahiran tinggi dan berpengalaman dengan khasisasi dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi penambahan yang berharga kepada pasukan anda']

In [20]:
# Sampling is stochastic, so seed the RNGs first: re-running this cell in the
# same environment then yields the same three candidates.
# NOTE: output recorded before this seed was added will not match a re-run.
set_seed(42)

# "Melayu ke pasar Melayu" prefix: standard Malay in, colloquial Malay requested.
prompt = "terjemah Melayu ke pasar Melayu: rakyat memang tak suka awak pun"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Rakyat mmg tak suka kau jugak',
 'rakyat x suka ko pun',
 'Rakyat mmg tak suka kau pon']

In [21]:
# Sampling is stochastic, so seed the RNGs first: re-running this cell in the
# same environment then yields the same three candidates.
# NOTE: output recorded before this seed was added will not match a re-run.
set_seed(42)

# Standard Malay -> colloquial Malay on a formal news-style sentence.
prompt = "terjemah Melayu ke pasar Melayu: Perbincangan khas itu juga bertujuan bagi Seri Paduka mendapat pandangan Raja-Raja Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat"
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=3,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Perbincangan khas itu juga bertujuan utk Seri Paduka Baginda mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan terbaik demi dan kesejahteraan negara serta rakyat',
 'Perbincangan khas itu juga bertujuan untuk Baginda mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan yang terbaik demi dan kesejahteraan negara dan rakyat',
 'Perbincangan khas itu juga bertujuan agar Baginda mendapat pandangan Raja2 Melayu bagi membolehkan Baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat']

In [22]:
# Word-by-word translation of `t`; the next cell translates the same `t` as a whole sentence.
t = 'bleh buat asr tak'
word_prompts = [f'terjemah Melayu ke Inggeris: {s}' for s in t.split()]
input_ids = [
    {'input_ids': tokenizer.encode(word_prompt, return_tensors='pt')[0]}
    for word_prompt in word_prompts
]
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=50)
tokenizer.batch_decode(outputs, skip_special_tokens=True)


['can', 'do', 'asr', 'no']

In [23]:
# Translate the whole of `t` (assigned in the previous cell) as a single sentence.
prompt = f'terjemah Melayu ke Inggeris: {t}'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
decoded = tokenizer.decode(outputs[0])  # special tokens are kept
print(decoded)

<pad> can you do asr or not?</s>


In [24]:
# Mixed bag of informal Malay / code-switched inputs, translated to English in one batch.
strings = [
    'ak tak paham la',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
    "Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.",
    'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',
    'Jadi haram jadah😀😃🤭',
    'nak gi mana tuu',
    'Macam nak ambil half day',
    "Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.",
]

# Encode each prompt separately, then pad them to a common length.
input_ids = []
for s in strings:
    token_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[0]
    input_ids.append({'input_ids': token_ids})
padded = tokenizer.pad(input_ids, padding='longest')

outputs = model.generate(**padded, max_length=100)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["I don't understand",
 'Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:',
 "Indeed. This doesn't need to be an expert, I know too. It's a gesture, stupid.",
 "at 8 o'clock at the OKAY market, it's really crowded, he's good at choosing a place.",
 "So it's illegal",
 'where do you want to go?',
 "It's like taking half a day",
 "Imagine Pakatan Harapan and win pru-14. After that there are all kinds of back doors. Last-last Ismail Sabri went up. That's why I pray not to give a fk about politics anymore. I swear it's up."]

In [25]:
# Colloquial / code-switched inputs normalised to standard Malay in one batch.
strings = [
    'u ni, talk properly lah',
    "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍",
    'Me after seeing this video: mm dapnya burger benjo extra mayo',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
]

batch_prompts = [f'terjemah pasar Melayu ke Melayu: {s}' for s in strings]
input_ids = [
    {'input_ids': tokenizer.encode(batch_prompt, return_tensors='pt')[0]}
    for batch_prompt in batch_prompts
]
padded = tokenizer.pad(input_ids, padding='longest')

outputs = model.generate(**padded, max_length=100)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['ini awak, bercakap dengan betul',
 'baru menghadiri majlis perkahwinan sepupu saya. Pelik juga dia buat majlis biasa-biasa sebab gaya hidup dia nampak mewah. kemudian saya mendapat tahu mereka akan berbulan madu selama 3 minggu. keputusan pintar',
 'Saya selepas melihat video ini: memang sedap burger benjo extra mayo',
 'Hai semua! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']

In [None]:
# The `organization=` keyword of push_to_hub is deprecated in transformers;
# the namespace is passed as part of the repo id instead.
model.push_to_hub('mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4')

In [None]:
# The `organization=` keyword of push_to_hub is deprecated in transformers;
# the namespace is passed as part of the repo id instead.
tokenizer.push_to_hub('mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4')

In [26]:
# Move the model to the GPU; assigning to `_` keeps the module repr out of the cell output.
_ = model.to('cuda')

In [32]:
# Index-aligned lists: model outputs on the left, gold targets on the right.
filtered_left, filtered_right = [], []

with open('shuffled-test.json') as fopen:
    # One JSON object per line; each example is generated on its own.
    for line in tqdm(fopen):
        record = json.loads(line)['translation']
        p, src = record['prefix'], record['src']
        # The prompt is "<prefix>: <source text>", the same shape as the manual probes above.
        token_ids = tokenizer.encode(f'{p}: {src}', return_tensors = 'pt')[0]
        padded = tokenizer.pad([{'input_ids': token_ids}], padding = 'longest')
        # The model was moved to the GPU earlier, so every input tensor has to follow.
        padded = {key: tensor.cuda() for key, tensor in padded.items()}
        outputs = model.generate(**padded, max_length = 256)
        hypothesis = tokenizer.decode(outputs[0], skip_special_tokens=True)
        filtered_left.append(hypothesis)
        filtered_right.append(record['tgt'])

5000it [07:05, 11.76it/s]


In [33]:
# Inputs for corpus scoring: the hypothesis list, and a list holding the single reference list.
# The name `sys` is kept because the scoring cell below refers to it.
sys, refs = filtered_left, [filtered_right]

In [34]:
from sacrebleu.metrics import BLEU, CHRF, TER

# A BLEU scorer with default settings and a chrF scorer with word_order=2.
# NOTE(review): `chrf` is created here but not used in the visible cells.
bleu, chrf = BLEU(), CHRF(word_order=2)

In [35]:
# Corpus-level BLEU of the hypotheses against the reference set; show every field of the score object.
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 64.06258219941243,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 hyp_len = 111635 ref_len = 107150)',
 'bp': 1.0,
 'counts': [89388, 72166, 60115, 50792],
 'totals': [111635, 106635, 101635, 96656],
 'sys_len': 111635,
 'ref_len': 107150,
 'precisions': [80.07166211313655,
  67.67571622825527,
  59.14793132287106,
  52.549246813441485],
 'prec_str': '80.1/67.7/59.1/52.5',
 'ratio': 1.0418572095193654}