In [1]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [20]:
import json 
import os 
import pandas as pd


def load_language(name:str, sample_dir:str='../sample'):
    '''Load the (source, targets) pairs for one language.

    Reads ``<sample_dir>/<name>.jsonl`` as a JSON Lines file and pairs each
    row's ``source`` value with the same row's ``target`` value.

    Args:
        name: file stem of the language file, e.g. ``'fr_FR'``.
        sample_dir: directory containing the ``.jsonl`` files. Defaults to
            ``'../sample'`` so existing calls keep working; pass another
            directory when the notebook is run from a different location.

    Returns:
        A list of ``(source, target)`` tuples, one per row of the file.

    Raises:
        KeyError: if the file has no ``source`` or ``target`` column.
    '''
    # os.path.join instead of string formatting so sample_dir may be given
    # with or without a trailing separator.
    path = os.path.join(sample_dir, f'{name}.jsonl')
    data = pd.read_json(path, lines=True)

    return list(zip(data['source'].tolist(), data['target'].tolist()))


# French samples; later cells index into `a`, so the name is kept as-is.
a = load_language('fr_FR')
# Preview only the first few pairs instead of dumping the whole list.
a[:5]


[('Who are the main characters in the movie Little Women?',
  ['Qui sont les personnages principaux du film Les Quatre Filles du docteur March?',
   'Qui sont les personnages principaux du film Les Quatre Filles du docteur March??']),
 ('Who are the main actors in the movie Miracle in Cell No. 7?',
  ['Qui sont les acteurs principaux du film 7. Koğuştaki Mucize?']),
 ('How can Welsh onions be grown and harvested in home gardens?',
  ['Comment la ciboule peut-elle être cultivé et récolté dans son jardin?',
   'Comment peut-on cultiver et récolter la ciboule dans son potager?',
   'Comment peut-on cultiver et récolter de la ciboule dans son jardin?']),
 ('What is the genre of A City of Sadness?',
  ['À quel genre appartient La Cité des douleurs?',
   'Quel est le genre du film La Cité des douleurs?',
   'Quel est le genre de La Cité des douleurs?']),
 ('How would you describe The Princess and the Pea in one word?',
  ['Comment peut-on décrire La Princesse au petit pois en un seul mot?',


In [3]:
# Single source of truth for the checkpoint so the model and its tokenizer
# are always loaded from the same place.
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"

model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = MBart50Tokenizer.from_pretrained(MODEL_NAME)



In [15]:
# NOTE(review): `article` is not used anywhere in this cell; the text that is
# actually translated is a[2][0]. Kept in case another cell reads it.
article = "sup my dawg"

# One-off smoke test: translate a single English sample into French.
tokenizer.src_lang = "en_XX"
encoded_hi = tokenizer(a[2][0], return_tensors="pt")
generated_tokens = model.generate(
    **encoded_hi,
    # Force the first generated token to be the French language code.
    forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"]
)
o = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# An assignment produces no cell output; end on the value so it is displayed.
o



['Comment peut-on culturer et récolter des oignons gallois dans les jardins domestiques?']

In [19]:
# Load the lookup table to get our langs into the format used by the model.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('../data/lookup_table.json', 'r', encoding='utf-8') as f:
    # The mapping itself lives under the top-level 'lookup' key.
    lookup = json.load(f)['lookup']

lookup

{'en_EN': 'en_XX',
 'fr_FR': 'fr_XX',
 'ar_AE': 'ar_AR',
 'de_DE': 'de_DE',
 'es_ES': 'es_XX',
 'it_IT': 'it_IT',
 'ja_JP': 'ja_XX',
 'ko_KR': 'ko_KR',
 'th_TH': 'th_TH',
 'tr_TR': 'tr_TR',
 'zh_TW': 'zh_CN'}

In [26]:
from nltk.translate.bleu_score import sentence_bleu #using nltk bleu score

In [33]:
# Simple BLEU example: the hypothesis is identical to the only reference.
# The hypothesis is a single list of tokens.
hyp = ['the', 'dog', 'crossed', 'the', 'road']
# The reference must be a list of lists (one token list per reference).
ref = [['the', 'dog', 'crossed', 'the', 'road']]

In [34]:
# Score the toy hypothesis against its reference list.
sentence_bleu(ref, hyp)

1.0

In [35]:
# load data
# per sample 
# translate into target language
# compute bleu score 
# generate avg scores per language 
# generate report 


In [37]:
def translate(lang:str, utterance:str):
    '''Translate an English utterance into the given target language.

    `lang` is one of the keys of the `lookup` table; it is mapped to the
    language code used by the model before generation. The source language
    is always set to "en_XX".

    Returns the list produced by `tokenizer.batch_decode` for the generated
    tokens, with special tokens skipped.
    '''
    target_code = lookup[lang]

    tokenizer.src_lang = "en_XX"
    model_inputs = tokenizer(utterance, return_tensors="pt")

    # Force generation to start with the target language's token id.
    target_bos_id = tokenizer.lang_code_to_id[target_code]
    output_ids = model.generate(**model_inputs, forced_bos_token_id=target_bos_id)

    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Try the helper on the source text of the second French sample.
translate("fr_FR", a[1][0])

['Qui sont les principaux acteurs du film Miracle in Cell No. 7 ?']

In [66]:
from tqdm import tqdm

In [70]:
# Languages written without spaces between words. Whitespace splitting would
# turn a whole sentence into one token, so these are split per character.
_CHAR_LEVEL_LANGS = frozenset({"zh_TW", "ja_JP", "th_TH"})


def rough_token(lang: str, utterance:str):
    '''Roughly tokenize an utterance for BLEU scoring.

    Args:
        lang: locale code of the utterance (a key of the lookup table,
            e.g. ``'fr_FR'`` or ``'zh_TW'``).
        utterance: the text to tokenize.

    Returns:
        A list of tokens: single characters for languages in
        ``_CHAR_LEVEL_LANGS``, whitespace-separated words otherwise.
    '''
    if lang in _CHAR_LEVEL_LANGS:
        return list(utterance) # character level tokenization of the utterance

    # split() rather than split(" "): repeated or leading/trailing whitespace
    # must not produce empty-string tokens that distort the n-gram counts.
    return utterance.split() # word level tokenization of the utterance


def compute_score(lang:str, reference:'str | list[str]', hypothesis:str, smoothing_function=None):
    '''Compute the sentence-level BLEU score of a hypothesis.

    Args:
        lang: locale code, forwarded to ``rough_token`` to pick the
            tokenization.
        reference: the reference translations, as a list of strings. A single
            string is also accepted and treated as one reference (iterating
            it directly would yield one "reference" per character).
        hypothesis: the translation to score.
        smoothing_function: optional value forwarded unchanged to
            ``sentence_bleu``. The default ``None`` is also ``sentence_bleu``'s
            own default, so existing calls score exactly as before.

    Returns:
        The value returned by ``sentence_bleu``.
    '''
    if isinstance(reference, str):
        reference = [reference]

    refs = [rough_token(lang, ref) for ref in reference]
    hyp = rough_token(lang, hypothesis)

    return sentence_bleu(refs, hyp, smoothing_function=smoothing_function)

def test(lang):
    '''Translate every sample of a target language and score the results.

    Steps: load the samples for ``lang``, translate each source text,
    compute a BLEU score per sample, and average the scores.

    Args:
        lang: locale code of the target language (e.g. ``'de_DE'``); passed
            to ``load_language``, ``translate`` and ``compute_score``.

    Returns:
        ``(results, scores, average)`` where ``results`` maps each source
        text to ``{'translation': ..., 'score': ...}``, ``scores`` is the
        list of per-sample scores and ``average`` is their mean (``0.0``
        when there are no samples).
        ``None`` if any step raised; the error is printed instead.
    '''
    scores = []
    results = {}
    try:
        samples = load_language(lang)
        for source, targets in tqdm(samples, desc=f'Translating to {lang}'):
            hypothesis = translate(lang, source)
            # translate() returns a list of decoded strings; score the first.
            score = compute_score(lang, targets, hypothesis[0])

            results[source] = {'translation': hypothesis, 'score': score}
            scores.append(score)

        # Guard the mean: an empty sample set used to raise ZeroDivisionError,
        # which was then reported below as a translation failure.
        average = sum(scores) / len(scores) if scores else 0.0
        return (results, scores, average)

    except Exception as e:
        # Best-effort: report the problem and signal failure with None so one
        # broken language does not abort a run over several languages.
        print(f'Encountered {e} while translating {lang}')
        return None



    

In [71]:
test('de_DE') # run the full translate-and-score pipeline on German

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Translating to de_DE: 100%|██████████| 20/20 [00:51<00:00,  2.59s/it]


({'How tall is Saint Sophia Cathedral in Kyiv?': {'translation': ['Wie hoch ist die Kathedrale Saint Sophia in Kiew?'],
   'score': 9.106239987484608e-155},
  'How long was Mary of Burgundy married to Emperor Maximilian I?': {'translation': ['Wie lange war Maria von Burgund mit Kaiser Maximilian I. verheiratet?'],
   'score': 1.0},
  'What is the genre of The War of the Worlds?': {'translation': ['Was ist das Genre des Krieges der Welten?'],
   'score': 0.4111336169005197},
  'Where is Viktor Yushchenko from?': {'translation': ['Woher kommt Viktor Yuschenko?'],
   'score': 9.53091075863908e-155},
  'What was the original purpose of the Cathedral of Christ the Saviour?': {'translation': ['Was war der ursprüngliche Zweck der Kathedrale Christi der Erlöser?'],
   'score': 0.5169731539571706},
  'What is the genre of A City of Sadness?': {'translation': ['Was ist das Genre von A City of Sadness?'],
   'score': 0.44632361378533286},
  'How can Welsh onions be grown and harvested in home gar