In [1]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os 


%matplotlib inline

## Make corpus

In [2]:
def istrip(text, pattern):
    while len(text)>0:
        if text[0] in pattern:
            text = text[1:]
        else:
            break
    while len(text)>0:
        if text[-1] in pattern:
            text = text[:-1]
        else:
            break
    return text

In [3]:
from nltk.translate import AlignedSent
from nltk.lm.preprocessing import pad_both_ends
def Make_bitext(path, reversed = False):
    bitext = []
    with open(path, 'r', encoding='utf-8') as f:
        lines = filter(None, (line.rstrip() for line in f))
        for line in lines:
            line = line.strip()
            lists = line.split('\t')
            source_text, target_text = lists[0], lists[1]
            source_sentence = source_text.strip().split()
            source_sentence = [istrip(source_text, ' .,?!_').lower() for source_text in source_sentence]
            #source_sentence = list(pad_both_ends(source_sentence, n=2))
            target_sentence = target_text.strip().split()
            target_sentence = [istrip(target_text, ' .,?!_').lower() for target_text in target_sentence]
            #target_sentence = list(pad_both_ends(target_sentence, n=2))
            #print(source_sentence, target_sentence)
            alignt_sen = AlignedSent(target_sentence, source_sentence ) if reversed else AlignedSent(source_sentence, target_sentence)
            bitext.append(alignt_sen)
    return bitext

In [4]:
from nltk.translate import IBMModel1, IBMModel2, IBMModel3, PhraseTable
from nltk.lm.models import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize
from dataclasses import dataclass
@dataclass
class Translate_result:
    """_summary__

    result of translte function, include:
    - text: translated text:
    - score: score for each word in list
    - src: source langue code
    - des: destinate langue code
    """
    text: str
    result: list
    score: list
    """_summary_

    - score list for each word
    - each element present for (total_score, aligment_score, n_gram_model_score)
    """
    src: str
    des: str

model_dic = {'ibm1':IBMModel1, 'ibm2':IBMModel2, 'ibm3': IBMModel3}

class Translate_model:
    """_summary_
    An translate model make by alignment model and n_grams model
    """
    def __init__(self, align_model = 'ibm2', n_order = 2):
        """## _summary_

        ### init for translate model
        An translate model mix by alignment model and n_grams model
        ## Args:
            - align_model (str, optional): align model to align word must in ['ibm1', 'ibm2', 'ibm3']. Defaults to 'ibm2'.
            
            - n_order (int, optional): n_order for n_grams model. Defaults to 2.

        ## Raises:
            - Exception: when pass wrong model or n_order < 1
        """
        if align_model in model_dic:
            self.align_model_base = model_dic[align_model]
        else: 
            raise Exception(f"your parameter '{align_model}' is not allow, it must be in ['ibm1', 'ibm2', 'ibm3']")
        if n_order > 0:
            self.n_order = n_order
        else:
            raise Exception(f"n_order must greater than 0")
        self.ngrams_model = Laplace(n_order)
        self.ngrams_model2 = Laplace(n_order+1)

    def fit(self, bitext, src = 'en', des = 'vi', epochs = 10):
        """## _summary_
        
        fit model
        ### Args:
            - bitext (list): list of AlignedSent
            - src (str, optional): source language code. Defaults to 'en'.
            - des (str, optional): destinate language code. Defaults to 'vi'.
            - epochs (int, optional): number of iterator to train model. Defaults to 10.
        """
        self.align_model = self.align_model_base(bitext, epochs)
        self.src = src
        self.des = des
        target_list = [list(pad_both_ends(align_sent.mots, n=2)) for align_sent in bitext]
        train_data, vocab = padded_everygram_pipeline(self.n_order, target_list)
        self.ngrams_model.fit(train_data, vocab)
        self.ngrams_model2.fit(train_data, vocab)

        self.src_voca = set()
        self.des_voca = set()
        for align_sent in bitext:
            for src_word in align_sent.words:
                self.src_voca.add(src_word)
            for des_word in align_sent.mots:
                self.des_voca.add(des_word)

    def text_preprocessing(self, text:str):
        list_word = [istrip(word, ' .,?!_').lower() for word in text.strip().split() ]
        return list_word
    def Rearrangement(self, list_text):
        preList = ['<s>']
        #list_text = list_text + ['</s>']
        n = len(list_text)#-1
        for i in range(n):
            best_score = -10
            best_word = ''
            for word in list_text:
                try:
                    score1 = self.ngrams_model.score(word, preList)
                except:
                    score1 = 0
                try:
                    score2 = self.ngrams_model2.score(word, preList)
                except:
                    score2 = 0
                score = score1 + score2
                if score>best_score:
                    best_word = word
                    best_score = score
            preList.append(best_word)
            list_text.remove(best_word)
        if (preList[-1] == '</s>'):
            preList = preList[:-1]
        return preList[1:]
    def Translate(self, source_sentence:str, alpha = 0.5, n_gram_intensity = 10, rearrange = False) -> Translate_result:
        """## _summary_
        
        Make translate for sentence.
        ## Args:
            - source_sentence (str): _description_
            - alpha (float, optional): weight for alignment_model it must in range [0,1], if alpha = 1 mean that we ignore n_grams model, othewise.. Defaults to 0.5.
            - n_gram_intensity (int, optional): multiply intensity for n_grams model, because probabilty of this kind model is smaller than alignment score . Defaults to 2.

        ## Returns:
           - Translate_result: Translate_result data class. Use <text> propety to get text result, for more information, read Translate_result class summary.
        """
        if alpha > 1 or alpha < 0:
            raise Exception(f"alpha must in range [0,1]")
    
        target_sentences = ['<s>']
        score_list = []
        source_sentence_list = self.text_preprocessing(source_sentence)
        for source_word in source_sentence_list:
            if source_word not in self.src_voca:
                target_sentences.append(source_word)
                score_list.append(-1)
                continue
            max_score = -1
            best_translation = ''
            pair_score = None
            for target_word in self.des_voca:
                alignment_score = self.align_model.translation_table[source_word][target_word]

                context = target_sentences
                try:
                    n_gram_score_prob = self.ngrams_model.score(target_word, context=context)
                except:
                    n_gram_score_prob = 0
                #print(f"{source_word} translate to {target_word} with {alignment_score} and {n_gram_score_prob}")
                # phrase_table_score = self.__phrase_table.score(source_word, target_word)
                total_score = alpha*alignment_score + alpha*n_gram_score_prob*n_gram_intensity*self.n_order
                if total_score > max_score:
                    max_score = total_score
                    best_translation = target_word
                    pair_score = (target_word, total_score, alignment_score, n_gram_score_prob)
            target_sentences.append(best_translation)
            score_list.append(pair_score)
        target_sentences = target_sentences[1:]
        if rearrange:
            target_sentences = self.Rearrangement(target_sentences)
        list_nor = [' '.join(word.split('_')) if '_' in word else word for word in target_sentences]
        translate_result = Translate_result(' '.join(list_nor),target_sentences ,score_list, self.src, self.des)

        return translate_result
 

In [51]:
bitex = Make_bitext('data.txt')
translate_model = Translate_model('ibm2', 3)

In [52]:
translate_model.fit(bitext=bitex, epochs = 30)

In [53]:
result = translate_model.Translate('she love him, but i hate him', 0.5, 2, rearrange=False)

In [54]:
print(result.text)
print(result.score)

cô ấy tình yêu anh ta nhưng tôi ghét anh ta
[('cô_ấy', 0.33921620514946876, 0.6447192840307933, 0.005618854378024036), ('tình_yêu', 0.4394595035917922, 0.8749999999987379, 0.0006531678641410843), ('anh_ta', 0.24338121884206093, 0.48275174249695607, 0.0006684491978609625), ('nhưng', 0.5020053475920248, 0.9999999999968838, 0.0006684491978609625), ('tôi', 0.4919035322429771, 0.9797963692987884, 0.0006684491978609625), ('ghét', 0.33054667998666704, 0.6570826647861683, 0.0006684491978609625), ('anh_ta', 0.24338121884206093, 0.48275174249695607, 0.0006684491978609625)]


In [59]:
res = translate_model.Translate('i meet him at this morning', alpha=0.4, n_gram_intensity=2, rearrange=True)

In [60]:
res.text

'tôi gặp anh ta tại này sáng'

In [67]:
translate_model.Translate('look at him', alpha=0.4, n_gram_intensity=2, rearrange=False).text

'trông tại anh ta'

In [72]:
translate_model.Translate('who she like', alpha=0.4, n_gram_intensity=2, rearrange=True).text

'cô ấy ai như'

In [78]:
translate_model.Translate( 'do you hate him?', alpha=0.4, n_gram_intensity=2, rearrange=True).text

'bạn làm ghét anh ta'

In [82]:
translate_model.Translate( 'they do not know', alpha=0.4, n_gram_intensity=2, rearrange=True).text

'họ làm không phải biết'

# Evaluation

In [58]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\drawt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.meteor_score import meteor_score
def meteor_score_wrapper(ref, hyp):
    return meteor_score([ref], hyp)
def sentence_gleu_wrapper(ref, hyp):
    return sentence_gleu([ref], hyp)
def evaluate(model, wrapper = sentence_gleu_wrapper, start = 0, take=100,):
    score = 0
    for i in range(start, take):
        ref = bitex[i].mots
        hyp = model.Translate(' '.join(bitex[i].words), alpha=0.4, n_gram_intensity=2, rearrange=True)
        score_i = wrapper(ref, hyp.result)
        #print(f"source: {bitex[i].words} \t ref: {' '.join(ref)} \t hyp: {hyp.text} | \t meteor score is {score_i}" )

        score += score_i
    return score/take

In [60]:
evaluate(translate_model,sentence_gleu_wrapper, 0, 100)

0.214061286324407

In [61]:
evaluate(translate_model,meteor_score_wrapper, 0, 100)

0.36281934111852654

In [62]:
translate_model3 = Translate_model('ibm2', 3)
translate_model3.fit(bitext=bitex, epochs = 20)

In [63]:
res = translate_model3.Translate('why do she like him', alpha=0.4, n_gram_intensity=2, rearrange=True)

In [64]:
res.text

'cô ấy thích tại sao tờ anh ta'

In [65]:
evaluate(translate_model3,sentence_gleu_wrapper, 0, 100)

0.23112699839011908

In [66]:
evaluate(translate_model3,meteor_score_wrapper, 0, 100)

0.3903746095773352

In [67]:
translate_model3.Translate('Not sure if you know this', alpha=0.4, n_gram_intensity=2, rearrange=False).text

'không phải chắc chắn nếu bạn biết này'