# Machine Translation using Noisy Channel

Here we will attempt to translate old Irish to new Irish using Noisy Channel.


*   We ingest our data and concatenate the words into sentences
*   We calculate frequencies of each source word per sentence and repeat for the target words
*   We also compute frequencies of a target word being matched by the adjacent word in the target sentence for all occurences of the source word
*   Use Bayesian probabilities to predict the next word when translating test data: 
  **P(new|old) = P(new)P(old|new)**

* We process stop words, auxililary verbs and punctuations etc separately as they are more frequent than most words and thus dominate probability calculations

  **Limitations**

- Cuts off rest of target sentence once end of source sentence reached

## Read Files

In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np


path_train_source = r"/content/train-source.txt"
path_train_target = r"/content/train-target.txt"

In [10]:
def read_file(path):

  text = []
  words = set()

  sentences = open(path, 'r', encoding='utf-8')

  sent = ""
  
  for ind, word in enumerate(sentences):

    token = word.rstrip("\n").rstrip("\t").lower()

    if token == "<s>" or token in ["'", '"']:
      # sent = sent
      continue

    elif token == '</s>':
      sent = sent[:-1]
      text.append(sent.lstrip())
      sent = ""

    else:
      sent = sent + token + " "    
  
    if token not in words:
      words.add(token)

  word_index = dict([(i, word) for i, word in enumerate(words)])

  return text, words, word_index

In [130]:
from difflib import SequenceMatcher as SM

def compute_frequencies(source, target):

  target_frequencies = {}
  source_to_target_frequencies = {}
  total_st_frequencies = 0

  for x in zip(source, target):

    source_words = [word for word in x[0].split(" ")]
    target_words = [word for word in x[1].split(" ")]

    # print(len(source_words), len(target_words))

    for ind, word in enumerate(source_words):

      if ind <= len(target_words) - 1:

        t_w = target_words[ind]
        # print(t_w)

        # compare similarity of the two words
        if not (SM(None, word, t_w).ratio() > 0.5 and len(min(word,t_w))/len(max(word,t_w)) > 0.5):
          t_w = compare_words(word, t_w, target_words[ind:])
          # print(t_w)
          # break

        # Add target to emission table
        # print(target_frequencies.keys())
        if t_w in target_frequencies.keys():
          target_frequencies[t_w] += 1
        else:
          target_frequencies[t_w] = 1

        # print(target_frequencies)

        # Add target to transition table for source words
        if word in source_to_target_frequencies.keys():
          if t_w in source_to_target_frequencies[word].keys():
            source_to_target_frequencies[word][t_w] += 1
          else:
            source_to_target_frequencies[word][t_w] = 1

        else:
          source_to_target_frequencies[word] ={}          
          source_to_target_frequencies[word][t_w] = 1

        total_st_frequencies += 1

  return target_frequencies, source_to_target_frequencies, total_st_frequencies

In [141]:
def compare_words(source, target, target_list, ind = -1):

  ind += 1

  if ind > len(target_list)-1:
    return "<PAD>"

  else:

    if SM(None, source, target).ratio() > 0.5 and len(min(source, target))/len(max(source, target)) > 0.5:
      return target

    else:
      target = target_list[ind]
      return compare_words(source, target, target_list, ind)


In [102]:
input_text, input_words, input_word_index = read_file(path_train_source)
target_text, target_words, target_word_index = read_file(path_train_target)

input_words.add("<PAD>")
target_words.add("<PAD>")

input_word_index[len(input_word_index)] = "<PAD>"
target_word_index[len(target_word_index)] = "<PAD>"


In [101]:
target_word_index[len(target_word_index)-1]

'thíre'

In [131]:
target_frequencies, source_to_target_frequencies, total_st_frequencies = compute_frequencies(input_text, target_text)

## misc

In [75]:
source_to_target_frequencies

{'cinnte': {'cinnte': 217, "'cinnte": 1, 'linn': 1},
 'go': {'go': 7889, 'gaoth': 1, 'gceo': 1, "'go": 20, 'giota': 1},
 'leór': {'leor': 151, 'mór': 1},
 ',': {',': 17002},
 'thiocfadh': {'thiocfadh': 324, 'taobhadh': 1, "'thiocfadh": 1},
 'dóbhtha': {'dóibh': 33, 'dófa': 11},
 'bás': {'bás': 172, 'bhás': 1},
 'a': {'a': 23483, "'a": 37},
 'fhagháil': {'fháil': 175, 'fhágáil': 1, 'fáil': 1},
 'ar': {'ar': 12978,
  'aird': 6,
  'air': 14,
  'airde': 6,
  'mar': 5,
  'abar': 1,
  'ard': 1,
  'arb': 1,
  'dar': 2,
  'art': 1,
  "'ar": 9},
 'imeall': {'imeall': 28},
 'an': {'an': 22152,
  'san': 50,
  'aon': 12,
  'ann': 19,
  'achan': 1,
  'éan': 3,
  'ainm': 4,
  "'an": 38,
  'gan': 9},
 'phuill': {'phoill': 6, 'pholl': 1},
 '.': {'.': 22853},
 'bhruach': {'bhruach': 46, 'bhearnach': 1},
 'agus': {'agus': 15390,
  'as': 10,
  'acu': 12,
  'ais': 21,
  'gur': 36,
  'anuas': 3,
  'agatsa': 2,
  'las': 1,
  'abhus': 4,
  'agamsa': 2,
  "'agus": 44},
 'na': {'na': 7089,
  'ina': 741,
  'fán

In [14]:
input_text[0:10]

['cinnte go leór , thiocfadh dóbhtha bás a fhagháil ar imeall an phuill udaí .',
 '( bhí sé follasach go rabh an poll sin ag foscladh ar an fhairrge ar dhóigh éigin , ná líonadh agus thráigheadh an t-uisce ann .',
 ") d'fhéadfadh siad bás fhagháil ar a bhruach agus na cuirp imtheacht ar an lán mhara amach fríd an phluais .",
 'thiocfadh dóbhtha fosta lámh a chur ina mbás féin , a ghabháil de léim isteach sa pholl ghalach a bhí i n-iarthar an dara taibhlidh - poll mar bhéadh coire de uisce ghalach ann .',
 "na dhiaidh sin , bhí rud éigin do-chreidte agus leamh in gach teóir de'n bheirt .",
 'nuair a scríobh siad an litir sin bhí siad araon i sláinte mhaith agus tréan bidh aca .',
 'bhí fhios againn i gceart nach rabh díothbhail ar bith ortha , na diomaoite de chuid bonnóg chathail tháinig muid ar an áit taiscthe a bhí aca - scealpach bheag eachar dhá charraic - agus arán agus spólaí de chaoir-fheóil shaillte ann .',
 "b'fhuras a fheiceáil cá dtáinig an chaoir-fheóil .",
 'ar urlár na h-

In [70]:
combine = list(zip(input_text, target_text))
unmatch = []
s = []
t = []
for i in range(len(combine)):
  if len(combine[i][0].split()) != len(combine[i][1].split()) and len(unmatch) < 100:
    unmatch.append(combine[i])
    s.append(combine[i][0])
    t.append(combine[i][1])

# unmatch


In [49]:
t_f, s_t_f = compute_frequencies(s, t)
# s

In [50]:
s_t_f

{'fhagháil-fháil': 1,
 'ar-ar': 30,
 'a-a': 49,
 'bhruach-bhruach': 2,
 'agus-agus': 30,
 'na-na': 11,
 'cuirp-coirp': 1,
 'na-ina': 2,
 'dhiaidh-dhiaidh': 1,
 'sin-sin': 6,
 'bhí-bhí': 33,
 'go-go': 22,
 'deimhin-deimhin': 1,
 ',-,': 32,
 'is-is': 3,
 'mó-mó': 2,
 'mhéaduigh-mhéadaigh': 1,
 'muid-muid': 1,
 'í-í': 1,
 'ann-an': 2,
 'súil-súil': 1,
 'agam-agam': 5,
 'mbéadh-mbeadh': 2,
 'siad-siad': 1,
 'shiubhal-shiúl': 1,
 'sul-sula': 1,
 'thug-thug': 1,
 'mé-mé': 44,
 'freeman-freeman': 4,
 'fríd-fríd': 3,
 'an-an': 38,
 'uaimh-uaimh': 1,
 'uilig-uilig': 1,
 'thaisbeáin-thaispeáin': 1,
 "acha'n-gach": 2,
 "b'ion-b'iontach": 1,
 'caidé-cad': 5,
 'tá-tá': 5,
 'lúthgháir-lúcháir': 1,
 'orm-orm': 3,
 'gur-gur': 6,
 'fhág-fhág': 1,
 'tú-tú': 1,
 'neithe-nithe': 1,
 'baineadh-baineadh': 1,
 'stangadh-stangadh': 1,
 'asam-asam': 1,
 'sé-sé': 13,
 'sheasamh-sheasamh': 1,
 'san-san': 2,
 'áit-áit': 10,
 'chéadna-chéanna': 3,
 'ag-ag': 15,
 'amharc-amharc': 1,
 'spaisteóracht-spaisteoireacht'

In [212]:
target_text[0:10]

['cinnte go leor , thiocfadh dóibh bás a fháil ar imeall an phoill úd .',
 'bhí sé follasach go raibh an poll sin ag foscladh ar an fharraige ar dhóigh éigin , nó líonadh agus thráigheadh an t-uisce ann .',
 "d'fhéadfadh siad bás a fháil ar a bhruach agus na coirp a imeacht ar an lán mhara amach fríd an phluais .",
 "thiocfadh dóibh fosta lámh a chur ina mbás féin , a ghabháil de léim isteach sa pholl ghalach a bhí in iarthar an dara táibhle - poll mar a bheadh coire d'uisce ghalach ann .",
 'ina dhiaidh sin bhí rud éigin dochreidte agus leamh i ngach teoiric den bheirt .',
 'nuair a scríobh siad an litir sin bhí siad araon i sláinte mhaith agus tréan bídh acu .',
 'bhí a fhios againn i gceart nach raibh díobháil ar bith orthu , nó diomaite de chuid bonnóg chathail tháinig muid ar an áit taiscthe a bhí acu - scealpach bheag eadar dhá charraig - agus arán agus spólaí de chaoireoil shaillte ann .',
 "b'fhurasta a fheiceáil cá dtáinig an chaoireoil .",
 'ar urlár na huaimhe bhí cnámha cao

In [52]:
source_to_target_frequencies

{'cinnte-cinnte': 217,
 'go-go': 7889,
 'leór-leor': 151,
 ',-,': 17002,
 'thiocfadh-thiocfadh': 324,
 'dóbhtha-dóibh': 33,
 'bás-bás': 172,
 'a-a': 23483,
 'fhagháil-fháil': 175,
 'ar-ar': 12978,
 'imeall-imeall': 28,
 'an-an': 22152,
 'phuill-phoill': 6,
 '.-.': 22853,
 'bhruach-bhruach': 46,
 'agus-agus': 15390,
 'na-na': 7089,
 'cuirp-coirp': 11,
 'fosta-fosta': 253,
 'lámh-lámh': 256,
 'chur-chur': 474,
 'ina-ina': 795,
 'mbás-mbás': 5,
 'féin-féin': 2415,
 'ghabháil-ghabháil': 85,
 'de-de': 2165,
 'léim-léim': 137,
 'isteach-isteach': 888,
 'sa-sa': 892,
 'pholl-pholl': 38,
 'ghalach-ghalach': 2,
 'bhí-bhí': 10906,
 'n-iarthar-iarthar': 4,
 'dara-dara': 225,
 'taibhlidh-táibhle': 2,
 '---': 573,
 'poll-poll': 47,
 'mar-mar': 1622,
 "uisce-d'uisce": 1,
 'ann-ann': 1800,
 'na-ina': 741,
 'dhiaidh-dhiaidh': 491,
 'sin-sin': 4001,
 'nuair-nuair': 2046,
 'scríobh-scríobh': 25,
 'siad-siad': 2378,
 'litir-litir': 36,
 'araon-araon': 31,
 'i-i': 4132,
 'sláinte-sláinte': 21,
 'mhaith-mh

## Noisy Channel Model



In [None]:
%%shell
pip install advertools

In [169]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

def evaluate(target_words, target_word_index, target_frequencies, source_to_target_frequencies,total_st_frequencies):

  testsource = open('test-source.txt', 'r')
  original = []
  sentence = []
  from_scratch_hypotheses = []
  

  for line in testsource:
    token = line.rstrip("\n").lower()

    if token=='<s>':

      sentence = []

    elif token == '</s>':
      from_scratch_hypotheses.append(predict(sentence,target_words, target_word_index, target_frequencies, source_to_target_frequencies,total_st_frequencies))
      original.append(sentence)
    else:
      sentence.append(token)

    if len(from_scratch_hypotheses) == 100: 
      break

  references = []

  testtarget = open('test-target.txt', 'r')

  x = 0
  # while x > 
  for line in testtarget:
    token = line.rstrip("\n").lower()
    if token=='<s>':
      sentence = []
    elif token == '</s>':
      references.append([sentence])
      # print("Actual",sentence)
      # break
    else: 
      sentence.append(token)
    
    if len(references) == 100: 
      break

  return print((corpus_bleu(references,from_scratch_hypotheses))), from_scratch_hypotheses, references,original

In [172]:
import advertools as adv
def predict(source_sentence, target_words, target_word_index, target_frequencies, source_to_target_frequencies,total_st_frequencies):  

  target = []

  p_s = [",", "!", "-", ".", "?","'", '"',"á", "na", "liom", "leis", "léi", "'un"]
  stop_punct = list(adv.stopwords["irish"])
  stop_punct = stop_punct + p_s

  total_freqs = sum(target_frequencies.values())

  for ind, val in enumerate(source_sentence):

    probs = {}

    print(val)

    try:
      total_tran_freq = sum(source_to_target_frequencies[val].values())      
      for x in source_to_target_frequencies[val].keys():
        
        # print(val)
        # handle punctuation and stop words separately
        # they have highest frequencies hence risk of blowing up on prob scores

        if (x in stop_punct and val not in stop_punct):
          x_prob = 0.0
        else:
          try:
            x_prior = target_frequencies[x]/total_freqs
          except KeyError:
            x_prior = 0.0

          # s_t =  val + "-" + x

          try:
            x_post = source_to_target_frequencies[val][x] / total_tran_freq
          except KeyError:
            x_post = 0.0

          x_prob = x_prior * x_post
  
        probs[x] = x_prob

    except KeyError:
      x_prob = 0.0
      probs.append({x: x_prob})

    # print(probs)
    ind_new_word = probs.index(max(probs))
    new_word = target_word_index[ind_new_word]

    target.append(new_word)

  return target



In [64]:
_,_,_,_ = evaluate(target_words, target_word_index, target_frequencies, source_to_target_frequencies)

0.6938620338480631


In [65]:
_, pred, ref, original = evaluate(target_words, target_word_index, target_frequencies, source_to_target_frequencies)

0.6938620338480631


In [170]:
_, pred_, ref_, original_ = evaluate(target_words, target_word_index, target_frequencies, source_to_target_frequencies, total_st_frequencies)

scéal
chathail
freeman
-
téid
mo
dhearbhráthair
'un
na
dubh-charraice
mí
iúil
a
bhí
ann
i
mbliadhain
a
1854
,
nuair
a
bhain
an
taisme
seo
dúinn
.
an
dearbhráthair
a
ba
sine
agam
,
seán
freeman
,
tugadh
uainn
go
tobann
é
,
agus
a
thásc
nó
a
thuairisc
ní
rabh
againn
le
fagháil
.
tráthnóna
breágh
amháin
,
chuaidh
sé
a
dh'iascaireacht
;
agus
,
ach
oiread
agus
dá
slugadh
an
talamh
é
,
níor
phill
sé
.
bhí
sé
go
maith
i
n-a
shláinte
agus
nuair
a
bhí
muid
ag
ár
ndinneár
bhí
sé
lán
grinn
agus
cuideachta
;
agus
choinnigh
sé
an
chuid
a
b'óige
againn
i
dtrithibh
gáiridhe
go
dtí
go
mb'éigean
do
m'athair
bagar
orainn
agus
cuireadh
fear
de
mo
chuid
dearbhrátharach
amach
as
an
tseomra
.
iomlán
na
rudaí
a
thárla
an
lá
sin
,
tá
siad
comh
soiléir
in
mo
chuimhne
is
dá
mbéinn
ag
amharc
ortha
le
mo
dhá
shúil
.
'tchím
go
fóill
an
caimbéal
agus
an
an-chuma
a
chuireadh
sé
ar
a
aghaidh
nuair
nach
mbíodh
m'athair
ag
amharc
agus
d'fhágthaí
muid-inne
siocthaí
leis
na
gáiridhe
.
bhí
sé
i
ndiaidh
a
theacht
'un
a'
bh

In [140]:
len(target_word_index)

25737

## New Section

In [171]:
ref_ = [x[0] for x in ref]
joined = list(zip(pred_, ref_, original_))
for i in range(len(joined)):
  if i == 1:
    print("Pred","\t", joined[i][0], "\nRef\t", joined[i][1], "\nOr\t", joined[i][2])
    break

Pred 	 ['gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú', 'gearrsmaointiú'] 
Ref	 ['mí', 'iúil', 'a', 'bhí', 'ann', 'i', 'mbliain', 'a', '1854', ',', 'nuair', 'a', 'bhain', 'an', 'taisme', 'seo', 'dúinn', '.'] 
Or	 ['mí', 'iúil', 'a', 'bhí', 'ann', 'i', 'mbliadhain', 'a', '1854', ',', 'nuair', 'a', 'bhain', 'an', 'taisme', 'seo', 'dúinn', '.']


In [161]:
source_to_target_frequencies["se"]

{'<PAD>': 7, 'seo': 1}

In [60]:
for key in source_to_target_frequencies.keys():
  if key[:4] == "n-a-":
    print(key, ": ",source_to_target_frequencies[key])

n-a-lena :  65
n-a-n-ál :  1
n-a-fána :  19
n-a-ina :  166
n-a-arna :  5
n-a-dona :  3
n-a-agna :  2
n-a-dena :  2
n-a-óna :  8
n-a-anam :  3
n-a-na :  4
n-a-nach :  1


In [62]:
target_frequencies[""]

KeyError: ignored

In [174]:
print((target_frequencies["móire"] / sum(target_frequencies.values())) * (source_to_target_frequencies["mbliadhain-móire"]))
print(((target_frequencies["mbliain"] / sum(target_frequencies.values())) * (source_to_target_frequencies["mbliadhain-mbliain"])))

7.883851162744861e-05
1.4782220930146615e-05


In [175]:
print(target_frequencies["móire"], target_frequencies["mbliain"])

64 6


In [154]:
"'un" in list(adv.stopwords["irish"])

False

In [135]:
target_frequencies['bhaile']

1241

In [24]:
val_indices = {}
for ind,val in enumerate(x):
  if val != 0:
    val_indices[ind] = val

In [30]:
print(sum(val_indices.values()), len(val_indices), max(val_indices.values()), max(val_indices, key=val_indices.get))

3.695876933785215e-06 69 1.242070311196917e-06 21712


In [31]:
target_word_index[21712]

'a'

In [None]:
target_word_index

In [22]:
len(input_word_index)

31039

In [None]:
corpus_bleu(ref,pred)

0.13979355309562722

In [None]:
print(input_text[0].split(" "))
target = predict(input_text[0].split(" "),target_words, target_word_index, target_frequencies, source_to_target_frequencies)

['cinnte', 'go', 'leór', ',', 'thiocfadh', 'dóbhtha', 'bás', 'a', 'fhagháil', 'ar', 'imeall', 'an', 'phuill', 'udaí', '.']


In [None]:
source_to_target_frequencies["scéal-scéal"]

244

In [None]:
pred[0]

['a', ',', 'a', ',', 'an', 'mo', 'a', 'an', 'na', ',']

In [43]:
source_to_target_frequencies["scéal-is"]

3

In [39]:
target_frequencies["an"]

35500

In [45]:
t_f = {k: v for k, v in sorted(target_frequencies.items(), key=lambda item: item[1], reverse=True)}

In [49]:
print((t_f["a"] / sum(t_f.values())) * (source_to_target_frequencies["scéal-a"]))
print(((t_f["scéal"] / sum(t_f.values())) * (source_to_target_frequencies["scéal-scéal"])))

1.0082952896453006
0.223625438231258


In [23]:
from difflib import SequenceMatcher

x = "d'fhágamar"
y = "d'fhág"

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio(), len(min(x,y))/len(max(x,y))

similar(x,y)

(0.75, 0.6)