In [1]:
import pickle as pickle
import operator
from __future__ import print_function

training_file = "data/training.zh-en"
lexicon_file = "lexicon"

In [2]:
# Load the estimated IBM1 translation probabilities from the lexicon file

with open(lexicon_file, encoding='utf8') as f:
    dictionary_lines = f.read().splitlines()


translation_probs_ZH_to_EN = {}
translation_probs_EN_to_ZH = {}

for line in dictionary_lines:
    entries = line.split(' ')
    if (entries[0] not in translation_probs_ZH_to_EN):
        translation_probs_ZH_to_EN[entries[0]] = {}
    if (entries[1] not in translation_probs_EN_to_ZH):
        translation_probs_EN_to_ZH[entries[1]] = {}
    if (entries[2] != "NA"):
        translation_probs_ZH_to_EN[entries[0]][entries[1]] = float(entries[2])
    if (entries[3] != "NA"):
        translation_probs_EN_to_ZH[entries[1]][entries[0]] = float(entries[3])

In [5]:
#translation_probs_ZH_to_EN["<NULL>"]
print(translation_probs_EN_to_ZH["<NULL>"]["在"])
print(list(translation_probs_ZH_to_EN["<NULL>"])[:20]) # NOT ORDERED BY MOST LIKELY

0.0052879005670547485
['carousel', 'thrift', 'pin', 'copper', 'vacant', 'r2', 'walking', 'elastic', 'reconfirm', 'except', 'perugino', 'thinning', 'sitter', 'kitchen', 'equinoctial', 'hotels', 'expressed', 'tonight', 'swear', 'transporting']


In [5]:
# Select only the top_n most likely translation for each source (Chinese) word

top_n = 5
top_n_NULL = 50

top_n_translation_probs_ZH_to_EN = {}

for entry in translation_probs_ZH_to_EN:
    if entry != '<NULL>':
        entry_key = entry
        new_entry = dict(sorted(translation_probs_ZH_to_EN[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n])
    else:
        entry_key = '-EPS-'
        new_entry = dict(sorted(translation_probs_ZH_to_EN[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n_NULL])
    
    top_n_translation_probs_ZH_to_EN[entry_key] = new_entry
    
    

top_n_translation_probs_EN_to_ZH = {}

for entry in translation_probs_EN_to_ZH:
    if entry != '<NULL>':
        entry_key = entry
        new_entry = dict(sorted(translation_probs_EN_to_ZH[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n])
    else:
        entry_key = '-EPS-'
        new_entry = dict(sorted(translation_probs_EN_to_ZH[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n_NULL])
    
    top_n_translation_probs_EN_to_ZH[entry_key] = new_entry
    
    
pickle.dump(top_n_translation_probs_ZH_to_EN, open('data/top' + str(top_n) + '_translation_probs_ZH_to_EN.mem', 'wb'))
pickle.dump(top_n_translation_probs_EN_to_ZH, open('data/top' + str(top_n) + '_translation_probs_EN_to_ZH.mem', 'wb'))

In [13]:
top_n_translation_probs_ZH_to_EN["-EPS-"]

{"'s": 0.03214159235358238,
 ',': 0.029292521998286247,
 '.': 0.204543799161911,
 '?': 0.04402594268321991,
 'a': 0.08496548235416412,
 'are': 0.005562338046729565,
 'be': 0.00978745985776186,
 'can': 0.0044654677622020245,
 'do': 0.009767396375536919,
 'for': 0.01755404844880104,
 'get': 0.0022550863213837147,
 'have': 0.019168412312865257,
 'i': 0.10233193635940552,
 'in': 0.00846295990049839,
 'is': 0.05820150300860405,
 'it': 0.06706574559211731,
 'like': 0.003205162938684225,
 'me': 0.009197896346449852,
 'of': 0.01084513496607542,
 'please': 0.015496926382184029,
 'that': 0.005250311456620693,
 'the': 0.11601653695106506,
 'to': 0.07063718140125275,
 'would': 0.008942347951233387,
 'you': 0.044999897480010986}

In [43]:
# Loads the paired sentences from the data file

with open(training_file, encoding='utf8') as f:
    paired_sentences = f.read().splitlines()
    
print(len(paired_sentences))

44016


In [56]:
# Calculating longest sentence size for target language
# Calculating number of target sentences with length greater than a certain value

sentence_size_value_constraint = 5 #INCLUDING

max_ = 0
count = 0
for pair in paired_sentences:
    pair_sentence = pair.split(' ||| ')
    
    english_side = pair_sentence[1].split(' ')
    chinese_side = pair_sentence[0].split(' ')
    if (len(english_side) > max_):
        max_ = len(english_side)
        max_sentence = english_side
    if len(english_side) > sentence_size_value_constraint or (len(chinese_side) > sentence_size_value_constraint):
        count = count + 1
        
print("Maximum target sentence length: " + str(max_))
print(max_sentence)
print("\n" + "Number of excluded sentence pairs given target sentence length constraint: " + str(count))
print("Number of remaining sentence pairs given target sentence length constraint: " + 
      str(len(paired_sentences) - count))

Maximum target sentence length: 65
['"', 'y', '"', 'as', 'in', 'yokohama', ',', '"', 'a', '"', 'as', 'in', 'america', ',', '"', 'm', '"', 'as', 'in', 'mexico', ',', '"', 'a', '"', 'as', 'in', 'america', ',', '"', 'g', '"', 'as', 'in', 'germany', ',', '"', 'u', '"', 'as', 'in', 'union', ',', '"', 'c', '"', 'as', 'in', 'china', ',', '"', 'h', '"', 'as', 'in', 'hong', 'kong', ',', 'and', '"', 'i', '"', 'as', 'in', 'italy', '.']

Number of excluded sentence pairs given target sentence length constraint: 36838
Number of remaining sentence pairs given target sentence length constraint: 7178


In [57]:
# Selects only those paired sentences whose target sentence length
# is smaller than the specified length constraint

size_constrained_paired_sentences = []

for pair in paired_sentences:
    pair_sentence = pair.split(' ||| ')
    english_side = pair_sentence[1].split(' ')
    chinese_side = pair_sentence[0].split(' ')
    if (len(english_side) <= sentence_size_value_constraint and len(chinese_side) <= sentence_size_value_constraint):
        size_constrained_paired_sentences.append(pair)
        
print("Number of remaining sentence pairs given target sentence length constraint: " +
      str(len(size_constrained_paired_sentences)))

Number of remaining sentence pairs given target sentence length constraint: 7178


In [51]:
# Conversion to -UNK- of unobserved types in the constrained top-n lexicon


target_size_and_UNK_constrained_paired_sentences = []

number_of_sentences = len(size_constrained_paired_sentences)

for i, pair in enumerate(size_constrained_paired_sentences):
    pair_sentence = pair.split(' ||| ')
    chinese_side = pair_sentence[0].split(' ')
    english_side = pair_sentence[1].split(' ')


    english_side_set = set(english_side)

    chinese_UNK_sentence = []
    english_UNK_sentence = []

    set_of_possible_translations = []
    for chinese_word in chinese_side:
        possible_translations_for_chinese_word = []
        for key in top_n_translation_probs_ZH_to_EN[chinese_word]:
            possible_translations_for_chinese_word.append(key)
            set_of_possible_translations.append(key)
        if(len(list(set(possible_translations_for_chinese_word) & english_side_set)) == 0):
            chinese_UNK_sentence.append('-UNK-')
        else:
            chinese_UNK_sentence.append(chinese_word)


    set_of_possible_translations = set(set_of_possible_translations)

    for english_word in english_side:
        if(english_word not in set_of_possible_translations):
            english_UNK_sentence.append('-UNK-')
        else:
            english_UNK_sentence.append(english_word)

    new_pair = chinese_UNK_sentence
    new_pair.append('|||')
    new_pair = new_pair + english_UNK_sentence

    new_pair = ' '.join(str(e) for e in new_pair)
    target_size_and_UNK_constrained_paired_sentences.append(new_pair)

    if (i % 10 == 0 or i + 1 == number_of_sentences):
        print('\r' + 'Converting to UNK... ' + str(100.0*(i+1)/number_of_sentences) + '% sentences processed so far.', end='')

Converting to UNK... 100.0% sentences processed so far.

In [52]:
# Determines the partition limits of the training data
# for dividing the original data file into 3 similar parts


number_of_training_examples = len(size_and_UNK_constrained_paired_sentences)

if(number_of_training_examples % 3 == 0):
    number_of_subset_training_examples = number_of_training_examples/3
else:
    number_of_subset_training_examples12 = number_of_training_examples/3
    number_of_subset_training_examples3 = number_of_training_examples - 2*number_of_subset_training_examples12

In [53]:
# Creates 3 new training data files, which are
# basically disjoint sets of the original file
# and where their union is the entire training data (on constrained length)


low_limit = 0

# First Subset
if(number_of_training_examples % 3 == 0):
    high_limit = number_of_subset_training_examples
else:
    high_limit = number_of_subset_training_examples12

f = open('data/training_subset1_size' + str(sentence_size_value_constraint) + 
         '_top' + str(top_n) + '.zh-en', 'w', encoding='utf8')
while(low_limit < high_limit):
    f.write(size_and_UNK_constrained_paired_sentences[low_limit])
    f.write('\n')
    low_limit = low_limit + 1
f.close()


# Second Subset
if(number_of_training_examples % 3 == 0):
    high_limit = 2*number_of_subset_training_examples
else:
    high_limit = 2*number_of_subset_training_examples12

f = open('data/training_subset2_size' + str(sentence_size_value_constraint) + 
         '_top' + str(top_n) + '.zh-en', 'w', encoding='utf8')
while(low_limit < high_limit):
    f.write(size_and_UNK_constrained_paired_sentences[low_limit])
    f.write('\n')
    low_limit = low_limit + 1
f.close()


# Third Subset
if(number_of_training_examples % 3 == 0):
    high_limit = 3*number_of_subset_training_examples
else:
    high_limit = 2*number_of_subset_training_examples12 + number_of_subset_training_examples3

f = open('data/training_subset3_size' + str(sentence_size_value_constraint) + 
         '_top' + str(top_n) + '.zh-en', 'w', encoding='utf8')
while(low_limit < high_limit):
    f.write(size_and_UNK_constrained_paired_sentences[low_limit])
    f.write('\n')
    low_limit = low_limit + 1
f.close()

In [40]:
# Split the original training set into seperate chinese and english corpus

chinese_corpus = []
english_corpus = []

for pair in paired_sentences:
    pair_sentence = pair.split(' ||| ')
    chinese_corpus.append(pair_sentence[0])
    english_corpus.append(pair_sentence[1])
    
    
f = open('data/chinese.zh-en', 'w')
for entry in chinese_corpus:
    f.write(entry)
    f.write('\n')
f.close()


f = open('data/english.zh-en', 'w')
for entry in english_corpus:
    f.write(entry)
    f.write('\n')
f.close()

UnicodeEncodeError: 'charmap' codec can't encode character '\u5728' in position 0: character maps to <undefined>