In [2]:
# `from __future__` imports must precede every other statement once this cell
# is executed as a plain module/script (e.g. after exporting the notebook).
from __future__ import print_function

import operator

try:
    import cPickle  # Python 2: C-accelerated pickle module
except ImportError:
    # Python 3 has no `cPickle`; bind the standard `pickle` module to the same
    # name so any `cPickle.*` call elsewhere keeps working.
    import pickle as cPickle

# Parallel corpus: one sentence pair per line, Chinese and English sides
# separated by " ||| " (see the cells that split on that separator).
training_file = "data/training.zh-en"
# Lexicon with space-separated fields per line, parsed by the loading cell.
lexicon_file = "lexicon"

In [2]:
# Load the estimated IBM1 translation probabilities from the lexicon file.
#
# Each line is split on single spaces into (source word, target word,
# probability). Pairs whose probability field is the literal "NA" are not
# stored, but their source word still gets an (empty) entry.

with open(lexicon_file) as f:
    dictionary_lines = f.read().splitlines()


translation_probs_ZH_to_EN = {}

for line_number, line in enumerate(dictionary_lines, 1):
    if not line.strip():
        # Blank lines carry no entry; indexing their fields would raise IndexError.
        continue

    entries = line.split(' ')
    if len(entries) < 3:
        # Fail with a message that points at the offending line instead of an
        # opaque IndexError.
        raise ValueError("Malformed lexicon line %d: %r" % (line_number, line))

    # setdefault registers the source word on first sight and returns its
    # target-word -> probability mapping.
    target_probs = translation_probs_ZH_to_EN.setdefault(entries[0], {})
    if entries[2] != "NA":
        target_probs[entries[1]] = float(entries[2])

In [30]:
# Select only the top_n most likely translations for each source (Chinese) word

top_n = 5

top_n_translation_probs_ZH_to_EN = {}

# .items() is available on both Python 2 and Python 3 (iteritems() exists on
# Python 2 only) and yields the same (key, value) pairs.
for source_word, target_probs in translation_probs_ZH_to_EN.items():
    # Rank the (target word, probability) pairs by probability, highest first,
    # and keep at most top_n of them.
    ranked_translations = sorted(target_probs.items(), key=operator.itemgetter(1), reverse=True)
    top_n_translation_probs_ZH_to_EN[source_word] = dict(ranked_translations[:top_n])

In [31]:
# Inspect the translations kept for the source word "在"; as the last
# expression of the cell, the resulting dict is shown as the cell output.
top_n_translation_probs_ZH_to_EN["在"]

{'at': 0.07681587338447571,
 'i': 0.07119250297546387,
 'in': 0.11082739382982254,
 'is': 0.10617364197969437,
 'the': 0.4110737442970276}

In [3]:
# Read the training file into a list with one sentence pair (one line) per
# element, then report how many pairs were read.

with open(training_file) as corpus_handle:
    paired_sentences = corpus_handle.read().splitlines()

print(len(paired_sentences))

44016


In [25]:
# Report the longest target (English) sentence, and how many sentence pairs
# have a target side longer than the length limit defined here.

# Maximum number of target tokens; a sentence of exactly this length is kept.
target_sentence_size_value_constraint = 5

max_ = 0
max_sentence = []  # bound up front so the report below also works on an empty corpus
count = 0
for pair in paired_sentences:
    # Each pair is "<chinese> ||| <english>"; tokens are separated by single spaces.
    pair_sentence = pair.split(' ||| ')
    english_side = pair_sentence[1].split(' ')
    target_length = len(english_side)
    if target_length > max_:
        # Strict comparison: on ties the first sentence of maximal length is kept.
        max_ = target_length
        max_sentence = english_side
    if target_length > target_sentence_size_value_constraint:
        count += 1

print("Maximum target sentence length: " + str(max_))
print(max_sentence)
print("\n" + "Number of excluded sentence pairs given target sentence length constraint: " + str(count))
print("Number of remaining sentence pairs given target sentence length constraint: " +
      str(len(paired_sentences) - count))

Maximum target sentence length: 65
['"', 'y', '"', 'as', 'in', 'yokohama', ',', '"', 'a', '"', 'as', 'in', 'america', ',', '"', 'm', '"', 'as', 'in', 'mexico', ',', '"', 'a', '"', 'as', 'in', 'america', ',', '"', 'g', '"', 'as', 'in', 'germany', ',', '"', 'u', '"', 'as', 'in', 'union', ',', '"', 'c', '"', 'as', 'in', 'china', ',', '"', 'h', '"', 'as', 'in', 'hong', 'kong', ',', 'and', '"', 'i', '"', 'as', 'in', 'italy', '.']

Number of excluded sentence pairs given target sentence length constraint: 35119
Number of remaining sentence pairs given target sentence length constraint: 8897


In [26]:
# Keep only the sentence pairs whose target (English) side has at most
# target_sentence_size_value_constraint tokens (the limit itself is included)

target_size_constrained_paired_sentences = [
    pair
    for pair in paired_sentences
    if len(pair.split(' ||| ')[1].split(' ')) <= target_sentence_size_value_constraint
]

remaining_pairs = len(target_size_constrained_paired_sentences)
print("Number of remaining sentence pairs given target sentence length constraint: " +
      str(remaining_pairs))

Number of remaining sentence pairs given target sentence length constraint: 8897


In [32]:
# Conversion to -UNK- of unobserved types in the constrained top-n lexicon.
#
# For every length-constrained sentence pair:
#   * a Chinese word is kept only if at least one of its top-n translations
#     occurs in the paired English sentence;
#   * an English word is kept only if it is a top-n translation of some
#     Chinese word of the paired sentence.
# Every other token is replaced by '-UNK-'. The converted pair is stored as
# "<chinese tokens> ||| <english tokens>".


target_size_and_UNK_constrained_paired_sentences = []

number_of_sentences = len(target_size_constrained_paired_sentences)

for i, pair in enumerate(target_size_constrained_paired_sentences):
    pair_sentence = pair.split(' ||| ')
    chinese_side = pair_sentence[0].split(' ')
    english_side = pair_sentence[1].split(' ')

    english_side_set = set(english_side)

    chinese_UNK_sentence = []
    # Union of the top-n translations of all Chinese words in this sentence.
    set_of_possible_translations = set()

    for chinese_word in chinese_side:
        # A word missing from the lexicon has no candidate translations, so it
        # is mapped to -UNK- instead of aborting the whole run with a KeyError.
        possible_translations_for_chinese_word = set(
            top_n_translation_probs_ZH_to_EN.get(chinese_word, ()))
        set_of_possible_translations |= possible_translations_for_chinese_word

        if possible_translations_for_chinese_word.isdisjoint(english_side_set):
            chinese_UNK_sentence.append('-UNK-')
        else:
            chinese_UNK_sentence.append(chinese_word)

    english_UNK_sentence = [
        english_word if english_word in set_of_possible_translations else '-UNK-'
        for english_word in english_side
    ]

    new_pair = ' '.join(chinese_UNK_sentence + ['|||'] + english_UNK_sentence)
    target_size_and_UNK_constrained_paired_sentences.append(new_pair)

    # Progress report every 10 sentences and on the last one; '\r' with end=''
    # rewrites the same console line instead of appending new ones.
    if (i % 10 == 0 or i + 1 == number_of_sentences):
        print('\r' + 'Converting to UNK... ' + str(100.0*(i+1)/number_of_sentences) + '% sentences processed so far.', end='')

Converting to UNK... 0.0112397437338% sentences processed so far.Converting to UNK... 0.123637181072% sentences processed so far.Converting to UNK... 0.236034618411% sentences processed so far.Converting to UNK... 0.348432055749% sentences processed so far.Converting to UNK... 0.460829493088% sentences processed so far.Converting to UNK... 0.573226930426% sentences processed so far.Converting to UNK... 0.685624367764% sentences processed so far.Converting to UNK... 0.798021805103% sentences processed so far.Converting to UNK... 0.910419242441% sentences processed so far.Converting to UNK... 1.02281667978% sentences processed so far.Converting to UNK... 1.13521411712% sentences processed so far.Converting to UNK... 1.24761155446% sentences processed so far.Converting to UNK... 1.36000899179% sentences processed so far.Converting to UNK... 1.47240642913% sentences processed so far.Converting to UNK... 1.58480386647% sentences processed so far.Converting to UNK... 1.697201

In [33]:
# Determines the partition sizes of the training data
# for dividing the converted sentence pairs into 3 similar parts.
#
# The first two parts each get floor(n / 3) pairs and the third part gets the
# remainder, so the three sizes always sum to n.


number_of_training_examples = len(target_size_and_UNK_constrained_paired_sentences)

# `//` is integer division on both Python 2 and Python 3; plain `/` would
# produce float (and, when n is not divisible by 3, wrong) boundaries on Python 3.
number_of_subset_training_examples12 = number_of_training_examples // 3
number_of_subset_training_examples3 = number_of_training_examples - 2*number_of_subset_training_examples12

# All three names are always (re)bound, so re-running this cell after the data
# changed can never leave a stale value behind. When n is divisible by 3 all
# parts have this common size.
number_of_subset_training_examples = number_of_subset_training_examples12

In [34]:
# Creates 3 new training data files, which are
# disjoint, consecutive slices of the converted sentence pairs
# and whose union is the entire training data (on constrained length).
#
# Subsets 1 and 2 each hold floor(n / 3) pairs; subset 3 holds the rest.


# Integer division on both Python 2 and Python 3.
subset_size = number_of_training_examples // 3

# (first index, one-past-last index) of each subset, in file order.
subset_limits = [
    (0, subset_size),
    (subset_size, 2*subset_size),
    (2*subset_size, number_of_training_examples),
]

for subset_number, (low_limit, high_limit) in enumerate(subset_limits, 1):
    subset_path = ('data/training_subset' + str(subset_number) +
                   '_size' + str(target_sentence_size_value_constraint) +
                   '_top' + str(top_n) + '.zh-en')
    # `with` closes the file even if a write fails part-way through.
    with open(subset_path, 'w') as f:
        for sentence_pair in target_size_and_UNK_constrained_paired_sentences[low_limit:high_limit]:
            f.write(sentence_pair)
            f.write('\n')

In [4]:
# Split the original training set into separate Chinese and English corpora,
# each written one sentence per line to its own file.

chinese_corpus = []
english_corpus = []

for pair in paired_sentences:
    # Each pair is "<chinese> ||| <english>".
    pair_sentence = pair.split(' ||| ')
    chinese_corpus.append(pair_sentence[0])
    english_corpus.append(pair_sentence[1])


corpus_outputs = (
    ('data/chinese.zh-en', chinese_corpus),
    ('data/english.zh-en', english_corpus),
)

for corpus_path, corpus in corpus_outputs:
    # `with` closes the file even if a write fails, matching how the input
    # files are opened in the loading cells.
    with open(corpus_path, 'w') as f:
        for entry in corpus:
            f.write(entry)
            f.write('\n')