In [1]:
import pickle as pickle
import operator
from __future__ import print_function

# Parallel corpus: read line by line later on; each line is split on ' ||| ' into a
# Chinese side (first part) and an English side (second part).
training_file = "data/training.zh-en"
# Lexicon: space-separated records whose first two fields are used as the Chinese and
# English word and whose third/fourth fields are parsed as floats unless they are "NA".
lexicon_file = "lexicon"

In [2]:
# Number of highest-scoring translation candidates kept for every ordinary word
# when the lexicon is pruned (the pruning slice keeps exactly this many entries).
top_n = 5 # inclusive
# Same limit, applied to the special '<NULL>' entry only.
top_n_NULL = 5 # inclusive
# Maximum number of space-separated tokens allowed on EACH side of a sentence pair;
# pairs are kept when both sides have at most this many tokens.
sentence_size_value_constraint = 10 # inclusive
# When True the UNK-conversion cell rewrites tokens to '-UNK-'; when False that
# cell only prints a notice and the length-filtered pairs are used unchanged.
UNK = False

## Load the estimated IBM1 translation probabilities from the lexicon file

In [3]:
# Read the whole lexicon into memory, one record per list element.
with open(lexicon_file, encoding='utf8') as f:
    dictionary_lines = f.read().splitlines()


# Nested lookup tables: outer key is the word on one side, inner key is the word on
# the other side, value is the float score read from the lexicon.
translation_probs_ZH_to_EN = {}
translation_probs_EN_to_ZH = {}

for line in dictionary_lines:
    fields = line.split(' ')
    # An (initially empty) inner table is created for both words even when the
    # corresponding score is "NA", so every word seen in the lexicon has an entry.
    zh_to_en_row = translation_probs_ZH_to_EN.setdefault(fields[0], {})
    en_to_zh_row = translation_probs_EN_to_ZH.setdefault(fields[1], {})
    # Third field -> ZH-to-EN table, fourth field -> EN-to-ZH table; "NA" means no score.
    if fields[2] != "NA":
        zh_to_en_row[fields[1]] = float(fields[2])
    if fields[3] != "NA":
        en_to_zh_row[fields[0]] = float(fields[3])

In [4]:
# TEST CELL
# Spot-check the loaded tables through their '<NULL>' entries.

null_row_EN_to_ZH = translation_probs_EN_to_ZH["<NULL>"]
print(null_row_EN_to_ZH["在"])

null_candidates_ZH_to_EN = list(translation_probs_ZH_to_EN["<NULL>"])
print(null_candidates_ZH_to_EN[:20]) # NOT ORDERED BY MOST LIKELY

0.0052879005670547485
['wiper', 'au', 'shin', 'produced', 'confirm', 'spouses', 'two-year', 'climbs', 'ache', 'defective', 'norm', 'pants', 'oranges', 'shaded', 'gallons', 'souvenirs', 'slim', 'presently', 'magnifying', 'finest']


## Select only the top_n most likely translations for each source word (and the top_n_NULL most likely translations for the NULL symbol)

In [5]:
def _keep_top_translations(translation_probs, top_n, top_n_NULL):
    """Return a pruned copy of a nested translation table.

    For every outer key, only the highest-scoring inner entries are kept:
    `top_n_NULL` entries for the special '<NULL>' key and `top_n` entries for
    every other key. Entries are ranked by score in descending order; ties keep
    their original relative order because `sorted` is stable. The '<NULL>' key
    is kept under the same name in the result.
    """
    pruned = {}
    for source_word, candidates in translation_probs.items():
        limit = top_n_NULL if source_word == '<NULL>' else top_n
        ranked = sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)
        pruned[source_word] = dict(ranked[:limit])
    return pruned


top_n_translation_probs_ZH_to_EN = _keep_top_translations(translation_probs_ZH_to_EN, top_n, top_n_NULL)
top_n_translation_probs_EN_to_ZH = _keep_top_translations(translation_probs_EN_to_ZH, top_n, top_n_NULL)

# Persist both pruned tables. The files are opened in a `with` block so the handles
# are flushed and closed even if pickling fails part-way through.
for direction, pruned_table in (('ZH_to_EN', top_n_translation_probs_ZH_to_EN),
                                ('EN_to_ZH', top_n_translation_probs_EN_to_ZH)):
    pickle_path = ('data/top' + str(top_n) + '_topNULL' + str(top_n_NULL) +
                   '_translation_probs_' + direction + '.mem')
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(pruned_table, pickle_file)

## Loads the paired sentences from the data file

In [6]:
# Load the parallel corpus; each list element is one raw line of the training file.
with open(training_file, encoding='utf8') as corpus_file:
    corpus_text = corpus_file.read()

paired_sentences = corpus_text.splitlines()

print(len(paired_sentences))

44016


## Selects only those sentence pairs in which both the source and the target sentence are no longer than the specified length constraint

In [7]:
# Keep a pair only when BOTH sides have at most `sentence_size_value_constraint`
# space-separated tokens; the kept elements are the original, unsplit lines.
size_constrained_paired_sentences = []

for pair in paired_sentences:
    sides = pair.split(' ||| ')
    english_length = len(sides[1].split(' '))
    chinese_length = len(sides[0].split(' '))
    # Both sides fit exactly when the longer of the two fits.
    if max(english_length, chinese_length) <= sentence_size_value_constraint:
        size_constrained_paired_sentences.append(pair)

remaining_pairs = len(size_constrained_paired_sentences)
print("Number of remaining sentence pairs given target sentence length constraint: " +
      str(remaining_pairs))

Number of remaining sentence pairs given target sentence length constraint: 33255


## Conversion to -UNK- of unobserved types in the constrained top-n lexicon

In [8]:
# THIS NEEDS TO BE CHECKED AND IMPROVED
# For every length-filtered pair:
#   * a Chinese token becomes '-UNK-' when none of its pruned ZH->EN candidates
#     occurs in the English side of the same pair;
#   * an English token becomes '-UNK-' when it is not a pruned candidate of any
#     Chinese token in the same pair.
# A Chinese token missing from the pruned lexicon raises KeyError.
if UNK:
    target_size_and_UNK_constrained_paired_sentences = []
    number_of_sentences = len(size_constrained_paired_sentences)

    for i, pair in enumerate(size_constrained_paired_sentences):
        sides = pair.split(' ||| ')
        source_tokens = sides[0].split(' ')
        target_tokens = sides[1].split(' ')
        target_vocabulary = set(target_tokens)

        # Union of the candidates of all Chinese tokens in this sentence.
        sentence_candidates = set()
        masked_source = []
        for source_word in source_tokens:
            word_candidates = set(top_n_translation_probs_ZH_to_EN[source_word])
            sentence_candidates |= word_candidates
            if word_candidates & target_vocabulary:
                masked_source.append(source_word)
            else:
                masked_source.append('-UNK-')

        masked_target = [
            target_word if target_word in sentence_candidates else '-UNK-'
            for target_word in target_tokens
        ]

        # Re-assemble the pair in the same "<zh> ||| <en>" layout as the input.
        target_size_and_UNK_constrained_paired_sentences.append(
            ' '.join(masked_source + ['|||'] + masked_target))

        # Refresh the progress line every 10 sentences and on the last one.
        is_last_sentence = (i + 1 == number_of_sentences)
        if i % 10 == 0 or is_last_sentence:
            percent_done = 100.0*(i+1)/number_of_sentences
            print('\r' + 'Converting to UNK... ' + str(percent_done) + '% sentences processed so far.', end='')

else:
    print('UNK option was not selected! No conversion to UNK performed.')

UNK option was not selected! No conversion to UNK performed.


## Determines the partition limits of the training data for dividing the original data file into 3 similar parts

In [9]:
# Choose the sentence list to partition. The UNK-conversion cell stores its result in
# `target_size_and_UNK_constrained_paired_sentences`, so that is the name read here
# (the previously referenced `size_and_UNK_constrained_paired_sentences` is never
# defined and raised NameError whenever UNK was True).
if UNK:
    paired_sentences_to_use = target_size_and_UNK_constrained_paired_sentences
else:
    paired_sentences_to_use = size_constrained_paired_sentences

number_of_training_examples = len(paired_sentences_to_use)

# Floor division keeps the subset sizes integral under Python 3 (where `/` is true
# division and would yield floats such as 3.333...). When the total is not a
# multiple of 3, the first two subsets get floor(n / 3) items and the third subset
# gets the remainder.
if number_of_training_examples % 3 == 0:
    number_of_subset_training_examples = number_of_training_examples // 3
else:
    number_of_subset_training_examples12 = number_of_training_examples // 3
    number_of_subset_training_examples3 = number_of_training_examples - 2 * number_of_subset_training_examples12

## Creates 3 new training data files: disjoint subsets of the length-constrained training data whose union is the entire length-constrained set

In [10]:
# Suffix recording whether the sentences went through UNK conversion.
unk = '_UNK' if UNK else '_noUNK'

# Size of the first two subsets. int() guards against a non-integral value coming
# from true division, so the bounds below are always valid slice indices.
if number_of_training_examples % 3 == 0:
    subset_size = int(number_of_subset_training_examples)
else:
    subset_size = int(number_of_subset_training_examples12)

# Exclusive upper bounds of the three consecutive, disjoint slices. The last bound is
# the total count, so the third subset absorbs any remainder and no pair is dropped.
subset_high_limits = [subset_size, 2 * subset_size, number_of_training_examples]

low_limit = 0
for subset_number, high_limit in enumerate(subset_high_limits, start=1):
    subset_path = ('data/training_subset' + str(subset_number) + '_size' + str(sentence_size_value_constraint) +
                   '_top' + str(top_n) + '_topNULL' + str(top_n_NULL) + unk + '.zh-en')
    # `with` closes the file even if a write fails part-way through.
    with open(subset_path, 'w', encoding='utf8') as f:
        for sentence_pair in paired_sentences_to_use[low_limit:high_limit]:
            f.write(sentence_pair)
            f.write('\n')
    # The next subset starts where this one ended.
    low_limit = high_limit

## Split the original training set into separate Chinese and English corpora (used for word2vec, maybe)

In [3]:
# Split every raw pair into its Chinese (first) and English (second) side.
chinese_corpus = []
english_corpus = []

for pair in paired_sentences:
    pair_sentence = pair.split(' ||| ')
    chinese_corpus.append(pair_sentence[0])
    english_corpus.append(pair_sentence[1])


# Write one sentence per line. The encoding is given explicitly, matching every other
# file access in this notebook; without it the platform's locale encoding is used,
# which may be unable to represent the Chinese text. `with` guarantees the files are
# closed even if a write fails.
for corpus_path, corpus in (('data/chinese.zh-en', chinese_corpus),
                            ('data/english.zh-en', english_corpus)):
    with open(corpus_path, 'w', encoding='utf8') as f:
        for entry in corpus:
            f.write(entry)
            f.write('\n')

## EXTRA CELLS FOR MOSTLY USELESS STUFF (just for checking, currently)

In [7]:
# Reports, over the unfiltered corpus:
#   * the token count and tokens of the longest English sentence (first one wins on ties);
#   * how many pairs exceed the length constraint on either side, and how many remain.


max_ = 0
count = 0
for pair in paired_sentences:
    sides = pair.split(' ||| ')

    english_side = sides[1].split(' ')
    chinese_side = sides[0].split(' ')
    english_length = len(english_side)

    # Strict comparison: a later sentence of equal length does not replace the record.
    if english_length > max_:
        max_, max_sentence = english_length, english_side

    # A pair is excluded as soon as one of its sides is too long.
    longer_side_length = max(english_length, len(chinese_side))
    if longer_side_length > sentence_size_value_constraint:
        count += 1

remaining = len(paired_sentences) - count

print("Maximum target sentence length: " + str(max_))
print(max_sentence)
print("\n" + "Number of excluded sentence pairs given target sentence length constraint: " + str(count))
print("Number of remaining sentence pairs given target sentence length constraint: " +
      str(remaining))

Maximum target sentence length: 65
['"', 'y', '"', 'as', 'in', 'yokohama', ',', '"', 'a', '"', 'as', 'in', 'america', ',', '"', 'm', '"', 'as', 'in', 'mexico', ',', '"', 'a', '"', 'as', 'in', 'america', ',', '"', 'g', '"', 'as', 'in', 'germany', ',', '"', 'u', '"', 'as', 'in', 'union', ',', '"', 'c', '"', 'as', 'in', 'china', ',', '"', 'h', '"', 'as', 'in', 'hong', 'kong', ',', 'and', '"', 'i', '"', 'as', 'in', 'italy', '.']

Number of excluded sentence pairs given target sentence length constraint: 10761
Number of remaining sentence pairs given target sentence length constraint: 33255
