In [1]:
import pickle
import operator
from __future__ import print_function
import random

training_file = "data/training.zh-en"
lexicon_file = "lexicon"

In [2]:
top_n = 4 #INCLUDING
top_n_NULL = 3 #INCLUDING
sentence_size_value_constraint = 10 #INCLUDING
percentage_of_one_occurence_words_to_UNK = 0.8
UNK = True

## Loads the paired sentences from the data file

In [53]:
with open(training_file, encoding='utf8') as f:
    paired_sentences = f.read().splitlines()
    
print(len(paired_sentences))

44016


In [77]:
source_TF = {}

for pair in paired_sentences:
    pair_sentence = pair.split(' ||| ')
    chinese_side = pair_sentence[0].split(' ')
    for source_word in chinese_side:
        if source_word not in source_TF:
            source_TF[source_word] = 1
        else:
            source_TF[source_word] += 1

one_word_occurrences = []

for key in source_TF:
    if source_TF[key] == 1:
        one_word_occurrences.append(key)

#print(len(list(source_TF.keys())))
number_of_one_word_occurrences = len(one_word_occurrences)
#print(number_of_one_word_occurrences)
number_of_words_mapped_to_UNK = int(number_of_one_word_occurrences*percentage_of_one_occurence_words_to_UNK)
#print(number_of_words_mapped_to_UNK)

random.shuffle(one_word_occurrences)
#print(one_word_occurrences)

words_mapped_to_UNK = one_word_occurrences[:number_of_words_mapped_to_UNK]
print(words_mapped_to_UNK[0])

汗牛充栋


## Load the estimated IBM1 translation probabilities from the lexicon file

In [78]:
with open(lexicon_file, encoding='utf8') as f:
    dictionary_lines = f.read().splitlines()


translation_probs_ZH_to_EN = {}
translation_probs_EN_to_ZH = {}

for line in dictionary_lines:
    entries = line.split(' ')
    if (entries[0] not in translation_probs_ZH_to_EN and entries[0] not in words_mapped_to_UNK):
        translation_probs_ZH_to_EN[entries[0]] = {}
    if (entries[1] not in translation_probs_EN_to_ZH):
        translation_probs_EN_to_ZH[entries[1]] = {}
    if (entries[2] != "NA" and entries[0] not in words_mapped_to_UNK):
        translation_probs_ZH_to_EN[entries[0]][entries[1]] = float(entries[2])
    if (entries[3] != "NA"):
        translation_probs_EN_to_ZH[entries[1]][entries[0]] = float(entries[3])

In [56]:
# TEST CELL

#translation_probs_ZH_to_EN["<NULL>"]
print(translation_probs_EN_to_ZH["<NULL>"]["在"])
print(list(translation_probs_ZH_to_EN["<NULL>"])[:20]) # NOT ORDERED BY MOST LIKELY
print(list(translation_probs_ZH_to_EN[words_mapped_to_UNK[0]])[:20]) # NOT ORDERED BY MOST LIKELY

0.0052879005670547485
['wiper', 'au', 'shin', 'produced', 'confirm', 'spouses', 'two-year', 'climbs', 'ache', 'defective', 'norm', 'pants', 'oranges', 'shaded', 'gallons', 'souvenirs', 'slim', 'presently', 'magnifying', 'finest']


KeyError: '服侍'

## Select only the top_n most likely translations for each source word (and the top_n_NULL most likely translations for the NULL symbol)

In [79]:
top_n_translation_probs_ZH_to_EN = {}

for entry in translation_probs_ZH_to_EN:
    if entry != '<NULL>':
        entry_key = entry
        new_entry = dict(sorted(translation_probs_ZH_to_EN[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n])
    else:
        #entry_key = '-EPS-'
        entry_key = '<NULL>'
        new_entry = dict(sorted(translation_probs_ZH_to_EN[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n_NULL])
    
    top_n_translation_probs_ZH_to_EN[entry_key] = new_entry
    
    

top_n_translation_probs_EN_to_ZH = {}

for entry in translation_probs_EN_to_ZH:
    if entry != '<NULL>':
        entry_key = entry
        new_entry = dict(sorted(translation_probs_EN_to_ZH[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n])
    else:
        #entry_key = '-EPS-'
        entry_key = '<NULL>'
        new_entry = dict(sorted(translation_probs_EN_to_ZH[entry].items(), key=operator.itemgetter(1), reverse=True)[:top_n_NULL])
    
    top_n_translation_probs_EN_to_ZH[entry_key] = new_entry
    
    
pickle.dump(top_n_translation_probs_ZH_to_EN, open('data/top' + str(top_n) + '_topNULL' + str(top_n_NULL) +
                                                   '_%unseen' + str(percentage_of_one_occurence_words_to_UNK) +
                                                   '_translation_probs_ZH_to_EN.mem', 'wb'))
pickle.dump(top_n_translation_probs_EN_to_ZH, open('data/top' + str(top_n) + '_topNULL' + str(top_n_NULL) + '_translation_probs_EN_to_ZH.mem', 'wb'))

## Selects only those paired sentences whose target sentence length is smaller than the specified length constraint

In [58]:
size_constrained_paired_sentences = []

for pair in paired_sentences:
    pair_sentence = pair.split(' ||| ')
    chinese_side = pair_sentence[0].split(' ')
    english_side = pair_sentence[1].split(' ')
    if (len(english_side) <= sentence_size_value_constraint and len(chinese_side) <= sentence_size_value_constraint):
        size_constrained_paired_sentences.append(pair)
        
print("Number of remaining sentence pairs given target sentence length constraint: " +
      str(len(size_constrained_paired_sentences)))

Number of remaining sentence pairs given target sentence length constraint: 33255


## Conversion to -UNK- of unobserved types in the constrained top-n lexicon

In [80]:
if UNK:
    size_and_UNK_constrained_paired_sentences = []

    number_of_sentences = len(size_constrained_paired_sentences)

    for i, pair in enumerate(size_constrained_paired_sentences):
        pair_sentence = pair.split(' ||| ')
        chinese_side = pair_sentence[0].split(' ')
        english_side = pair_sentence[1].split(' ')


        english_side_set = set(english_side)

        chinese_UNK_sentence = []
        english_UNK_sentence = []

        
        set_of_possible_translations = []
        for key in top_n_translation_probs_ZH_to_EN['<NULL>']:
            set_of_possible_translations.append(key)
            
        for chinese_word in chinese_side:
            if chinese_word not in words_mapped_to_UNK:
                for key in top_n_translation_probs_ZH_to_EN[chinese_word]:
                    set_of_possible_translations.append(key)
                chinese_UNK_sentence.append(chinese_word)
            else:
                chinese_UNK_sentence.append('-UNK-')


        set_of_possible_translations = set(set_of_possible_translations)

        for english_word in english_side:
            if(english_word not in set_of_possible_translations):
                english_UNK_sentence.append('-UNK-')
            else:
                english_UNK_sentence.append(english_word)

        new_pair = chinese_UNK_sentence
        new_pair.append('|||')
        new_pair = new_pair + english_UNK_sentence

        new_pair = ' '.join(str(e) for e in new_pair)
        size_and_UNK_constrained_paired_sentences.append(new_pair)

        if (i % 10 == 0 or i + 1 == number_of_sentences):
            print('\r' + 'Converting to UNK... ' + str(100.0*(i+1)/number_of_sentences) + '% sentences processed so far.', end='')
            
else:
    print('UNK option was not selected! No conversion to UNK performed.')

Converting to UNK... 100.0% sentences processed so far.

## Determines the partition limits of the training data for dividing the original data file into 3 similar parts

In [81]:
if UNK:
    paired_sentences_to_use = size_and_UNK_constrained_paired_sentences
else:
    paired_sentences_to_use = size_constrained_paired_sentences
    
number_of_training_examples = len(paired_sentences_to_use)

if(number_of_training_examples % 3 == 0):
    number_of_subset_training_examples = number_of_training_examples/3
else:
    number_of_subset_training_examples12 = number_of_training_examples/3
    number_of_subset_training_examples3 = number_of_training_examples - 2*number_of_subset_training_examples12

# Creates 3 new training data files, which are basically disjoint sets of the original file and where their union is the entire training data (on constrained length)

In [82]:
if UNK:
    unk = '_UNK'
else:
    unk = '_noUNK'

low_limit = 0

# First Subset
if(number_of_training_examples % 3 == 0):
    high_limit = number_of_subset_training_examples
else:
    high_limit = number_of_subset_training_examples12

f = open('data/training_subset1_size' + str(sentence_size_value_constraint)+
         '_top'+str(top_n)+ '_topNULL'+str(top_n_NULL)+'_%unseen'+
         str(percentage_of_one_occurence_words_to_UNK)+unk+'.zh-en', 'w', encoding='utf8')
while(low_limit < high_limit):
    f.write(paired_sentences_to_use[low_limit])
    f.write('\n')
    low_limit = low_limit + 1
f.close()


# Second Subset
if(number_of_training_examples % 3 == 0):
    high_limit = 2*number_of_subset_training_examples
else:
    high_limit = 2*number_of_subset_training_examples12

f = open('data/training_subset2_size' + str(sentence_size_value_constraint)+
         '_top'+str(top_n)+ '_topNULL'+str(top_n_NULL)+'_%unseen'+
         str(percentage_of_one_occurence_words_to_UNK)+unk+'.zh-en', 'w', encoding='utf8')
while(low_limit < high_limit):
    f.write(paired_sentences_to_use[low_limit])
    f.write('\n')
    low_limit = low_limit + 1
f.close()


# Third Subset
if(number_of_training_examples % 3 == 0):
    high_limit = 3*number_of_subset_training_examples
else:
    high_limit = 2*number_of_subset_training_examples12 + number_of_subset_training_examples3

f = open('data/training_subset3_size' + str(sentence_size_value_constraint)+
         '_top'+str(top_n)+ '_topNULL'+str(top_n_NULL)+'_%unseen'+
         str(percentage_of_one_occurence_words_to_UNK)+unk+'.zh-en', 'w', encoding='utf8')
while(low_limit < high_limit):
    f.write(paired_sentences_to_use[low_limit])
    f.write('\n')
    low_limit = low_limit + 1
f.close()

## Split the original training set into seperate chinese and english corpus (used for word2vec, maybe)

In [3]:
chinese_corpus = []
english_corpus = []

for pair in paired_sentences:
    pair_sentence = pair.split(' ||| ')
    chinese_corpus.append(pair_sentence[0])
    english_corpus.append(pair_sentence[1])
    
    
f = open('data/chinese.zh-en', 'w')
for entry in chinese_corpus:
    f.write(entry)
    f.write('\n')
f.close()


f = open('data/english.zh-en', 'w')
for entry in english_corpus:
    f.write(entry)
    f.write('\n')
f.close()

## Dev Test Part

In [9]:
dev_files = ["datamap/references_val/dev1.zh-en", "datamap/references_test/dev2.zh-en"]
references = ["datamap/references_val/reference", "datamap/references_test/reference"]
top_n_translation_probs_ZH_to_EN_path = "datamap/top5_topNULL2_%unseen0.8_translation_probs_ZH_to_EN.mem"

paired_sentences_dev = [0, 0]
chinese_sentences = [0, 0]
chinese_sentences_path = ["datamap/references_val/chinese_val.zh", "datamap/references_test/chinese_test.zh"]

In [4]:
with open(top_n_translation_probs_ZH_to_EN_path, 'rb') as pickle_file:
    top_n_translation_probs_ZH_to_EN = pickle.load(pickle_file)

In [12]:
import os

for i, file in enumerate(dev_files):
    with open(file, encoding='utf8') as f:
        paired_sentences_dev[i] = f.read().splitlines()


print(paired_sentences_dev[0][12])
max_number_of_possible_translations = [0, 0]
for l, dev_set in enumerate(paired_sentences_dev):
    for i, translation in enumerate(dev_set):
        dev_set[i] = translation.split(' ||| ')
        for j, sentence in enumerate(dev_set[i]):
            if j > max_number_of_possible_translations[l]:
                max_number_of_possible_translations[l] = j
            if (j == 0):
                sentence_set_of_possible_translations = []
            dev_set[i][j] = sentence.split(' ')
            for k, word in enumerate(dev_set[i][j]):
                if (j == 0):
                    for key in top_n_translation_probs_ZH_to_EN['<NULL>']:
                        sentence_set_of_possible_translations.append(key)
                    if word not in top_n_translation_probs_ZH_to_EN:
                        dev_set[i][j][k] = "-UNK-"
                    else:
                        for key in top_n_translation_probs_ZH_to_EN[word]:
                            sentence_set_of_possible_translations.append(key)
                    sentence_set_of_possible_translations = list(set(sentence_set_of_possible_translations))
                else:
                    if word not in sentence_set_of_possible_translations:
                        dev_set[i][j][k] = "-UNK-"
                        
            #if (l == 0 and i == 12 and j == 0):
            #    print(sentence_set_of_possible_translations)

            
            
            
for l, dev_set in enumerate(paired_sentences_dev):
    for i, translation in enumerate(dev_set):
        for j, sentence in enumerate(dev_set[i]):
            if j == 0:
                filename = chinese_sentences_path[l]
                
                if os.path.exists(filename):
                    os.remove(filename)
                continue
            
            filename = references[l] + str(j)

            if os.path.exists(filename):
                os.remove(filename)
                
        if j < max_number_of_possible_translations[l]:
            for k in range(max_number_of_possible_translations[l]-j-1):
                filename = references[l] + str(j+k+1)

                if os.path.exists(filename):
                    os.remove(filename)
                    
                    

            
for l, dev_set in enumerate(paired_sentences_dev):
    for i, translation in enumerate(dev_set):
        for j, sentence in enumerate(dev_set[i]):
            if j == 0:
                chinese_sentence = ' '.join(str(word) for word in sentence) + '\n'
                filename = chinese_sentences_path[l]
                
                if os.path.exists(filename):
                    append_write = 'a' # append if already exists
                else:
                    append_write = 'w' # make a new file if not

                with open(filename, append_write) as reference_file:
                    reference_file.write(chinese_sentence)
                continue
            
            reference_translation = ' '.join(str(word) for word in sentence) + '\n'
            
            filename = references[l] + str(j)

            if os.path.exists(filename):
                append_write = 'a' # append if already exists
            else:
                append_write = 'w' # make a new file if not

            with open(filename, append_write) as reference_file:
                reference_file.write(reference_translation)
                
        if j < max_number_of_possible_translations[l]:
            for k in range(max_number_of_possible_translations[l]-j-1):
                reference_translation = '\n'
            
                filename = references[l] + str(j+k+1)

                if os.path.exists(filename):
                    append_write = 'a' # append if already exists
                else:
                    append_write = 'w' # make a new file if not

                with open(filename, append_write) as reference_file:
                    reference_file.write(reference_translation)

    
print(max_number_of_possible_translations[0])
print(max_number_of_possible_translations[1])

print(len(paired_sentences_dev[0]))
print(len(paired_sentences_dev[1]))

我 很 讨厌 青椒 。 ||| i hate green peppers . ||| i despise green peppers . ||| i can 't stand green peppers . ||| i just plain hate green peppers . ||| i detest green peppers . ||| i can 't stand green peppers . ||| green peppers make me sick . ||| i can 't stand green peppers . ||| i don 't like green peppers , at all . ||| green peppers are something i simply can 't stand . ||| i don 't like green peppers . ||| i dislike green peppers . ||| i 'm not keen on green peppers . ||| i dislike green peppers . ||| i don 't like green peppers . ||| green peppers taste terrible .
16
16
500
506


In [43]:
print(paired_sentences_dev[0][12])

[['我', '很', '讨厌', '-UNK-', '。'], ['i', 'hate', '-UNK-', '-UNK-', '.'], ['i', '-UNK-', '-UNK-', '-UNK-', '.'], ['i', '-UNK-', "'t", '-UNK-', '-UNK-', '-UNK-', '.'], ['i', '-UNK-', '-UNK-', 'hate', '-UNK-', '-UNK-', '.'], ['i', '-UNK-', '-UNK-', '-UNK-', '.'], ['i', '-UNK-', "'t", '-UNK-', '-UNK-', '-UNK-', '.'], ['-UNK-', '-UNK-', '-UNK-', '-UNK-', '-UNK-', '.'], ['i', '-UNK-', "'t", '-UNK-', '-UNK-', '-UNK-', '.'], ['i', 'don', "'t", '-UNK-', '-UNK-', '-UNK-', ',', '-UNK-', '-UNK-', '.'], ['-UNK-', '-UNK-', '-UNK-', '-UNK-', 'i', '-UNK-', '-UNK-', "'t", '-UNK-', '.'], ['i', 'don', "'t", '-UNK-', '-UNK-', '-UNK-', '.'], ['i', 'dislike', '-UNK-', '-UNK-', '.'], ['i', "'m", 'not', '-UNK-', '-UNK-', '-UNK-', '-UNK-', '.'], ['i', 'dislike', '-UNK-', '-UNK-', '.'], ['i', 'don', "'t", '-UNK-', '-UNK-', '-UNK-', '.'], ['-UNK-', '-UNK-', '-UNK-', '-UNK-', '.']]


In [6]:
import subprocess
number_of_reference_files = 3
references = ""
for i in range(number_of_reference_files):
    references += " reference" + str(i)
    
print(references)

output = subprocess.check_output("perl multi-bleu.perl -lc" + references + " < hypotheses", shell=True)
print("program output: " + str(output))
output = str(output)
print(output[2:][:-3])
output = output[2:][:-3].split(' ')
print(output)
for i, value in enumerate(output):
    output[i] = float(value)
print(output)


 reference0 reference1 reference2
program output: b'90.36 100.00 100.00 100.00 66.67\n'
90.36 100.00 100.00 100.00 66.67
['90.36', '100.00', '100.00', '100.00', '66.67']
[90.36, 100.0, 100.0, 100.0, 66.67]


In [83]:
elems = [1,2]

for i in range(3):
    for j, ele in enumerate(elems):
        print(j)
    if j < 4:
        for k in range(4-j-1):
            print(j+k+1)
    

0
1
2
3
0
1
2
3
0
1
2
3
