In [1]:
import random

# NOTE: Parts of this notebook can be reused, but remember to keep the data organized by language pair, so that the different language pairs can be weighted.

### Helper Functions

In [4]:
def open_file(path):
    '''
    Reads the text file at path and returns its lines as a list of strings.

    Every returned line keeps its trailing newline character (the last line
    of the file may lack one if the file does not end with a newline).

    The file is decoded explicitly as UTF-8 instead of with the platform's
    default encoding, so non-ASCII text is read the same way on every machine.
    NOTE(review): this assumes the corpus files are UTF-8 encoded -- confirm.
    '''
    with open(path, 'r', encoding='utf-8') as f:
        # Iterating a text file yields one line at a time, newline included.
        return list(f)

def combine_src_and_tgt(src, tgt):
    '''
    Pairs each source sentence with the target sentence at the same index.

    Args:
        src: list of source sentences.
        tgt: list of target sentences, expected to line up index-by-index
            with src.

    Returns:
        A list of (src_sentence, tgt_sentence) tuples, in the input order.

    Raises:
        ValueError: if src and tgt do not contain the same number of
            sentences. A length mismatch means the two sides are not
            aligned, so pairing them by index would produce wrong pairs.
    '''
    if len(src) != len(tgt):
        raise ValueError(
            'src and tgt must have the same number of sentences '
            '(got {} and {})'.format(len(src), len(tgt))
        )
    return list(zip(src, tgt))

def get_random_sentences_from_file(path, n=-1):
    '''
    Loads the parallel files path + '.src.sp' and path + '.tgt.sp' and
    returns a random selection of their sentence pairs as a list of
    (src, tgt) tuples.

    With the default n=-1 every pair is returned, in random order.
    Otherwise n pairs are drawn at random without repeating a pair.
    '''
    sources = open_file(path + '.src.sp')
    targets = open_file(path + '.tgt.sp')
    pairs = combine_src_and_tgt(sources, targets)

    # n == -1 is the "take everything" sentinel: sampling as many indices
    # as there are pairs amounts to a random permutation of the whole file.
    how_many = len(pairs) if n == -1 else n
    chosen = random.sample(range(len(pairs)), how_many)

    return [pairs[idx] for idx in chosen]
    


### Add all the data to the test set

In [5]:
# Draw 500 random sentence pairs from the test split of every language pair.
# The prefixes are visited in the order listed, which is the order in which
# the sampled pairs are appended to the combined set.
complete_test_set = []
for test_prefix in (
    'cleaned-tokenized-tagged/english/km-en-test',
    'cleaned-tokenized-tagged/english/lo-en-test',
    'cleaned-tokenized-tagged/english/th-en-test',
    'cleaned-tokenized-tagged/khmer/en-km-test',
    'cleaned-tokenized-tagged/khmer/lo-km-test',
    'cleaned-tokenized-tagged/khmer/th-km-test',
    'cleaned-tokenized-tagged/lao/en-lo-test',
    'cleaned-tokenized-tagged/lao/km-lo-test',
    'cleaned-tokenized-tagged/lao/th-lo-test',
    'cleaned-tokenized-tagged/thai/en-th-test',
    'cleaned-tokenized-tagged/thai/lo-th-test',
    'cleaned-tokenized-tagged/thai/km-th-test',
):
    complete_test_set += get_random_sentences_from_file(test_prefix, 500)

### Add all the data to the validation set

In [6]:
# Same procedure as for the test set: 500 random sentence pairs from the
# validation split of every language pair, appended in the order listed.
complete_val_set = []
for val_prefix in (
    'cleaned-tokenized-tagged/english/km-en-val',
    'cleaned-tokenized-tagged/english/lo-en-val',
    'cleaned-tokenized-tagged/english/th-en-val',
    'cleaned-tokenized-tagged/khmer/en-km-val',
    'cleaned-tokenized-tagged/khmer/lo-km-val',
    'cleaned-tokenized-tagged/khmer/th-km-val',
    'cleaned-tokenized-tagged/lao/en-lo-val',
    'cleaned-tokenized-tagged/lao/km-lo-val',
    'cleaned-tokenized-tagged/lao/th-lo-val',
    'cleaned-tokenized-tagged/thai/en-th-val',
    'cleaned-tokenized-tagged/thai/lo-th-val',
    'cleaned-tokenized-tagged/thai/km-th-val',
):
    complete_val_set.extend(get_random_sentences_from_file(val_prefix, 500))

### Shuffle the data in both sets

In [7]:
# Shuffle each combined set in place (test first, then validation) so the
# pairs are no longer grouped by the file they were sampled from.
for held_out_set in (complete_test_set, complete_val_set):
    random.shuffle(held_out_set)

### Write both data sets to files

In [8]:
def write_to_file(path, data):
    '''
    Writes the strings in data to the file at path, one item per line.

    Args:
        path: destination file; it is created or overwritten.
        data: iterable of strings. Items that already end with a newline
            are written unchanged. Items without one (for example the last
            line of an input file that has no final newline) get a newline
            appended, so that two sentences never end up on the same output
            line and parallel src/tgt files keep the same number of lines.

    The file is written as UTF-8 explicitly rather than with the platform's
    default encoding, which may be unable to represent non-ASCII text.
    '''
    with open(path, 'w', encoding='utf-8') as f:
        for s in data:
            f.write(s if s.endswith('\n') else s + '\n')

In [111]:
# Output locations for the combined test and validation sets.
test_src_path = 'complete/data/tokenized/src-test.txt'
test_tgt_path = 'complete/data/tokenized/tgt-test.txt'
val_src_path = 'complete/data/tokenized/src-val.txt'
val_tgt_path = 'complete/data/tokenized/tgt-val.txt'

# Split the shuffled (src, tgt) pairs back into two parallel lists; index i
# of the source list still corresponds to index i of the target list.
test_src = [src for src, _ in complete_test_set]
test_tgt = [tgt for _, tgt in complete_test_set]

val_src = [src for src, _ in complete_val_set]
val_tgt = [tgt for _, tgt in complete_val_set]


In [112]:
# Write the four held-out files: test src/tgt first, then validation src/tgt.
for out_path, sentences in (
    (test_src_path, test_src),
    (test_tgt_path, test_tgt),
    (val_src_path, val_src),
    (val_tgt_path, val_tgt),
):
    write_to_file(out_path, sentences)

### Do this for the training data too!

In [115]:
# Collect the training data. No sample size is passed, so the default n=-1
# applies and every sentence pair of each file is taken.
complete_train_set = []
for train_prefix in (
    'cleaned-tagged/english/km-en-train',
    'cleaned-tagged/english/lo-en-train',
    'cleaned-tagged/english/th-en-train',
    'cleaned-tagged/khmer/en-km-train',
    'cleaned-tagged/khmer/lo-km-train',
    'cleaned-tagged/khmer/th-km-train',
    'cleaned-tagged/lao/en-lo-train',
    'cleaned-tagged/lao/km-lo-train',
    'cleaned-tagged/lao/th-lo-train',
    'cleaned-tagged/thai/en-th-train',
    'cleaned-tagged/thai/lo-th-train',
    'cleaned-tagged/thai/km-th-train',
):
    complete_train_set += get_random_sentences_from_file(train_prefix)

In [116]:
# Sanity check: total number of sentence pairs collected from all training files.
print(len(complete_train_set))

4138376


In [117]:
# Shuffle the training pairs in place so they are no longer grouped by source file.
random.shuffle(complete_train_set)

In [118]:
# Output locations for the combined training set.
train_src_path = 'complete/data/src-train.txt'
train_tgt_path = 'complete/data/tgt-train.txt'

# Split the shuffled (src, tgt) pairs into two parallel lists; index i of the
# source list still corresponds to index i of the target list.
train_src = [src for src, _ in complete_train_set]
train_tgt = [tgt for _, tgt in complete_train_set]

# Write the source side first, then the target side.
for out_path, sentences in (
    (train_src_path, train_src),
    (train_tgt_path, train_tgt),
):
    write_to_file(out_path, sentences)