In [128]:
import random
import math
random.seed(0)

In [129]:
ratio_test = 50 / (50 + 1800 + 100)
ratio_train = 1800 / (50 + 1800 + 100)
ratio_val = 100 / (50 + 1800 + 100)

In [130]:
def parse_sentences(filename):
    map = {}
    def parse_line(line):
        split = line.split("\t")
        if len(split) != 3:
            print("Unparsed line:\n", line)
        else:
            [id, _, sentence] = split
            map[int(id)] = sentence
    print("Parsing " + filename)
    for line in open(filename).read().split("\n"):
        parse_line(line)
    print("Parsing of " + filename + " finished :D")
    return map

In [131]:
polish_map = parse_sentences("downloads/pol_sentences.tsv")
english_map = parse_sentences("downloads/eng_sentences.tsv")

Parsing downloads/pol_sentences.tsv
Unparsed line:
 
Parsing of downloads/pol_sentences.tsv finished :D
Parsing downloads/eng_sentences.tsv
Unparsed line:
 
Parsing of downloads/eng_sentences.tsv finished :D


In [132]:
def parse_pairs(filename):
    pairs = []
    def parse_line(line):
        split = line.split("\t")
        if len(split) != 2:
            print("Unparsed line:\n" + line)
        else:
            [first, second] = line.split("\t")
            if first.isdigit() and second.isdigit():
                pairs.append([int(first), int(second)])
                
    for line in open(filename).read().split("\n"):
        parse_line(line)
    return pairs

In [133]:
def generate_pairs_of_sentences(pairs, first_map, second_map):
    pairs_of_sentences = []
    def add_pair(first_sentence, second_sentence):
        pairs_of_sentences.append([first_sentence, second_sentence])
    for [first, second] in pairs:
        if first in first_map and second in second_map:
            add_pair(first_map[first], second_map[second])
        if second in first_map and first in second_map:
            add_pair(first_map[second], second_map[first])
    return pairs_of_sentences

In [134]:
# maximal size of file on github is 100MB, thus we splitted [sentence_base.csv] into two files
pairs = parse_pairs("downloads/sentences_base_1.csv") + parse_pairs("downloads/sentences_base_2.csv")

Unparsed line:

Unparsed line:



In [135]:
pairs_of_sentences = generate_pairs_of_sentences(pairs, english_map, polish_map)
random.shuffle(pairs_of_sentences)

In [136]:
test_size = math.floor(ratio_test * len(pairs_of_sentences))
val_size = math.floor(ratio_test * len(pairs_of_sentences))
train_size = len(pairs_of_sentences) - test_size - val_size

In [137]:
def write_pairs(pairs, suffix):
    src = open("src-" + suffix + ".txt", "w")
    tgt = open("tgt-" + suffix + ".txt", "w")
    for [first, second] in pairs:
        src.write(first + "\n")
        tgt.write(second + "\n")
    src.close()
    tgt.close()

In [138]:
write_pairs(pairs_of_sentences[0 : test_size], "test")
write_pairs(pairs_of_sentences[test_size : test_size + val_size], "val")
write_pairs(pairs_of_sentences[test_size + val_size : test_size + val_size + train_size], "train")