In [178]:
import random
import math
# Seed the global RNG used by the random sampling and the shuffle further down,
# so that running all cells in the same order gives the same results.
random.seed(0)

In [179]:
# Relative weights of the test / train / validation splits, normalised so the
# three ratios add up to one.
split_total = 50 + 1800 + 100
ratio_test = 50 / split_total
ratio_train = 1800 / split_total
ratio_val = 100 / split_total

In [180]:
def read_file(filename):
    """Read a text file and return its contents split into lines.

    The text is split on the newline character only, so a file that ends with
    a newline produces a trailing empty string as its last element.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        A list of strings, one per line, without the newline characters.
    """
    # The context manager closes the handle as soon as the content is read,
    # instead of leaving it open until garbage collection.
    with open(filename) as handle:
        return handle.read().split("\n")

def read_pairs(first_file, second_file):
    """Load two line-aligned files and pair up their lines.

    Args:
        first_file: Path of the file providing the first element of each pair.
        second_file: Path of the file providing the second element.

    Returns:
        A list of two-element lists ``[line_from_first, line_from_second]``,
        one per line index.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    first_lines = read_file(first_file)
    second_lines = read_file(second_file)
    # The files are aligned by line number, so their lengths must match.
    assert len(first_lines) == len(second_lines)
    return [[left, right] for left, right in zip(first_lines, second_lines)]

In [181]:
# Concatenate the sentence pairs of both downloaded corpora, in file order.
pairs_of_sentences = [
    pair
    for en_path, pl_path in (
        ("downloads/en_1.txt", "downloads/pl_1.txt"),
        ("downloads/en_2.txt", "downloads/pl_2.txt"),
    )
    for pair in read_pairs(en_path, pl_path)
]

In [182]:
def print_random_sample(pairs_of_sentences, n = 30):
    """Print ``n`` randomly chosen elements of ``pairs_of_sentences``.

    Elements are drawn with replacement, so the same element may be printed
    more than once. Nothing is printed when the sequence is empty.

    Args:
        pairs_of_sentences: Sequence to sample from.
        n: Number of elements to print.
    """
    if len(pairs_of_sentences) == 0:
        return
    for _ in range(n):
        # random.choice only yields valid positions; random.randint(0, len)
        # includes the upper bound and could index one past the end.
        print(random.choice(pairs_of_sentences))


In [183]:
def spaces_heuristic(line1, line2):
    """Reject the pair when either line contains fewer than three spaces.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    enough_spaces = all(text.count(' ') >= 3 for text in (line1, line2))
    return (line1, line2) if enough_spaces else ("", "")

def spaces_density_heuristic(line1, line2):
    """Reject the pair when a line is empty or less than 10% of it is spaces.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    rejected = ("", "")
    # Empty lines are rejected first, which also guards the division below.
    if len(line1) == 0 or len(line2) == 0:
        return rejected
    for text in (line1, line2):
        if text.count(' ') / len(text) < 0.1:
            return rejected
    return line1, line2

def spaces_difference_heuristic(line1, line2):
    """Reject the pair when the space counts of the two lines are too far apart.

    A pair is rejected when neither line contains a space, or when one line
    has more than twice as many spaces as the other.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    fewer, more = sorted((line1.count(' '), line2.count(' ')))
    # more == 0 means both counts are zero, since counts are never negative.
    if more == 0 or more > 2 * fewer:
        return "", ""
    return line1, line2

def spaces_difference_heuristic2(line1, line2):
    """Reject the pair when both lines have the same space count below three.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    spaces1 = line1.count(' ')
    spaces2 = line2.count(' ')
    keep = spaces1 != spaces2 or spaces1 >= 3
    if keep:
        return line1, line2
    return "", ""

def web_pages_heuristic(line1, line2):
    """Reject the pair when either line contains 'http' or 'www'.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    markers = ('http', 'www')
    mentions_url = any(marker in text for text in (line1, line2) for marker in markers)
    return ("", "") if mentions_url else (line1, line2)

def number_heuristic(line1, line2):
    """Reject the pair when more than 10% of either line's characters are digits.

    Only the characters '0' through '9' are counted as digits.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    def digit_count(text):
        # Count occurrences of the ten ASCII digit characters only.
        return sum(1 for ch in text if ch in "0123456789")

    too_numeric = any(digit_count(text) * 10 > len(text) for text in (line1, line2))
    if too_numeric:
        return "", ""
    return line1, line2

def same_in_both_languages_heuristic(line1, line2):
    """Reject the pair when both lines are exactly the same string.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    return ("", "") if line1 == line2 else (line1, line2)

def special_words_heuristic(line1, line2):
    """Reject the pair when both lines share a pair of adjacent uppercase letters.

    Every two-character window of each line is examined. The pair is rejected
    as soon as a window made of two uppercase characters occurs in both lines.

    Returns:
        ``("", "")`` for a rejected pair, otherwise ``(line1, line2)``.
    """
    def has_shared_uppercase_bigram(text):
        # Walk over adjacent character pairs of `text`.
        for left, right in zip(text, text[1:]):
            if not (left.isupper() and right.isupper()):
                continue
            bigram = left + right
            if bigram in line1 and bigram in line2:
                return True
        return False

    if has_shared_uppercase_bigram(line1) or has_shared_uppercase_bigram(line2):
        return "", ""
    return line1, line2
            
# Heuristics used to build the cleaned dataset.
# special_words_heuristic is deliberately left out, since it doesn't perform well.
all_heuristics = [
    same_in_both_languages_heuristic,
    number_heuristic,
    spaces_density_heuristic,
    spaces_difference_heuristic,
    spaces_difference_heuristic2,
    spaces_heuristic,
    web_pages_heuristic,
]


In [184]:
def test_heuristic(heuristic, pairs_of_sentences):
    """Apply one heuristic to every pair and report what it removes.

    Prints a random sample of the removed pairs followed by the percentage of
    pairs that were removed.

    Args:
        heuristic: Callable taking ``(line1, line2)`` and returning a
            two-element result whose first element is ``""`` when the pair
            should be dropped.
        pairs_of_sentences: Sequence of two-element pairs to filter.

    Returns:
        A list with the heuristic's result for every pair that was kept.
    """
    kept = []
    removed_sentences = []
    for pair in pairs_of_sentences:
        result = heuristic(pair[0], pair[1])
        if result[0] != "":
            kept.append(result)
        else:
            removed_sentences.append(pair)
    print_random_sample(removed_sentences)
    total = len(pairs_of_sentences)
    # An empty input (e.g. a 1% slice of a tiny dataset) would otherwise
    # divide by zero; nothing was removed, so report 0 percent.
    removed_percent = 100 * len(removed_sentences) / total if total else 0.0
    print("Removed " + str(removed_percent) + " percent of sentences")
    return kept


In [185]:
# Print 1000 randomly drawn pairs from the unfiltered data for manual inspection.
print_random_sample(pairs_of_sentences, n=1000)

['Web: http://www.maximlignano.com', 'Web: http://www.maximlignano.com']
['3. Location: Northern Europe, bordering the Baltic Sea, Gulf of Bothnia, and Gulf of Finland, between Sweden and Russia', '3. Położenie: Europa Północna, nad Morzem Bałtyckim, Zatoką Botnicką i Zatoką Fińską, pomiędzy Szwecją i Rosją']
['This far-distant nebula is visible to the naked eye, and when you view it, pause to consider that the light you behold left those distant suns almost one million years ago.', 'Tę odległą mgławicę można zobaczyć gołym okiem a gdy patrzycie na nią, pomyślcie o tym, że światło, które dostrzegacie, opuściło owe odległe słońca prawie milion lat temu.']
['You can end up in a crisis and feel that you do not dare, that you think you will fail, that it is difficult, when you have low self-esteem.', 'Może skończyć się w kryzysie i czują, że nie mają odwagi, że myślisz, że nie, że to jest trudne, gdy masz niskie poczucie własnej wartości.']
['In that Day of trial they will have much more d

In [186]:
# Try special_words_heuristic on the first 1% of the pairs only.
one_percent = len(pairs_of_sentences) // 100
_ = test_heuristic(special_words_heuristic, pairs_of_sentences[:one_percent])

['"On the issue of EU Passenger Name Records (PNR), the S&D Group has achieved some very important results which will be the cornerstones for the next steps of our work with EU governments and the Commission. Indeed, for the first time a large majority of MEPs - including from the EPP - have acknowledged that the current draft EU PNR proposal needs to be revised to comply with the ECJ judgment on the Data Retention Directive.', '- W kwestii unijnego wykazu danych pasażerów (UE PNR) Grupa S&D osiągnęła kilka bardzo ważnych wyników, które będą fundamentem dla kolejnych etapów naszej współpracy z rządami UE i Komisją. Po raz pierwszy duża większość eurodeputowanych - w tym z EPL - uznała, że obecny projekt wniosku w sprawie UE PNR musi zostać zmieniony w celu dostosowania się do orzeczenia ETS w sprawie dyrektywy w sprawie zatrzymywania danych.']
['During World War II, shaped charge liners were made of copper or steel, though other materials were tried or researched.', 'Podczas II wojny ś

In [187]:
# Show a sample of the pairs rejected for being identical in both languages,
# plus the share of the dataset they make up. The filtered list is discarded.
_ = test_heuristic(same_in_both_languages_heuristic, pairs_of_sentences)

['4/0', '4/0']
["would like to give me a strength to do this, I'm willing to pay if necessary.", "would like to give me a strength to do this, I'm willing to pay if necessary."]
['4. Escape', '4. Escape']
['locale(1), localedef(1), isalpha(3), localeconv(3), nl_langinfo(3), rpmatch(3), strcoll(3), strftime(3), charsets(7), locale(7)', 'locale(1), localedef(1), isalpha(3), localeconv(3), nl_langinfo(3), rpmatch(3), strcoll(3), strftime(3), charsets(7), locale(7)']
['Felicidades.', 'Felicidades.']
['BSD (3)', 'BSD (3)']
["We don't make the rules, dear.", "We don't make the rules, dear."]
['Strona : http://www.esport.gg', 'Strona : http://www.esport.gg']
['Thank you for your interest.', 'Thank you for your interest.']
['Socialists and Democrats in the European Parliament and the Committee of the Regions join forces, calling for effective integration of Roma people', 'Socialists and Democrats in the European Parliament and the Committee of the Regions join forces, calling for effective int

In [188]:
# Show a sample of the pairs rejected by number_heuristic and the share of the
# dataset it removes. The filtered list is discarded.
_ = test_heuristic(number_heuristic, pairs_of_sentences)

['00:49', '21:00']
['[Íà ñòðàíèöó: 1, 2, 3...5, 6, 7, 8, 9, 10] Ðàìáîâ 07 àïð. 2016 17:41', '[Íà ñòðàíèöó: 1, 2, 3...5, 6, 7, 8, 9, 10] Ðàìáîâ 07 àïð. 2016 17:41']
['Reception open 10:00 - 24:00', 'Recepcja otwarta 10:00 - 24:00']
['15:00', '22:13']
['up to 70 kg possible.', 'Standardowo 70 kg.']
['... 2005.', '2.']
['Available in 10, 20, 30, 40 volumes.', 'Dostępny w 10, 20, 30, 40 tomów.']
['23:00', '25:00']
['9/10!', '7/10 Dobry.']
['8 1.', '- â 1.']
['http://free4web.pl/3/2,52252,79085,6467507,Thread.html', 'http://free4web.pl/3/2,52252,79087,4086036,84,Thread.html']
['100Gb Ethernet (1)', 'SO-DIMM (1)']
['Strona: (8) :: << < Wstecz 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 Dalej > >> Do góry', 'Strona: (13) :: << < Wstecz 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 Do góry']
['SECTION 1', 'Część 1']
['33:15', '15:00']
['Number of rooms: 90.', 'Liczba pokoi: 90 .']
['Accessories(2)', 'Sound(2) Kits(2)']
['9/2007', '9/2007']
['Mainz (1)', 'Moguncja (1)']
['60:15', '10:15']
['4 3', '3.']
['110 000

In [189]:
# Show a sample of the pairs rejected by spaces_heuristic and the share of the
# dataset it removes. The filtered list is discarded.
_ = test_heuristic(spaces_heuristic, pairs_of_sentences)

['one more time', 'one more time']
['Russia, Documentary', 'Rosja, Muzyka']
['http://hotel.senimo.cz', 'http://web.iol.cz/dspomnenka/dspomnenka']
['Perfect.', 'Znakomicie.']
['Conmutadores (2)', 'Przełączniki (2)']
['USA, Augusta', 'Stany Zjednoczone Ameryki, Junction City']
['Energy usage:', 'Zużycie energii']
['5 October 2007', '5 października 2007']
['www: http://www.aquaparkplovarnahranice.cz', 'www: http://www.aquaparkplovarnahranice.cz']
['http://2009.awmet.pl/', 'http://www.pcs.waw.pl/']
['It is raining.', 'Het is heet.']
['September 2013(1)', 'October 2013(3)']
['3.', '- 3 season-themed episodes;']
['All these contribute.', 'Wszystkie te przyczyniają.']
['operation is invalid.', 'operation jest niepoprawne.']
['Rumcajs.', 'Rumcajs.']
['12:30', '30:00']
['Finland (2)', 'Russia (2)']
['back to the overview', 'powrót do przeglądu']
['The hostel is near the famous [...]', 'Ten [...]']
['Challenges Ahead', 'Wyzwania przed nami']
['Maximum number of results:', 'Maksymalna liczba wyni

In [190]:
# Show a sample of the pairs rejected by spaces_density_heuristic and the share
# of the dataset it removes. The filtered list is discarded.
_ = test_heuristic(spaces_density_heuristic, pairs_of_sentences)

['Content:', '4 Punkty:']
['Slendertone (4)', 'Slendertone (4)']
['http://www.hotelsegypt.net/', 'http://www.ilovesushi.hu/ http://sushi.extra.hu/sushi-rendeles.html']
['(Pic. 4).', '2015(4)']
['International law must ensure that the law of the more powerful does not prevail.', 'Prawodawstwo międzynarodowe winno unikać sytuacji, w których przeważałoby prawo silniejszego.']
['http://www.ubucentrum.net/2011/10/korzystasz-juz-z-ubuntu-1110.html', 'http://psychika.net/ STRONA GŁÓWNA']
['USA, Alum Rock', 'Stany Zjednoczone Ameryki, Chicago']
['Chonchi +2°C', 'Newton +20°C']
['(Continue to pattern...)', '(Patrz objasnienia...)']
['Finally, the text is suitable as the basis of a comprehension exercise, too. Possible questions are:', 'Przykładowe pytania na podsumowanie tekstu:']
['(111)', 'TIM(1)']
['The total cost of the investment amounts to €500 million.', 'do produkcji polipropylenu.']
['Statistical offices', 'Urzędy statystyczne']
['http://www.sportsforher.com/', 'http://www.jardb.pl/']


In [191]:
# Show a sample of the pairs rejected by spaces_difference_heuristic and the
# share of the dataset it removes. The filtered list is discarded.
_ = test_heuristic(spaces_difference_heuristic, pairs_of_sentences)

['http://www.pjuma-events.de', 'http://www.pjuma-events.de']
['Try again later. )".', ')".']
['Besides, they are also the major modifiable factors.', 'Pomimo faktu, że otyłość nie jest tylko wynikiem nadmiernej pobłażliwości przy wyborze smacznej żywności lub braku aktywności fizycznej, czynniki żywieniowe oraz aktywność fizyczna mają duży wpływ na równowagę energetyczną.']
['Subject:', 'wkozon 2011-12-06 14:57:11']
['protection,', 'ochrony środowiska,']
['http://www.bison-fute.equipement.gouv.fr', 'http://www.bison-fute.equipement.gouv.fr']
['But find suitable options for furniture layout - it is not an easy task. Previously, this required to spend a lot of time to travel around all the stores and choose the appropriate option for themselves.', 'Ale znaleźć odpowiednie opcje układu mebli - nie jest to łatwe zadanie.']
['People of the Institute', 'Ludzie instytutu']
['In action, it is Righteous Action. Same love, as a feeling, is peace. Be and (¿) love as an understanding is non-violen

In [192]:
# Show a sample of the pairs rejected by spaces_difference_heuristic2 and the
# share of the dataset it removes. The filtered list is discarded.
_ = test_heuristic(spaces_difference_heuristic2, pairs_of_sentences)

['More (4)', 'Więcej (112)']
['Jonathan, 45', 'Jonathan, 45.']
['25/09/2011', '26/02/2010']
['http://www.filmyswiata.pl', 'http://3.bp.blogspot.com/.../Why-you-no.jpg']
[':', '16:14:59']
['• 6.2.', '• 6.2.']
['3:13', '3:17']
['WRAL.', 'ConsumerReports.']
['Games .', 'Gry .']
[':', 'Shaviv:']
['Good to go?', 'Good to go?']
['11:00', '11:32']
['http://havirna.eu, http://www.stripky.cz/havirna', 'http://havirna.eu, http://www.stripky.cz/havirna']
['Hamid.', 'Hamid.']
['Ari, 28', 'Ari, 28.']
['2/11', '2/11']
['daily, 17:00', 'codziennie, 17:00']
['(En lækkerbisken.', '(En lækkerbisken.']
['In English', 'Po polsku']
['http://supertak7.canariblogs.com', 'http://aditus.waw.pl/kcb-xauq9.php']
['40 minutes', '40 minut']
['13:15 - 14:00', '13:15 - 14:00']
['1. Habit', '1. Habit']
['10:35', '10:26']
['More (24)', 'Więcej (68)']
['http://free4web.pl/3/2,52252,79088,3173351,Thread.html', 'http://free4web.pl/3/2,52252,79089,2179945,Thread.html']
['Personally:', 'Osobiście:']
['Priming(4)', 'Mieszani

In [193]:
# Show a sample of the pairs rejected by web_pages_heuristic and the share of
# the dataset it removes. The filtered list is discarded.
_ = test_heuristic(web_pages_heuristic, pairs_of_sentences)

['Web: http://www.hotelcesareaugusto.it', 'Strona web: http://www.hotelcesareaugusto.it']
['http://www.touchstonegallery.com/', 'http://www.galerianorland.ant.pl/']
['European Institute for Climate and Energy http://www.eike-klima-energie.eu/', 'http://www.skepticalscience.com/hansen-and-sato-2012-climate-sensitivity.html']
['http://www.webcammax.com', 'http://www.bearware.dk']
['http://www.rogervivierjapan.com', 'http://www.helpnowvt.org/japan/bags-online-4140.ht...']
['Web http://www.ubytovaniratiskovice.cz', 'Web http://www.ubytovaniratiskovice.cz']
['http://free4web.pl/3/2,52252,79088,3173351,Thread.html', 'http://free4web.pl/3/2,52252,79088,4406749,0,Thread.html']
['For more information visit: http://eddiemartin.com/', 'Więcej informacji o artyście: http://eddiemartin.com/']
['http://teatrkto.pl/en.html', 'http://teatrkto.pl']
['Web: http://www.ainova.sk/', 'www: http://www.ainova.sk/']
['http://odswiezamy.xaa.pl/', 'http://odswiezamy.xaa.pl/']
['http://www.bucher-gruppe.de', 'htt

In [194]:
def get_better_dataset(pairs_of_sentences, heuristics):
    """Split the pairs into those that pass every heuristic and those that do not.

    A pair is removed as soon as one heuristic returns a result whose first
    element is ``""``; later heuristics are not evaluated for that pair.

    Args:
        pairs_of_sentences: Iterable of two-element pairs.
        heuristics: Iterable of callables taking ``(line1, line2)``.

    Returns:
        A tuple ``(new_dataset, removed)`` of two lists holding the original
        pair objects, each in input order.
    """
    new_dataset, removed = [], []
    for pair in pairs_of_sentences:
        # any() stops at the first rejecting heuristic.
        rejected = any(heur(pair[0], pair[1])[0] == "" for heur in heuristics)
        target = removed if rejected else new_dataset
        target.append(pair)
    return new_dataset, removed

In [195]:
# Run every heuristic in all_heuristics over the full dataset, keeping both the
# surviving pairs and the rejected ones for inspection.
better_pairs_of_sentences, removed_pairs_of_sentences = get_better_dataset(pairs_of_sentences, all_heuristics)

In [196]:
# Sample 1000 pairs from the unfiltered dataset.
print_random_sample(pairs_of_sentences, n = 1000)

['A president of a country should be the first person to respect an independent judiciary, and not the last like this case seems to demonstrate."', 'Prezydent kraju powinien być pierwszą, a nie ostatnią osobą przestrzegającą niezawisłości sądów, czego zdaje się dowodzić ten przypadek.']
['This is shown all through her life.', 'Można to zobaczyć w całym jej życiu.']
['Here, there is good water and a good climate."', 'Tu my prosto, rzucone (A. R., w. 19).']
[': :', '14:59:19']
['13:00', '19:00']
['Based on 9 ratings', 'na podstawie 31 ocen']
['The Danfoss company participates in solving this problem.', 'Firma Danfoss bierze czynny udział w rozwiązaniu tego problemu.']
['One knows his routes and times.', 'Zna jego trasy i czasy.']
['It is also a designated sustainable tourism area, with tourism activities focused upon preserving the beautiful natural and historical environment and promoting socio-economic equality.', 'Jest również wyznaczony obszar zrównoważonego turystyki, z turystyki, s

In [197]:
# Sample 1000 pairs from the dataset that passed every heuristic.
print_random_sample(better_pairs_of_sentences, n = 1000)

['Support independent life for people with multiple sclerosis', 'Wsparcie niezależnego życia dla osób ze stwardnieniem rozsianym']
['It´s advised that even children accompanied by parents to have their documentation to avoid problems during the hosting process;', "It's poinformowała, że nawet dzieci w towarzystwie rodziców, aby ich dokumentację, aby uniknąć problemów w czasie prowadzenia procesu;"]
['Long oil will not degenerate.', 'Olej nie długo pogorszenie.']
['The city is a large center of fishery.', 'Miasto jest dużym ośrodkiem rybołówstwa.']
['You may have found yourself here reading this because maybe you are going through a really challenging time in your life! Maybe you found my blog because you’ve searched on one of the many search engines these exact words ” I have lost everything “.', 'Może znalazłeś się tutaj, czytając to, bo może jesteś przeżywa bardzo trudnym okresie w swoim życiu! Być może, że mój blog, ponieważ szukałeś na jednej z wielu wyszukiwarek te dokładne słowa 

In [198]:
# Sample (default: 30) pairs from those that at least one heuristic rejected.
print_random_sample(removed_pairs_of_sentences)

['Results that matter', 'Wyniki, które mają sens']
['SaÅ atka idealnie (...)', 'Do garnka (...)']
['The Pillars of the Earth', 'The Pillars of the Earth']
['And fundamental.', 'Mam prawo naturalne.']
['4:44', '44:20']
['http://imagimation.net', 'http://www.equistaff.com']
['4%', '1%']
['Which one is best to use in each situation? 2015/07/22', 'Co powinienem zrobić? 2015/04/22']
['authorities of participating Member States to obtain information', 'Władze uczestniczących państw członkowskich w celu uzyskania informacji']
['1. Hahahaczyk', '1. Moje serce']
['Elektrisola typical value 40% 40% 40% 40% 40% 40% 40% 40% 40%', 'Elektrisola typowe parametry 40% 40% 40% 40% 40% 40% 40% 40% 40%']
['13:42:55', '12:30:50']
['He is just like his peers.', 'Nadal jest rehabilitowany.']
['Malta (2)', 'Cord (2)']
['April 2009(2)', 'July 2009(7)']
['Further information also at: http://www.jedli.zabrezsko.cz', 'Więcej informacji również na: http://www.jedli.zabrezsko.cz']
['Crucial (5)', 'SO-DIMM (4)']
['Y

In [199]:
# Share of the original dataset rejected by the combined heuristics.
removed_percent = 100 * len(removed_pairs_of_sentences) / len(pairs_of_sentences)
print(f"Removed {removed_percent} percent of dataset")

Removed 30.440162990799614 percent of dataset


In [200]:
# From here on pairs_of_sentences refers to the filtered list. This rebinds the
# name to the same list object as better_pairs_of_sentences (it is not a copy),
# so in-place changes made through either name affect both.
pairs_of_sentences = better_pairs_of_sentences

In [201]:
# Shuffle in place so the slices taken afterwards are random subsets.
# This uses the global RNG, so the resulting order depends on random.seed(0)
# and on every random draw made by cells executed before this one.
random.shuffle(pairs_of_sentences)

In [202]:
# Number of pairs in each split. The test and validation sizes are rounded
# down from their ratios; the training split takes everything that is left.
n_pairs = len(pairs_of_sentences)
test_size = math.floor(ratio_test * n_pairs)
# The validation split uses its own ratio (ratio_val), not ratio_test.
val_size = math.floor(ratio_val * n_pairs)
train_size = n_pairs - test_size - val_size

In [203]:
def write_pairs(pairs, suffix):
    """Write the pairs to two line-aligned text files.

    The first element of every pair goes to ``src-<suffix>.txt`` and the
    second to ``tgt-<suffix>.txt`` in the current working directory, one
    element per line. Existing files are overwritten.

    Args:
        pairs: Iterable of two-element pairs of strings.
        suffix: String inserted into the two output file names.
    """
    # `with` guarantees both files are flushed and closed even if a write or
    # an unpacking error interrupts the loop.
    with open("src-" + suffix + ".txt", "w") as src, open("tgt-" + suffix + ".txt", "w") as tgt:
        for first, second in pairs:
            src.write(first + "\n")
            tgt.write(second + "\n")

In [204]:
# Slice boundaries into pairs_of_sentences: [test | val | train1 | train2].
# The training part is written as two files, split at half of train_size.
val_start = test_size
train_start = val_start + val_size
train_mid = train_start + int(train_size / 2)
train_end = train_start + train_size

write_pairs(pairs_of_sentences[:val_start], "test")
write_pairs(pairs_of_sentences[val_start:train_start], "val")
write_pairs(pairs_of_sentences[train_start:train_mid], "train1")
write_pairs(pairs_of_sentences[train_mid:train_end], "train2")