In [None]:
import pandas as pd
import csv
import threading

In [None]:
"""Within our project proposal we also linked sentence pairs from Wikipedia and GVP Corpus, but ofter reviewing the
pairs from both of these sites, we found many inconsistencies between the translations regarding numbers, punctuation
and structure. For these reasons we decided to use only the Tatoeba sentence pairs

The Tatoeba site's translations are organized in a round-about way. There are three files needed: English sentences,
Hebrew sentences, and links. Each sentence has a unique ID. The links file contains over 18 million ID pairs, each of
which indicates that the two sentences are translations of each other.

In order to get just the English and Hebrew sentences we first looped through all 18 million ID pairs and looked 
within the English sentence Data Frame (df) and Hebrew sentence df for the presence of the current IDs [1]. We used a
threading library because the processing of each row takes ~.002 seconds; with 18 million rows, the processing takes
around 10hrs [2]. We left the process running overnight.
If both IDs are found, the IDs are put into a holder df and then saved after every 100000 link pairs [3].
Next we concatenated all df chunks together into a single df and file [4].
Finally we built another df and file that contained the English/Hebrew texts instead of their respective IDs [5].
Additionally, we stripped the sentences of punctuation (.?,!:;) because, while these marks are used the same way in Hebrew,
we felt it would be easier to translate without them [6].
(Numbers in brackets show where each step is within the code.)

Ultimately we have 126,549 English-Hebrew sentence pairs. In the future we may decide to tokenize each sentence and 
build vocabularies; but for the time being the sentence pairs will remain together.

"""

In [None]:
def process_chunk(chunk, chunk_num):
    """Filter one chunk of link rows down to English -> Hebrew pairs and save it.

    A row is kept when its ``SENTENCE_ID`` appears in the module-level ``eng``
    frame and its ``TRANSLATION_ID`` appears in the module-level ``heb`` frame
    (both matched against their ``SENTENCE_ID`` column). Kept rows retain their
    original order and are written, with a fresh 0..n-1 index, to
    ``needed_links_<chunk_num>.csv`` (tab separated).

    Parameters
    ----------
    chunk : pandas.DataFrame
        Link rows with ``SENTENCE_ID`` and ``TRANSLATION_ID`` columns.
    chunk_num : int
        Position of the chunk in the links file; used in the output file name.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # Two vectorised membership tests replace the per-row scans of eng/heb,
    # and avoid growing a frame with DataFrame.append (removed in pandas 2.0).
    in_english = chunk['SENTENCE_ID'].isin(eng['SENTENCE_ID'])#[1]
    in_hebrew = chunk['TRANSLATION_ID'].isin(heb['SENTENCE_ID'])
    links = (
        chunk.loc[in_english & in_hebrew, ['SENTENCE_ID', 'TRANSLATION_ID']]
        .reset_index(drop=True)  # same 0..n-1 index the row-by-row build produced
    )#[3]
    # One summary line per chunk instead of one line per link row.
    print(f'chunk {chunk_num} rows {len(chunk.index)} added {len(links.index)}')
    links.to_csv(f'needed_links_{chunk_num}.csv',sep='\t')
    

# Sentence tables: one row per sentence, tab separated, no header row.
eng = pd.read_csv("eng_sentences.tsv", sep="\t", names=["SENTENCE_ID","LANGUAGE","TEXT"])
heb = pd.read_csv("heb_sentences.tsv", sep="\t", names=["SENTENCE_ID","LANGUAGE","TEXT"])
# The links file is read lazily, 100000 ID pairs at a time.
link_chunks = pd.read_csv("links.csv", sep="\t", names=["SENTENCE_ID","TRANSLATION_ID"], chunksize=100000)

# One worker thread per chunk; each writes its own needed_links_<i>.csv.
workers = []
for i, chunk in enumerate(link_chunks):
    worker = threading.Thread(target=process_chunk,args=(chunk,i))#[2]
    worker.start()
    workers.append(worker)

# Wait for every worker so the chunk files are complete before any later
# cell tries to read them (the threads were previously left un-joined).
for worker in workers:
    worker.join()

In [None]:
# Number of needed_links_<i>.csv files to merge (one per 100000-row chunk of
# the links file). Adjust if the links file produced a different chunk count.
N_LINK_CHUNKS = 183

link_frames = []
total_links = 0
for i in range(N_LINK_CHUNKS):#[4]
    print(f'file number {i} total links {total_links}')
    chunk = pd.read_csv(f'needed_links_{i}.csv',sep='\t')
    # Keep only the two ID columns; the saved index column is dropped.
    # (The original row loop read a misspelled 'SENTENCEE_ID' key and raised KeyError.)
    chunk = chunk[['SENTENCE_ID','TRANSLATION_ID']]
    if chunk.empty:
        # Chunks with no English-Hebrew pair contribute nothing.
        continue
    link_frames.append(chunk)
    total_links += len(chunk.index)

# Concatenate once instead of appending row by row (DataFrame.append was
# removed in pandas 2.0 and made the merge quadratic).
if link_frames:
    full_links = pd.concat(link_frames, ignore_index=True)
else:
    full_links = pd.DataFrame(columns=['SENTENCE_ID','TRANSLATION_ID'])
full_links.to_csv('needed_links_full.csv',sep='\t')

In [None]:
# Characters removed from every sentence before it is stored. #[6]
PUNCTUATION_TABLE = str.maketrans('', '', '.?,!:;')

# ID -> TEXT lookups built once; keeping the first row per ID mirrors the
# previous "take the first matching row" behaviour without rescanning the
# sentence tables for every pair.
eng_text_by_id = eng.drop_duplicates('SENTENCE_ID').set_index('SENTENCE_ID')['TEXT'].to_dict()
heb_text_by_id = heb.drop_duplicates('SENTENCE_ID').set_index('SENTENCE_ID')['TEXT'].to_dict()

ids = []          # every sentence ID already used in a pair, in insertion order
seen_ids = set()  # same IDs, kept as a set for constant-time membership tests
pairs = []        # (english, hebrew) text rows collected for the final frame
id_pairs = zip(full_links['SENTENCE_ID'], full_links['TRANSLATION_ID'])
for i, (eng_id, heb_id) in enumerate(id_pairs):#[5]
    if i%1000==0:
        print(f'index {i}')
    eng_text = eng_text_by_id.get(eng_id)
    heb_text = heb_text_by_id.get(heb_id)
    # Skip pairs whose ID is unknown or whose text is not a string (e.g. a
    # sentence parsed as NaN), which previously crashed the punctuation strip.
    if not isinstance(eng_text, str) or not isinstance(heb_text, str):
        continue
    # Each sentence is used at most once: the first pair that mentions it wins.
    if eng_id in seen_ids or heb_id in seen_ids:
        continue
    ids.append(eng_id)
    ids.append(heb_id)
    seen_ids.update((eng_id, heb_id))
    pairs.append((eng_text.translate(PUNCTUATION_TABLE),
                  heb_text.translate(PUNCTUATION_TABLE)))

# Build the frame once; DataFrame.append was removed in pandas 2.0.
translations = pd.DataFrame(pairs, columns=['ENGLISH','HEBREW'])
translations.to_csv('translations.csv',sep='\t')