# Pre-processing pipeline for UN Parallel Corpus

In [1]:
# Fetch the project repository into ./diplom. Later cells import helper
# modules from it as a package (e.g. `from diplom.preprocess import main`)
# and read/write the data files under diplom/data/.
!git clone https://github.com/AlexSkrn/diplom.git

Cloning into 'diplom'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 18 (delta 3), reused 17 (delta 2), pack-reused 0[K
Unpacking objects: 100% (18/18), 863.80 KiB | 3.75 MiB/s, done.


In [2]:
# Combine the 3 source files into a single tab-delimited file.
# paste joins line N of every input with tabs, so each output row holds
# three columns in the order the files are listed here:
#   English text <TAB> Russian text <TAB> segment ids
!paste diplom/data/10K_en.txt \
       diplom/data/10K_ru.txt \
       diplom/data/10K_ids.txt \
       > diplom/data/combined.txt

In [3]:
# Peek at the first two rows to check the three-column layout of the combined file.
!head -2 diplom/data/combined.txt

UNITED NATIONS	ОБЪЕДИНЕННЫХ НАЦИЙ	1990/trans/wp_29/1999/14/add_1 en:1:1 en:2:1 ru:2:1
E	E	1990/trans/wp_29/1999/14/add_1 en:3:1 ru:3:1


In [4]:
# Row count of the combined file: the baseline before any filtering step.
!wc -l diplom/data/combined.txt

10000 diplom/data/combined.txt


In [5]:
# Delete exact duplicates with awk without changing the order of the lines.
# The dedup key is the first two tab-separated columns only, so the third
# column is ignored when deciding whether two rows are duplicates.
# `!seen[key]++` is true only the first time a key is met, so the first
# occurrence of every key is the one that is kept.
!awk -F"\t" '!seen[$1, $2]++' \
    diplom/data/combined.txt \
    > diplom/data/combined2.txt

# Row count after exact-duplicate removal.
!wc -l diplom/data/combined2.txt

9495 diplom/data/combined2.txt


In [6]:
# Strip some numerals from the right-hand side of each line
# (combined2.txt -> combined3.txt) and then run the duplicate removal
# again, because rows that differed only by those numerals may now collide.
# NOTE: several cells in this notebook import a different `main`; the name
# is rebound by each import, so always run the import together with the call.
from diplom.rstrip_numerals import main

src_file = 'diplom/data/combined2.txt'
trg_file = 'diplom/data/combined3.txt'

main(src_file, trg_file)

# Same two-column dedup as before, now applied to the stripped file.
!awk -F"\t" '!seen[$1, $2]++' \
    diplom/data/combined3.txt \
    > diplom/data/uniq.txt

# Row count after the second dedup pass.
!wc -l diplom/data/uniq.txt

Read 9495 lines in diplom/data/combined2.txt
Wrote processed lines to diplom/data/combined3.txt
8853 diplom/data/uniq.txt


In [7]:
# Run the preprocessing step: flag rows that are too short, numerals-only
# or punctuation-only, then keep everything else.
# main() does not write the filtered text itself; it writes the 1-based
# numbers of the lines to KEEP into line_nums_file.
from diplom.preprocess import main

src_file = 'diplom/data/uniq.txt'
line_nums_file = 'diplom/data/preproc_linenums.txt'
trg_file = 'diplom/data/preproc.txt'

main(src_file, line_nums_file)

# Two-file awk filter: while reading the first file (NR == FNR) store each
# line number in `pos`; while reading the second file print only the lines
# whose number (FNR) is in `pos`.
# The three paths are hard-coded and must stay in sync with
# line_nums_file, src_file and trg_file above.
!awk 'NR == FNR {pos[$1]; next} FNR in pos' \
    diplom/data/preproc_linenums.txt \
    diplom/data/uniq.txt \
    > diplom/data/preproc.txt

# $trg_file is filled in by IPython from the Python variable of that name.
!wc -l "$trg_file"

Read 8853 lines in file diplom/data/uniq.txt
Found 128 to be removed.
Indices (starting with 1) of lines to be kept are written to diplom/data/preproc_linenums.txt
8725 diplom/data/preproc.txt


In [8]:
# Remove near-duplicate sentence pairs.
# main() writes three files: the jaro scores (path_for_jaro), the 1-based
# numbers of the lines to keep (path_good_numbers) and the numbers of the
# lines to remove (path_bad_numbers). The actual filtering is done by awk.
from diplom.jaro_pairs_sumbyte import main

source_path = 'diplom/data/preproc.txt'
path_for_jaro = 'diplom/data/jaro_pair_scores.txt'
path_good_numbers = 'diplom/data/jaro_good_nums.txt'
path_bad_numbers = 'diplom/data/jaro_bad_nums.txt'

main(source_path, path_for_jaro, path_good_numbers, path_bad_numbers)

trg_file = 'diplom/data/jaro_pairs.txt'

# Keep only the lines of preproc.txt whose number is listed in
# jaro_good_nums.txt. The paths are hard-coded here and must stay in sync
# with path_good_numbers, source_path and trg_file above.
!awk 'NR == FNR {pos[$1]; next} FNR in pos' \
    'diplom/data/jaro_good_nums.txt' \
    'diplom/data/preproc.txt' \
    > 'diplom/data/jaro_pairs.txt'

# $trg_file is filled in by IPython from the Python variable of that name.
!wc -l "$trg_file"

Read diplom/data/preproc.txt
Wrote 8725 jaro scores to diplom/data/jaro_pair_scores.txt
Found 67 sentence pairs to be removed.
Indices (starting with 1) of lines to be kept are written to diplom/data/jaro_good_nums.txt
Use: awk 'NR == FNR {pos[$1]; next} FNR in pos' linenumbers sourcefile > targetfile.
Indices (starting with 1) of lines to be removed are written to diplom/data/jaro_bad_nums.txt
Use: awk 'NR == FNR {pos[$1]; next} FNR in pos' linenumbers sourcefile > targetfile.
8658 diplom/data/jaro_pairs.txt


In [9]:
# Remove near-duplicate lines in two stages:
#   1. main() scans source_path and records the numbers of the lines to
#      delete in path_for_del_numbers (plus two companion files).
#   2. filter_by_linenumber() copies source_path to target_path without
#      the recorded lines.
from diplom.jaro_lines_sumbytes import main
from diplom.filter_by_linenumber import filter_by_linenumber

# Input of this stage and its final output.
source_path = 'diplom/data/jaro_pairs.txt'
target_path = 'diplom/data/jaro_lines.txt'

# Files produced by main().
path_for_near_duplicates = 'diplom/data/see_sents_to_del.txt'
path_for_del_numbers = 'diplom/data/sentences_to_del_numbers.txt'
path_for_del_dict = 'diplom/data/sentences_to_del_numbers_dict.txt'

# Presumably toggles a test/debug mode inside main() -- confirm in
# diplom/jaro_lines_sumbytes.py.
test = False

main(
    source_path,
    path_for_near_duplicates,
    path_for_del_numbers,
    path_for_del_dict,
    test,
)

filter_by_linenumber(path_for_del_numbers, source_path, target_path)

------ Iteration 0 ---------
Sums counter: 5000
Shape of abs diff array: (5000, 5000)
Number of keys in sim dict: 1820
# of sent line numbers in sim dict (keys+vals): 2897
Dict of string sentences contains 2897 concatenated sents
------ Iteration 1 ---------
Sums counter: 3658
Shape of abs diff array: (3658, 3658)
Number of keys in sim dict: 742
# of sent line numbers in sim dict (keys+vals): 1306
Dict of string sentences contains 1306 concatenated sents
------ Iteration 2 ---------
Number of sentences to delete: 179
Wrote 179 numbers of lines to be deleted to diplom/data/sentences_to_del_numbers.txt
Use filter_by_linenumber.py


In [10]:
# This is the end of pre-processing: jaro_lines.txt is the final cleaned
# corpus, and this is its row count.
!wc -l diplom/data/jaro_lines.txt

8479 diplom/data/jaro_lines.txt


In [11]:
# Additional step, not related to pre-processing: split the single cleaned
# file into multiple files inside tsv_folder. The result is intended for
# use within the dtSearch software.
from diplom.tsv import main

# The target folder has to exist before main() runs. -p keeps this cell
# re-runnable: a plain mkdir reports an error when the folder is already
# there from a previous run.
!mkdir -p diplom/data/tsv_ru

tsv_folder = 'diplom/data/tsv_ru'  # created by the mkdir above
file_name = 'diplom/data/jaro_lines.txt'
# Presumably a folder prefix for file_name; left empty because file_name
# already carries its full relative path -- confirm in diplom/tsv.py.
data_folder = ''

main(file_name, data_folder, tsv_folder)

Dictionary length is 68
Total lines read from original files: 8479
Total lines written to all files: 8479


In [12]:
# Pack the whole tsv_ru folder into one archive for download
# (-r: recurse into the folder, -qq: suppress zip's progress output).
!zip -qq -r diplom/data/en_ru.zip diplom/data/tsv_ru

In [13]:
# Send the archive to the user's browser as a file download.
# google.colab is only importable inside a Colab runtime, so this cell
# will fail in a plain Jupyter environment.
from google.colab import files

files.download('diplom/data/en_ru.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>