The purpose of this notebook is to preprocess the corpus.

What we want to do:
* Transliterate Cyrillic into Latin
* Remove punctuation (`,)(.:;` plus exotic quotation marks) and other unwanted symbols such as emoji (the full list is in `chars_to_remove`)

In [1]:
import os
import logging

# Timestamped INFO-level logging; used further down to report when each file starts processing.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)


# Data directories (absolute paths on the author's machine; adjust before running elsewhere).
# raw_dir holds the full input corpora; interim_dir receives the toy subsets and all
# preprocessed output. final_dir is defined here but not used in the visible part of the notebook.
raw_dir = "/home/peterr/macocu/taskB/data/raw"
interim_dir = "/home/peterr/macocu/taskB/data/interim"
final_dir = "/home/peterr/macocu/taskB/data/final"


# File names of the corpora to process, as they appear inside raw_dir.
files = ["bswac","cnrwac","hrwac","srwac"]

## Creating a toy dataset for testing purposes:

In [2]:
%%bash
# For every corpus, copy its first 1,000,000 lines from data/raw into
# data/interim under the name "<corpus>_head" (the toy dataset).
for file in {bswac,cnrwac,hrwac,srwac}
do
head -n 1000000 "/home/peterr/macocu/taskB/data/raw/$file" > "/home/peterr/macocu/taskB/data/interim/$file"_head
done


In [3]:
# Build the inventory of distinct characters that occur in the cnrwac toy subset.
cnr_path = os.path.join(interim_dir, "cnrwac_head")
chars = set()
# Decode explicitly as UTF-8 instead of relying on the platform default encoding:
# the corpus contains non-ASCII characters (Cyrillic, typographic punctuation, emoji).
# NOTE(review): assumes the corpus files are UTF-8 encoded; confirm for all corpora.
with open(cnr_path, "r", encoding="utf-8") as f:
    # Accumulate per line so the whole file never has to sit in memory as one string.
    for line in f:
        chars.update(line)

In [4]:
# chars was inspected and a collection of unwanted characters was chosen.
# See chars_to_remove

In [6]:
# Characters that are stripped from the corpora before transliteration.
# The set was chosen by manually inspecting `chars` (the character inventory of
# cnrwac_head) and mixes ASCII punctuation, typographic quotes and dashes,
# assorted symbols, and a few emoji. Every entry is a single character.
chars_to_remove = {
    '!',
    '"',
    '#',
    '%',
    '&',
    "'",
    '(',
    ')',
    '*',
    '+',
    ',',
    '-',
    '.',
    '/',
    ':',
    ';',
    '<',
    '=',
    '>',
    '?',
    '[',
    ']',
    '_',
    '`',
    '«',
    '°',
    '²',
    '³',
    'µ',
    '·',
    '»',
    '½',
    '‑',
    '–',
    '‘',
    '’',
    '“',
    '”',
    '„',
    '•',
    '…',
    '‰',
    '″',
    '₂',
    '₃',
    '€',
    '™',
    '→',
    '−',
    '∕',
    '😀',
    '😉',
    '🙁',
    '🙂'

}


from typing import Set
def remove_chars(input_text: str, chars_to_remove: Set[str]) -> str:
    """Return ``input_text`` with every occurrence of the given characters deleted.

    Args:
        input_text: Text to clean.
        chars_to_remove: Strings to delete. Single-character entries are removed
            together in one pass over the text; entries longer than one character
            are removed afterwards as whole substrings. Empty strings are ignored.

    Returns:
        The cleaned text. If nothing matches, the text is returned unchanged.
    """
    # One translate() pass replaces a separate full-text replace() (and a new
    # string allocation) per character, which matters on multi-gigabyte corpora.
    single_chars = "".join(c for c in chars_to_remove if len(c) == 1)
    if single_chars:
        input_text = input_text.translate(str.maketrans("", "", single_chars))
    # Multi-character entries cannot be expressed as a translation table,
    # so they keep the substring-replacement behaviour.
    for token in chars_to_remove:
        if len(token) > 1:
            input_text = input_text.replace(token, "")
    return input_text

def transliterate(input_text: str) -> str:
    """Transliterate ``input_text`` with the ``transliterate`` package's "sr" language pack.

    The call uses ``reversed=True``; per the notebook's stated goal this converts
    Cyrillic text into Latin script.

    Args:
        input_text: Text to transliterate.

    Returns:
        The string returned by ``translit``.
    """
    # Imported lazily, so the third-party package is only needed once this function is called.
    from transliterate import translit

    latin_text = translit(input_text, "sr", reversed=True)
    return latin_text

def _preprocess_file(input_path: str, output_path: str) -> None:
    """Strip unwanted characters from one file, transliterate it, and write the result.

    The whole file is read into memory, passed through ``remove_chars`` (using the
    module-level ``chars_to_remove``) and then ``transliterate``, and written to
    ``output_path``, overwriting any existing file.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the preprocessed file to write.
    """
    logging.info(f"Started preprocessing file {os.path.basename(input_path)}.")
    # Explicit UTF-8 for reading and writing: the corpora contain non-ASCII text,
    # so the platform default encoding must not be relied upon.
    # NOTE(review): assumes the corpus files are UTF-8 encoded; confirm.
    with open(input_path, "r", encoding="utf-8") as f:
        text = f.read()
    text = remove_chars(text, chars_to_remove)
    text = transliterate(text)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)


# Sort out the subsets: <name>_head -> <name>_head_pp, both inside interim_dir.
for name in files:
    head_name = name + "_head"
    _preprocess_file(
        os.path.join(interim_dir, head_name),
        os.path.join(interim_dir, head_name + "_pp"),
    )


# Sort out the whole files: raw_dir/<name> -> interim_dir/<name>_pp.
for name in files:
    _preprocess_file(
        os.path.join(raw_dir, name),
        os.path.join(interim_dir, name + "_pp"),
    )

2021-12-29 16:08:34,656 - Started preprocessing file bswac_head.
2021-12-29 16:08:56,024 - Started preprocessing file cnrwac_head.
2021-12-29 16:10:11,861 - Started preprocessing file hrwac_head.
2021-12-29 16:10:32,148 - Started preprocessing file srwac_head.
2021-12-29 16:10:52,992 - Started preprocessing file bswac.
2021-12-29 16:15:34,130 - Started preprocessing file cnrwac.
2021-12-29 16:18:21,212 - Started preprocessing file hrwac.
