In [1]:
import os
import shutil
import re
import glob
import errno
import codecs
import chardet
from collections import Counter
from dz07lib import create_or_check_path, detect_encoding, read_and_modify_vocab, reverse_dict
import numpy as np

In [2]:
# Directory that is globbed for the raw input texts.
raw_data_dir = "data/raw_eng/"
# Directory that receives one cleaned ".txt" file per raw input file.
clean_data_dir = "data/clean_eng/"
# Vocabulary file: the most frequent words, one per line.
vocab_output_filepath = "data/vocab.txt"
# Concatenation of all cleaned files (a newline is written after each file).
total_clean_text_filepath = "data/total_clean_text.txt"
# Same corpus with every word replaced by its integer id, space separated.
total_text_wordnum_filepath = "data/total_clean_text.wordnum"
# Target passed to np.save; numpy appends ".npy" because the name has no extension.
total_wordnums_npy_filepath = "data/total_wordnums"

# Upper bound on how many of the most frequent words are written to the vocabulary.
VOCAB_WORDS_COUNT = 35000

In [3]:
# Collect every raw file whose name contains a dot. glob makes no ordering
# promise (it depends on the file system), so the result is sorted to keep
# the processing order the same from run to run. os.path.join avoids the
# redundant separators that plain string concatenation produced.
raw_data_filepaths = sorted(glob.glob(os.path.join(raw_data_dir, "*.*")))
len(raw_data_filepaths)

1

In [4]:
def remove_empty_lines(filename, encoding=None):
    """Rewrite ``filename`` in place without blank or whitespace-only lines.

    The surviving lines are collected first and written back afterwards, so
    the file is never read and written through two open handles at once.
    The whole file is held in memory while it is rewritten.

    Args:
        filename: Path of the text file to rewrite.
        encoding: Text encoding used for both reading and writing; ``None``
            (the default) keeps the platform default, as before.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file cannot be decoded with ``encoding``.
    """
    with open(filename, encoding=encoding) as in_file:
        # A line is kept when anything is left after stripping whitespace.
        kept_lines = [line for line in in_file if line.strip()]
    # Mode "w" truncates, so no explicit truncate() call is needed.
    with open(filename, "w", encoding=encoding) as out_file:
        out_file.writelines(kept_lines)

In [5]:
# Write a cleaned copy of every raw file into clean_data_dir.
create_or_check_path(clean_data_dir)
for path in raw_data_filepaths:
    # The output keeps the raw file name and gets an extra ".txt" suffix.
    clean_path = os.path.join(clean_data_dir, os.path.basename(path) + ".txt")
    try:
        # Raw files are not guaranteed to be in the platform default encoding,
        # so it is detected per file, the same way the corpus file is opened
        # further down in this notebook.
        # NOTE(review): detect_encoding comes from dz07lib; it is assumed to
        # return a value accepted by open(encoding=...) - confirm.
        with open(path, "rt", encoding=detect_encoding(path)) as raw_file:
            raw_text = raw_file.read()
        # Keep only runs of Latin letters and spaces, collapse repeated spaces
        # and lower-case the result. Newlines are dropped by the pattern, so
        # the cleaned text is a single line.
        # (For Russian input the pattern would be r'([а-яА-ЯёЁ ]+)'.)
        letter_runs = re.findall(r'([a-zA-Z ]+)', raw_text)
        clean_text = re.sub(r" +", " ", " ".join(letter_runs)).lower()
        with open(clean_path, "wt") as clean_file:
            clean_file.write(clean_text)
        # Because the text is one line, this only empties a file whose whole
        # content is blank.
        remove_empty_lines(clean_path)
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(path, e)

In [6]:
# Collect the cleaned files. The order matters downstream: it fixes the order
# of the concatenated corpus and, because Counter.most_common breaks ties by
# first occurrence, the order of equally frequent words in the vocabulary.
# glob does not guarantee any order, so the list is sorted to make the run
# reproducible.
clean_data_filepaths = sorted(glob.glob(os.path.join(clean_data_dir, "*.*")))
len(clean_data_filepaths)

1

In [7]:
# Concatenate all cleaned files into one corpus file and count word frequencies.
word_counter = Counter()

# Mode "wt" truncates a corpus left over from an earlier run. The file is
# opened once for the whole loop instead of being reopened in append mode for
# every input file.
with open(total_clean_text_filepath, "wt") as output_file:
    for path in clean_data_filepaths:
        with open(path, "rt") as file:
            text = file.read()
        # Each cleaned file is followed by a newline in the corpus.
        output_file.write(text)
        output_file.write("\n")
        # Words are whatever str.split() yields (whitespace separated).
        word_counter.update(text.split())

In [8]:
# Number of distinct words counted across all cleaned files.
len(word_counter)

47689

In [9]:
# Keep at most VOCAB_WORDS_COUNT words, most frequent first; the counts
# themselves are discarded.
# No special tokens are prepended here. The commented-out draft that built
# ["reserved0", "<S>", "<UNK>", "</S>", "reserved4"] + words by hand is dropped.
most_common_words = [
    word for word, _count in word_counter.most_common(VOCAB_WORDS_COUNT)
]

In [10]:
# Store the vocabulary as plain text: one word per line, most frequent first,
# with no newline after the last word.
with open(vocab_output_filepath, mode="wt") as output_file:
    output_file.write("\n".join(most_common_words))

In [11]:
# Read the vocabulary file back as a word -> integer id mapping.
# NOTE(review): read_and_modify_vocab lives in dz07lib. Judging by the
# displayed result the corpus words start at id 5, so it presumably reserves
# ids 0-4 for special tokens - confirm in the helper.
word_to_idx_vocab = read_and_modify_vocab(vocab_output_filepath)
# Displays the whole mapping.
word_to_idx_vocab

{'the': 5,
 'to': 6,
 'a': 7,
 'and': 8,
 'of': 9,
 'in': 10,
 's': 11,
 'that': 12,
 'for': 13,
 'on': 14,
 'is': 15,
 'with': 16,
 'was': 17,
 'it': 18,
 'two': 19,
 'said': 20,
 'as': 21,
 'he': 22,
 'one': 23,
 'at': 24,
 'by': 25,
 'from': 26,
 'be': 27,
 'has': 28,
 'his': 29,
 'have': 30,
 'i': 31,
 'thousand': 32,
 'are': 33,
 'but': 34,
 'an': 35,
 'not': 36,
 'they': 37,
 'will': 38,
 'hundred': 39,
 'who': 40,
 'this': 41,
 'five': 42,
 'three': 43,
 'their': 44,
 'had': 45,
 't': 46,
 'were': 47,
 'u': 48,
 'four': 49,
 'more': 50,
 'been': 51,
 'which': 52,
 'point': 53,
 'or': 54,
 'twenty': 55,
 'year': 56,
 'its': 57,
 'would': 58,
 'new': 59,
 'after': 60,
 'about': 61,
 'up': 62,
 'six': 63,
 'we': 64,
 'p': 65,
 'seven': 66,
 'eight': 67,
 'c': 68,
 'than': 69,
 'when': 70,
 'you': 71,
 'also': 72,
 'dollars': 73,
 'out': 74,
 'nine': 75,
 'there': 76,
 'her': 77,
 'she': 78,
 'first': 79,
 'all': 80,
 'people': 81,
 'last': 82,
 'm': 83,
 'over': 84,
 'if': 85,
 'th

In [12]:
# Build the inverse mapping (id -> word) and look up ids 0..9 as a quick
# sanity check; an id without an entry shows up as None.
idx_to_word_vocab = reverse_dict(word_to_idx_vocab)
list(map(idx_to_word_vocab.get, range(10)))

['<filler>',
 '<S>',
 '<UNK>',
 '</S>',
 '<RESERVED>',
 'the',
 'to',
 'a',
 'and',
 'of']

In [13]:
# Translate the corpus into word ids: one output line per non-empty input
# line, ids separated by single spaces.
cur_enc = detect_encoding(total_clean_text_filepath)
# The fallback id for out-of-vocabulary words never changes, so it is looked
# up once instead of once per token.
unk_idx = word_to_idx_vocab["<UNK>"]
with open(total_clean_text_filepath, encoding=cur_enc, mode="rt") as input_file, \
        open(total_text_wordnum_filepath, mode="wt") as output_file:
    for line in input_file:
        word_nums = [str(word_to_idx_vocab.get(word, unk_idx)) for word in line.split()]
        # Lines without any word are skipped rather than written as blank lines.
        if word_nums:
            output_file.write(" ".join(word_nums) + "\n")

In [14]:
# Load every word id from the encoded corpus into one flat list.
# The wordnum file holds one line per non-empty corpus line (one per cleaned
# file), so the whole file is read: readline() would return only the first
# line and silently drop every file after the first.
with open(total_text_wordnum_filepath) as file:
    wornumds_split = [int(token) for token in file.read().split()]

In [15]:
# Persist the word ids as an int32 array. np.save appends ".npy" to a file
# name that does not already end with it, so the data ends up in
# total_wordnums_npy_filepath + ".npy".
np.save(total_wordnums_npy_filepath, np.array(wornumds_split, dtype=np.int32))

In [16]:
# np.load(total_wordnums_npy_filepath+".npy")

In [17]:
# Display the list of word ids that was passed to np.save (prints the whole list).
wornumds_split

[13,
 23,
 725,
 5,
 308,
 6188,
 12,
 22,
 15486,
 17,
 6,
 7,
 4543,
 3215,
 5,
 728,
 9,
 7,
 7252,
 199,
 354,
 9528,
 8,
 28,
 98,
 4544,
 671,
 10,
 5,
 737,
 351,
 391,
 2157,
 25,
 3141,
 25566,
 25567,
 8,
 203,
 8818,
 45,
 7696,
 35,
 1869,
 101,
 4851,
 6,
 93,
 8219,
 71,
 112,
 30,
 6,
 331,
 6,
 6856,
 18,
 358,
 59,
 202,
 115,
 49,
 48,
 65,
 31,
 48,
 11,
 530,
 2461,
 62,
 287,
 243,
 14,
 1483,
 308,
 233,
 10,
 5,
 5937,
 9,
 5,
 1522,
 2124,
 246,
 68,
 68,
 31,
 270,
 46,
 244,
 18916,
 169,
 9,
 35,
 11661,
 10,
 936,
 9,
 526,
 31,
 299,
 7,
 609,
 9,
 81,
 113,
 6,
 2061,
 23,
 34,
 31,
 270,
 46,
 244,
 13237,
 4272,
 85,
 71,
 675,
 9529,
 8,
 6498,
 729,
 4159,
 190,
 3540,
 389,
 60,
 116,
 169,
 1450,
 135,
 105,
 69,
 71,
 47,
 10,
 19,
 32,
 49,
 70,
 474,
 47,
 345,
 1414,
 753,
 141,
 71,
 5462,
 1137,
 71,
 147,
 430,
 50,
 686,
 1138,
 33,
 13238,
 13,
 25568,
 1286,
 20,
 5,
 249,
 1037,
 271,
 243,
 43,
 324,
 30,
 51,
 2462,
 26,
 1245,
 783,
 10