In [4]:
import collections
import csv
import pandas as pd
from tqdm import tqdm
import re
import string

def get_word_pattern():
    """Build the compiled regex used to pull "words" out of raw text.

    A word is a maximal run of characters drawn from: Cyrillic letters
    (including the Kazakh-specific ones), ASCII letters, ASCII digits, a
    handful of currency/typographic symbols, and every character in
    ``string.punctuation``. Whitespace is not part of the class, so it acts
    as the separator; punctuation attached to a word stays attached.

    Returns:
        re.Pattern: pattern suitable for ``findall`` on a text string.
    """
    # NOTE(review): '»' is accepted below but '«' is not — confirm this is intentional.
    char_groups = (
        'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁабвгдежзийклмнопрстуфхцчшщъыьэюяёӘҒҚҢӨҰҮІҺәғқңөұүіһ',
        'a-zA-Z',
        '0-9',
        '»…£€¥¢฿₸₽№°—',
        # Escaped so that ']', '\\', '^' and '-' stay literal inside the class.
        re.escape(string.punctuation),
    )
    char_class = '[' + ''.join(char_groups) + ']'
    return re.compile(char_class + '+')

def process_chunk(chunk, word_counts, word_pattern):
    """Tokenize the Kazakh-flagged texts of one chunk and tally the tokens.

    Args:
        chunk: DataFrame with a ``text`` column and a ``contains_kaz_symbols``
            column; only rows where the latter equals 1 are processed.
        word_counts: ``collections.Counter`` updated in place with the tokens
            found in this chunk.
        word_pattern: compiled regex whose ``findall`` yields the tokens.

    Returns:
        None. The result is accumulated into ``word_counts``.
    """
    is_kazakh = chunk['contains_kaz_symbols'] == 1
    # Drop missing texts first: str(NaN) is 'nan', which would otherwise be
    # counted as a real word.
    texts = chunk.loc[is_kazakh, 'text'].dropna()
    for text in texts:
        word_counts.update(word_pattern.findall(str(text)))

def get_word_frequencies(in_file, chunk_size=1000):
    """Count token occurrences across a whole CSV file, reading it in chunks.

    Args:
        in_file: path to a CSV file providing the ``text`` and
            ``contains_kaz_symbols`` columns.
        chunk_size: number of CSV rows loaded per chunk.

    Returns:
        collections.Counter: mapping of token -> number of occurrences.
    """
    frequencies = collections.Counter()
    pattern = get_word_pattern()

    print(f'Processing {in_file} CSV file in chunks {chunk_size}...')
    # Only the two columns that process_chunk reads are loaded.
    reader = pd.read_csv(in_file, chunksize=chunk_size, usecols=['text', 'contains_kaz_symbols'])
    for frame in tqdm(reader):
        process_chunk(frame, frequencies, pattern)

    return frequencies

def _write_word_counts(out_path, word_counts, min_count=2):
    """Write ``word<TAB>count`` rows, most frequent first, to ``out_path``.

    Args:
        out_path: destination file path (overwritten if it exists).
        word_counts: ``collections.Counter`` of token frequencies.
        min_count: tokens occurring fewer times than this are skipped.
    """
    with open(out_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['word', 'count'])
        writer.writerows(
            (word, count)
            for word, count in word_counts.most_common()
            if count >= min_count
        )


if __name__ == '__main__':
    # Corpus statistics; the median column is the source of the per-dataset
    # values in `original_inputs` below.
    # Columns: Dataset Split | Domain | Number of texts in Split |
    #          Number of tokens in Split | Number of unique tokens in Split |
    #          Median number of tokens in text
    #   cc100-monolingual-crawled-data | Wikipedia articles | 19 635 580 |   441 623 321 | 6 217 337 |     12
    #   kazakhBooks                    | Books              |      8 423 |   351 433 586 | 7 245 720 | 40 264
    #   leipzig                        | Articles/News      |  1 706 485 |    26 494 864 | 1 109 113 |     14
    #   oscar                          | CommonCrawl        |    269 047 |   230 314 378 | 3 863 498 |    431
    #   kazakhNews                     | News               |  3 264 273 | 1 041 698 037 | 5 820 543 |    209

    # Target number of tokens per CSV chunk; divided by the median text length
    # to get a row count per chunk for each dataset.
    tokens_per_chunk = 50000000
    original_inputs = [
        ('leipzig', 12),
        ('oscar', 431),
        ('kazakhBooks', 40264),
        ('cc100-monolingual-crawled-data', 12),
        ('kazakhNews', 209),
    ]
    total_counter = collections.Counter()
    for in_file, median_tokens in original_inputs:
        chunk_size = tokens_per_chunk // median_tokens
        word_counts = get_word_frequencies(f'./corpus/{in_file}.csv', chunk_size=chunk_size)
        # Accumulate in place; `total_counter + word_counts` would rebuild the
        # whole (multi-million entry) counter on every dataset.
        total_counter.update(word_counts)

        print('Writing to output file...')
        _write_word_counts(f'./corpus_output/{in_file}_words.csv', word_counts)

    print('Writing to total output file...')
    _write_word_counts('./corpus_output/total_words.csv', total_counter)

Processing ./corpus/leipzig.csv CSV file in chunks 4166666...


1it [00:13, 13.12s/it]


Writing to output file...
Processing ./corpus/oscar.csv CSV file in chunks 116009...


3it [01:30, 30.13s/it]


Writing to output file...
Processing ./corpus/kazakhBooks.csv CSV file in chunks 1241...


7it [02:00, 17.28s/it]


Writing to output file...
Processing ./corpus/cc100-monolingual-crawled-data.csv CSV file in chunks 4166666...


5it [03:42, 44.48s/it]


Writing to output file...
Processing ./corpus/kazakhNews.csv CSV file in chunks 239234...


14it [02:37, 11.22s/it]


Writing to output file...
Writing to total output file...


In [6]:
import math

# Read the word column strictly as text and turn off default NA detection.
# With the pandas defaults, tokens such as "NA", "null" or "nan" are parsed as
# missing values (and would then be written out as the string "nan"), and
# digit-only tokens can be coerced to numbers, losing e.g. leading zeros.
df = pd.read_csv(
    './corpus_output/total_words.csv',
    sep='\t',
    dtype={'word': str},
    keep_default_na=False,
)
# .copy() so that adding a column below does not write into a filtered view.
df = df[df['count'] >= 2].copy()
df['word_len'] = df['word'].str.len()
df = df[df['word_len'] < 26]

# Write only the words to a txt file, one per line, each repeated
# ceil(ln(count / 10)) times. Words with count <= 10 therefore get zero
# repetitions and are left out of the file.
with open('corpus_words.txt', 'w', encoding='utf-8') as f:
    # Zipping the two columns avoids building a Series per row as iterrows() does.
    for word, count in tqdm(zip(df['word'], df['count']), total=len(df)):
        repeats = math.ceil(math.log(count / 10))
        if repeats > 0:
            f.write(f'{word}\n' * repeats)

100%|██████████| 8842087/8842087 [04:37<00:00, 31821.01it/s]
