In [10]:
%%time

import collections
import csv
import pandas as pd
from tqdm import tqdm
import string

def get_all_symbols():
    """Build the set of characters a word is allowed to consist of.

    Returns:
        set: one-character strings covering the ASCII digits, ASCII
        punctuation plus a handful of extra typographic and currency marks,
        the space character, the Cyrillic letters (including the
        Kazakh-specific ones) and the ASCII letters.
    """
    cyrillic = 'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁабвгдежзийклмнопрстуфхцчшщъыьэюяёӘҒҚҢӨҰҮІҺәғқңөұүіһ'
    extra_marks = "«»…£€¥¢฿₸₽№°—"
    # Concatenate every allowed character once and let set() deduplicate.
    alphabet = "".join((
        string.digits,
        string.punctuation,
        extra_marks,
        ' ',
        cyrillic,
        string.ascii_letters,
    ))
    return set(alphabet)

def process_chunk(chunk, word_counts, all_symbols):
    """Add the word counts of one CSV chunk to ``word_counts``.

    Only rows whose ``contains_kaz_symbols`` column equals 1 are used. Each
    text is split on whitespace, and a word is counted only when every one of
    its characters is in ``all_symbols``.

    Args:
        chunk: pandas DataFrame with ``text`` and ``contains_kaz_symbols``
            columns.
        word_counts: ``collections.Counter`` updated in place.
        all_symbols: collection of allowed characters.
    """
    is_selected = chunk['contains_kaz_symbols'] == 1
    # Missing texts are NaN; str(NaN) would otherwise be counted as the
    # word "nan", which consists solely of allowed ASCII letters.
    for text in chunk.loc[is_selected, 'text'].dropna():
        words = str(text).split()
        word_counts.update(word for word in words if set(word).issubset(all_symbols))

def get_word_frequencies(in_file, out_file, chunk_size=1000):
    """Count words in a CSV corpus and write the frequent ones to a TSV file.

    The input is read in chunks of ``chunk_size`` rows, loading only the
    ``text`` and ``contains_kaz_symbols`` columns. The output starts with a
    ``word``/``count`` header row, is tab-separated, is ordered from the most
    to the least frequent word, and contains only words counted at least
    10 times.

    Args:
        in_file: path of the CSV file to read.
        out_file: path of the tab-separated file to write.
        chunk_size: number of CSV rows processed per chunk.
    """
    symbols = get_all_symbols()
    counts = collections.Counter()

    print('Processing CSV file in chunks...')
    reader = pd.read_csv(in_file, chunksize=chunk_size, usecols=['text', 'contains_kaz_symbols'])
    for frame in tqdm(reader):
        process_chunk(frame, counts, symbols)

    print('Writing to output file...')
    with open(out_file, 'w', newline='', encoding='utf-8') as sink:
        tsv = csv.writer(sink, delimiter='\t')
        tsv.writerow(['word', 'count'])
        for word, count in counts.most_common():
            if count < 10:
                continue
            tsv.writerow((word, count))

if __name__ == '__main__':
    # Count words of the books corpus, reading 1000 CSV rows at a time.
    get_word_frequencies(
        in_file='./corpus/kazakhBooks.csv',
        out_file='output_word_frequencies.csv',
        chunk_size=1000,
    )

Processing CSV file in chunks...


5it [03:30, 42.14s/it]


KeyboardInterrupt: 

In [11]:
import collections
import csv
import pandas as pd
from tqdm import tqdm
import re
import string

def get_word_pattern():
    """Compile the regular expression used to extract words from a text.

    A match is a maximal run of allowed characters: Cyrillic letters
    (including the Kazakh-specific ones), ASCII letters and digits, ASCII
    punctuation, and a few extra typographic and currency marks. Whitespace
    and any other character are not in the class, so they end a match.

    Returns:
        re.Pattern: compiled pattern intended for ``findall``.
    """
    kazakh_letters = 'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁабвгдежзийклмнопрстуфхцчшщъыьэюяёӘҒҚҢӨҰҮІҺәғқңөұүіһ'
    english_letters = 'a-zA-Z'
    numbers = '0-9'
    # Both guillemets are allowed; without the opening one a quoted word
    # such as «сөз» would be extracted as "сөз»".
    other_symbols = '«»…£€¥¢฿₸₽№°—'
    # Escaped so that ], \, ^ and - are literals inside the character class.
    punctuation = re.escape(string.punctuation)

    valid_chars = f'[{kazakh_letters}{english_letters}{numbers}{other_symbols}{punctuation}]'
    return re.compile(f'{valid_chars}+')

def process_chunk(chunk, word_counts, word_pattern):
    """Add the word counts of one CSV chunk to ``word_counts``.

    Only rows whose ``contains_kaz_symbols`` column equals 1 are used. Every
    match of ``word_pattern`` in a text is counted as one word.

    Args:
        chunk: pandas DataFrame with ``text`` and ``contains_kaz_symbols``
            columns.
        word_counts: ``collections.Counter`` updated in place.
        word_pattern: compiled regular expression whose ``findall`` yields
            the words of a text.
    """
    is_selected = chunk['contains_kaz_symbols'] == 1
    # Missing texts are NaN; str(NaN) is "nan", which would otherwise be
    # matched and counted as a word.
    for text in chunk.loc[is_selected, 'text'].dropna():
        word_counts.update(word_pattern.findall(str(text)))

def get_word_frequencies(in_file, out_file, chunk_size=1000):
    """Count words in a CSV corpus and write the frequent ones to a TSV file.

    The input is read in chunks of ``chunk_size`` rows, loading only the
    ``text`` and ``contains_kaz_symbols`` columns. The output starts with a
    ``word``/``count`` header row, is tab-separated, is ordered from the most
    to the least frequent word, and contains every word counted at least
    10 times.

    Args:
        in_file: path of the CSV file to read.
        out_file: path of the tab-separated file to write.
        chunk_size: number of CSV rows processed per chunk.
    """
    word_counts = collections.Counter()
    word_pattern = get_word_pattern()

    print('Processing CSV file in chunks...')
    for chunk in tqdm(pd.read_csv(in_file, chunksize=chunk_size, usecols=['text', 'contains_kaz_symbols'])):
        process_chunk(chunk, word_counts, word_pattern)

    print('Writing to output file...')
    with open(out_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['word', 'count'])
        # most_common() without n returns all words; passing n would cap the
        # output at the n most frequent words regardless of the threshold.
        writer.writerows((word, count) for word, count in word_counts.most_common() if count >= 10)

if __name__ == '__main__':
    # Count words of the books corpus, reading 1000 CSV rows at a time.
    get_word_frequencies(
        in_file='./corpus/kazakhBooks.csv',
        out_file='./corpus/corpus_words.csv',
        chunk_size=1000,
    )

Processing CSV file in chunks...


9it [02:04, 13.85s/it]


Writing to output file...


In [20]:
import math
# Read the word column strictly as text. With pandas' default NA handling,
# words such as "null", "NA" or "nan" are parsed as missing values and would
# then be written out as the string "nan".
df = pd.read_csv(
    './corpus/corpus_words.csv',
    sep='\t',
    dtype={'word': str},
    keep_default_na=False,
)

# Write the words to a plain-text file, one per line, repeating each word
# ceil(ln(count / 10)) times. A word with count == 10 therefore gets zero
# lines (ln(1) == 0). Words longer than 25 characters are skipped.
with open('corpus_words.txt', 'w', encoding='utf-8') as f:
    # Iterating the two columns directly avoids building a Series per row.
    for word, count in tqdm(zip(df['word'], df['count']), total=len(df)):
        if len(word) > 25:
            continue
        repeats = math.ceil(math.log(count / 10))
        f.write(f'{word}\n' * repeats)

100%|██████████| 1009266/1009266 [00:34<00:00, 29360.98it/s]
