In [2]:
import collections
import csv
import pandas as pd
from tqdm import tqdm
import re
import string

def get_word_pattern():
    """Build the compiled regex used to pull tokens out of raw text.

    A token is a maximal run of "valid" characters: ASCII digits, ASCII
    punctuation plus a few typographic symbols, Cyrillic/Kazakh letters and
    ASCII letters. Anything else (whitespace included) separates tokens.

    Returns:
        re.Pattern: pattern of the form ``[<escaped valid characters>]+``.
    """
    cyrillic_alphabet = (
        'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁ'
        'абвгдежзийклмнопрстуфхцчшщъыьэюяё'
        'ӘҒҚҢӨҰҮІҺ'
        'әғқңөұүіһ'
    )
    latin_alphabet = string.ascii_uppercase + string.ascii_lowercase
    symbols = string.punctuation + "«»…№°—"

    # Historical note kept from the original author: "before 8885443 words".

    # Order matters only for the textual form of the pattern: digits, then
    # symbols, then Cyrillic letters, then Latin letters.
    alphabet = string.digits + symbols + cyrillic_alphabet + latin_alphabet

    # re.escape neutralises characters such as ']', '^', '-' and '\\' that
    # would otherwise be special inside a character class.
    return re.compile('[' + re.escape(alphabet) + ']+')

def process_chunk(chunk, word_counts, word_pattern):
    """Tokenise the ``text`` column of one chunk and add the tokens to a counter.

    Args:
        chunk: DataFrame-like object whose ``'text'`` column is a pandas Series.
        word_counts: ``collections.Counter`` updated in place.
        word_pattern: compiled regex; every ``findall`` match counts as one token.

    Missing cells (NaN/None) are skipped. Without this, ``str()`` would turn
    them into the literal strings "nan"/"None", which would then be counted
    as if they were words from the corpus.
    """
    for text in chunk['text'].dropna():
        # str() keeps non-string cells (e.g. numbers) countable as tokens.
        word_counts.update(word_pattern.findall(str(text)))

def get_word_frequencies(in_file, chunk_size=1000):
    """Count token occurrences in the ``text`` column of a CSV file.

    The file is streamed in chunks of ``chunk_size`` rows so it never has to
    fit in memory at once.

    Args:
        in_file: path of the CSV file to read.
        chunk_size: number of rows per chunk handed to ``process_chunk``.

    Returns:
        collections.Counter: token -> number of occurrences across the file.
    """
    pattern = get_word_pattern()
    totals = collections.Counter()

    print(f'Processing {in_file} CSV file in chunks {chunk_size}...')
    # NOTE(review): 'contains_kaz_symbols' is loaded but not used in this
    # function; it is kept so the read still requires that column to exist.
    reader = pd.read_csv(
        in_file,
        chunksize=chunk_size,
        usecols=['text', 'contains_kaz_symbols'],
    )
    for frame in tqdm(reader):
        process_chunk(frame, totals, pattern)

    return totals

def _write_word_counts(path, counter, min_count=2):
    """Dump a word-frequency table to a tab-separated file.

    Rows are written in descending count order (``Counter.most_common``)
    under a ``word``/``count`` header. Entries whose count is below
    ``min_count`` are left out.
    """
    with open(path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(['word', 'count'])
        writer.writerows(
            (word, count) for word, count in counter.most_common() if count >= min_count
        )


if __name__ == '__main__':
    import os

    # Corpus statistics the chunk sizes below are derived from:
    #
    # Dataset split                  | Domain             | Texts      | Tokens        | Unique tokens | Median tokens per text
    # cc100-monolingual-crawled-data | Wikipedia articles | 19 635 580 | 441 623 321   | 6 217 337     | 12
    # kazakhBooks                    | Books              | 8 423      | 351 433 586   | 7 245 720     | 40 264
    # leipzig                        | Articles/News      | 1 706 485  | 26 494 864    | 1 109 113     | 14
    # oscar                          | CommonCrawl        | 269 047    | 230 314 378   | 3 863 498     | 431
    # kazakhNews                     | News               | 3 264 273  | 1 041 698 037 | 5 820 543     | 209
    tokens_per_chunk = 50000000
    output_dir = './corpus_output'
    # (file stem under ./corpus, token count per text used to size the chunks)
    original_inputs = [
        ('leipzig', 12),
        ('oscar', 431),
        ('kazakhBooks', 40264),
        ('cc100-monolingual-crawled-data', 12),
        ('kazakhNews', 209),
    ]

    # Create the output directory up front so a missing folder cannot make
    # the run fail only after the first (long) corpus has been processed.
    os.makedirs(output_dir, exist_ok=True)

    total_counter = collections.Counter()
    for in_file, median_tokens in original_inputs:
        # Rows per chunk chosen so that a chunk of median-length texts holds
        # about `tokens_per_chunk` tokens.
        chunk_size = tokens_per_chunk // median_tokens
        word_counts = get_word_frequencies(f'./corpus/{in_file}.csv', chunk_size=chunk_size)
        # In-place merge; `a = a + b` would rebuild the whole counter each time.
        total_counter.update(word_counts)

        print('Writing to output file...')
        _write_word_counts(f'{output_dir}/{in_file}_words.csv', word_counts)

    print('Writing to total output file...')
    _write_word_counts(f'{output_dir}/total_words.csv', total_counter)

Processing ./corpus/leipzig.csv CSV file in chunks 4166666...


1it [00:15, 15.76s/it]


Writing to output file...
Processing ./corpus/oscar.csv CSV file in chunks 116009...


3it [02:18, 46.31s/it]


Writing to output file...
Processing ./corpus/kazakhBooks.csv CSV file in chunks 1241...


7it [03:39, 31.39s/it]


Writing to output file...
Processing ./corpus/cc100-monolingual-crawled-data.csv CSV file in chunks 4166666...


5it [04:34, 54.88s/it]


Writing to output file...
Processing ./corpus/kazakhNews.csv CSV file in chunks 239234...


14it [09:27, 40.51s/it]


Writing to output file...
Writing to total output file...


In [None]:
import math
import pandas as pd
from tqdm import tqdm

# Load the merged frequency table.
# - dtype={'word': str} stops pandas from type-inferring digit-only tokens as
#   numbers (which would, for example, strip leading zeros).
# - keep_default_na=False stops real tokens such as "NA", "null", "nan" or
#   "None" from being parsed as missing values.
df = pd.read_csv(
    './corpus_output/total_words.csv',
    sep='\t',
    dtype={'word': str},
    keep_default_na=False,
)

# Keep words seen at least 5 times and shorter than 26 characters. A single
# boolean mask avoids adding and dropping a helper column on a filtered frame.
df = df[(df['count'] >= 5) & (df['word'].str.len() < 26)]


# Latin letters paired with the Cyrillic letters that look the same.
# The Cyrillic side is written as explicit code points so the two scripts
# cannot be confused when reading or editing this table.
_HOMOGLYPH_PAIRS = (
    ('a', '\u0430'),  # CYRILLIC SMALL LETTER A
    ('A', '\u0410'),  # CYRILLIC CAPITAL LETTER A
    ('e', '\u0435'),  # CYRILLIC SMALL LETTER IE
    ('E', '\u0415'),  # CYRILLIC CAPITAL LETTER IE
    ('i', '\u0456'),  # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
    ('I', '\u0406'),  # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
    ('o', '\u043e'),  # CYRILLIC SMALL LETTER O
    ('O', '\u041e'),  # CYRILLIC CAPITAL LETTER O
    ('p', '\u0440'),  # CYRILLIC SMALL LETTER ER
    ('P', '\u0420'),  # CYRILLIC CAPITAL LETTER ER
    ('c', '\u0441'),  # CYRILLIC SMALL LETTER ES
    ('C', '\u0421'),  # CYRILLIC CAPITAL LETTER ES
    ('y', '\u0443'),  # CYRILLIC SMALL LETTER U
    ('Y', '\u0423'),  # CYRILLIC CAPITAL LETTER U
    ('x', '\u0445'),  # CYRILLIC SMALL LETTER HA
    ('X', '\u0425'),  # CYRILLIC CAPITAL LETTER HA
    ('H', '\u041d'),  # CYRILLIC CAPITAL LETTER EN
    ('K', '\u041a'),  # CYRILLIC CAPITAL LETTER KA
    ('M', '\u041c'),  # CYRILLIC CAPITAL LETTER EM
    ('T', '\u0422'),  # CYRILLIC CAPITAL LETTER TE
    ('B', '\u0412'),  # CYRILLIC CAPITAL LETTER VE
)

# Latin -> Cyrillic lookup for the look-alike letters above.
latin_to_cyrillic = dict(_HOMOGLYPH_PAIRS)

# Inverse lookup (Cyrillic -> Latin); not used by normalize_word below.
cyrillic_to_latin = {cyrillic: latin for latin, cyrillic in _HOMOGLYPH_PAIRS}

# Translation table consumed by str.translate.
trans_table = str.maketrans(latin_to_cyrillic)


def normalize_word(word):
    """Return ``word`` with look-alike Latin letters replaced by Cyrillic ones.

    Words that differ only in which script a look-alike letter comes from
    map to the same result. Characters without an entry in the table are
    left untouched.
    """
    return word.translate(trans_table)

# Work on a frame whose `word` column is guaranteed to hold strings. `assign`
# builds a new frame instead of writing into the (filtered) `df`, which
# avoids pandas' SettingWithCopyWarning.
data = df.assign(word=df['word'].astype(str))

# size before
print(data.shape)

# normalized form -> (most frequent original spelling, its count)
normalized_dict = {}

# Iterate over the two columns directly: DataFrame.iterrows() builds a Series
# object for every row, which is very slow on millions of rows.
for word, count in tqdm(zip(data['word'], data['count']), total=data.shape[0]):
    normalized_word = normalize_word(word)
    best = normalized_dict.get(normalized_word)
    # Strict '<' keeps the first spelling seen when counts are tied.
    if best is None or best[1] < count:
        normalized_dict[normalized_word] = (word, count)

# One (word, count) pair per group, highest count first. sorted() is stable,
# so groups with equal counts keep the order in which they were first seen.
final_words = sorted(normalized_dict.values(), key=lambda item: item[1], reverse=True)

# Convert the result to a DataFrame
final_df = pd.DataFrame(final_words, columns=['word', 'count'])

# size after
print(final_df.shape)

# Save the de-duplicated frequency table
final_df.to_csv('./corpus_output/cleaned_words.csv', sep='\t', index=False)

100%|██████████| 5431985/5431985 [03:12<00:00, 28158.78it/s]


In [3]:
import math
import pandas as pd
from tqdm import tqdm

# Load the de-duplicated frequency table.
# - dtype={'word': str} stops pandas from type-inferring digit-only tokens as
#   numbers.
# - keep_default_na=False stops real tokens such as "NA", "null" or "nan"
#   from being parsed as missing values and later written out as "nan".
df = pd.read_csv(
    './corpus_output/cleaned_words.csv',
    sep='\t',
    dtype={'word': str},
    keep_default_na=False,
)

# Keep words seen at least 5 times and shorter than 26 characters.
df = df[(df['count'] >= 5) & (df['word'].str.len() < 26)]

# Write each word ceil(ln(count / 100)) times, one word per line.
# NOTE(review): the original comment said "count / 10" but the code divides
# by 100; the code's behaviour is kept. As a consequence, words with
# count <= 100 get a repeat count <= 0 and are not written at all.
with open('corpus_words.txt', 'w', encoding='utf-8') as f:
    # Column-wise iteration is much faster than DataFrame.iterrows().
    for word, count in tqdm(zip(df['word'], df['count']), total=df.shape[0]):
        repeats = math.ceil(math.log(count / 100))
        if repeats > 0:
            f.write(f'{word}\n' * repeats)

100%|██████████| 5303273/5303273 [02:44<00:00, 32165.15it/s]
