In [2]:
import collections
import re
import csv
import pandas as pd
from tqdm import tqdm

# A "word" is a maximal run of Unicode letters (no digits, no underscores)
# delimited by word boundaries. Compiled once so it is not looked up per row.
_WORD_PATTERN = re.compile(r'\b[^\W\d_]{1,}\b')


def process_chunk(chunk, word_counts):
    """Add the word counts of one DataFrame chunk to ``word_counts``.

    Only rows whose ``contains_kaz_symbols`` column equals 1 are considered.
    Each remaining ``text`` value is lower-cased and split into letter-only
    words, which are tallied into ``word_counts`` in place.

    Args:
        chunk: DataFrame with ``text`` and ``contains_kaz_symbols`` columns.
        word_counts: ``collections.Counter`` updated in place.

    Returns:
        None. The result is accumulated in ``word_counts``.

    Raises:
        KeyError: if either required column is missing from ``chunk``.
    """
    selected = chunk.loc[chunk['contains_kaz_symbols'] == 1, 'text']
    # Missing cells would otherwise be stringified to "nan"/"None" and
    # counted as real words, so drop them before tokenizing.
    for text in selected.dropna():
        word_counts.update(_WORD_PATTERN.findall(str(text).lower()))

def get_unigram_frequencies(in_file, out_file, chunk_size=1000, min_count=10,
                            out_encoding='utf-8'):
    """Count word frequencies in a CSV file and write them to another CSV.

    The input is read with pandas in chunks of ``chunk_size`` rows and each
    chunk is passed to ``process_chunk``, which accumulates the counts. The
    output has a ``word,count`` header followed by one row per word, ordered
    by descending count.

    Args:
        in_file: Path of the input CSV file.
        out_file: Path of the CSV file to write (overwritten if it exists).
        chunk_size: Number of rows read from ``in_file`` per chunk.
        min_count: Words occurring fewer than this many times are omitted.
        out_encoding: Text encoding of ``out_file``. Set explicitly so that
            writing non-ASCII words does not depend on the platform's
            default locale encoding.

    Returns:
        None. The result is written to ``out_file``.
    """
    word_counts = collections.Counter()

    # Process the CSV file in chunks
    print('Processing CSV file in chunks...')
    for chunk in tqdm(pd.read_csv(in_file, chunksize=chunk_size)):
        process_chunk(chunk, word_counts)

    # Sort the words by frequency (descending; ties keep first-seen order)
    print('Sorting words by frequency...')
    sorted_words = word_counts.most_common()

    # Write the word frequencies to the output file
    print('Writing to output file...')
    with open(out_file, 'w', newline='', encoding=out_encoding) as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['word', 'count'])  # Write header
        for word, count in sorted_words:
            if count < min_count:
                # Counts are in descending order, so every remaining word
                # is below the threshold as well.
                break
            writer.writerow([word, count])

if __name__ == '__main__':
    # Script entry point: build the unigram frequency table for the
    # Kazakh books corpus.
    get_unigram_frequencies(
        in_file='kazakhBooks.csv',
        out_file='output_unigrams.csv',
        chunk_size=1000,
    )

Processing CSV file in chunks...


9it [02:19, 15.47s/it]


Sorting words by frequency...
Writing to output file...
