# In [None]:
def preprocess_file(file_path, preprocessed_file):
    """Tokenize a text file line by line and write the result to a new file.

    Each input line is passed to ``preprocess``; the returned tokens are
    joined with single spaces and written as one output line.

    Args:
        file_path: Path of the UTF-8 text file to read.
        preprocessed_file: Path of the UTF-8 file to (over)write.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        with open(preprocessed_file, "w", encoding="utf-8") as out:
            # A file object has no len(); count the lines in a first pass and
            # rewind so the progress bar still gets an accurate total.
            line_count = sum(1 for _ in f)
            f.seek(0)
            for line in tqdm(f, desc="Preprocessing", total=line_count):
                tokens = preprocess(line)
                out.write(" ".join(tokens) + "\n")

# In [None]:
import gensim
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Presumably the fraction of the full corpus that was sampled (confirm);
# in the visible code it is only used to build the sample file's name.
sample_percentage = 0.01
# Input file to preprocess; resolves to "sample_0.01.txt" for the value above.
sample_file = f"sample_{sample_percentage}.txt"
# Output file that receives one space-joined token line per input line.
preprocessed_file = "preprocessed_corpus.txt"
# Number of input lines collected before a batch is sent to the worker pool.
chunk_size = 1000

def preprocess(line):
    """Tokenize one line of text with ``gensim.utils.simple_preprocess``.

    Accents are kept (``deacc=False``) and only tokens whose length is
    between 2 and 15 characters are retained.

    Args:
        line: The raw text line to tokenize.

    Returns:
        The list of tokens produced by ``simple_preprocess``.
    """
    return gensim.utils.simple_preprocess(
        line,
        deacc=False,
        min_len=2,
        max_len=15,
    )

def parallel_preprocess(file_path, total_lines, output_file, pool_size=None, chunk_size=1000):
    """Tokenize a text file in batches using a pool of worker processes.

    Lines are read sequentially and collected into batches of ``chunk_size``.
    Each batch is tokenized with ``preprocess`` via ``Pool.map`` (which keeps
    input order), written as space-joined token lines, and flushed.

    Args:
        file_path: Path of the UTF-8 text file to read.
        total_lines: Line count used as the progress bar's total.
        output_file: Path of the UTF-8 file to (over)write.
        pool_size: Number of worker processes; defaults to ``cpu_count()``.
        chunk_size: Number of lines per batch handed to the pool.
    """
    if pool_size is None:
        pool_size = cpu_count()

    with Pool(processes=pool_size) as pool:
        with open(file_path, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:

            def drain(batch):
                # Tokenize the batch in the workers, then persist it right away.
                outfile.writelines(
                    " ".join(tokens) + "\n" for tokens in pool.map(preprocess, batch)
                )
                outfile.flush()

            pending = []
            progress = tqdm(infile, total=total_lines, desc="Preprocessing")
            for line_number, raw_line in enumerate(progress, start=1):
                # Lines are queued unmodified; tokenization happens in the workers.
                pending.append(raw_line)
                if line_number % chunk_size == 0:
                    drain(pending)
                    pending = []

            # Whatever is left over forms a final, shorter batch.
            if pending:
                drain(pending)

class TextCorpus:
    """Re-iterable, line-oriented view of a UTF-8 text file.

    The file is reopened on every call to ``len()`` or ``iter()``, so the
    same instance can be iterated multiple times.
    """

    def __init__(self, file_path):
        """Remember the path of the file to read; the file is not opened yet."""
        self.file_path = file_path

    def __len__(self):
        """Return the number of lines in the file (reads the whole file)."""
        with open(self.file_path, "r", encoding="utf-8") as f:
            return sum(1 for _ in f)

    def __iter__(self):
        """Yield each line of the file as read, wrapped in a progress bar."""
        # Size the bar from this corpus' own file rather than from a module
        # global, which may be undefined or describe a different file.
        line_count = len(self)
        with open(self.file_path, "r", encoding="utf-8") as f:
            for line in tqdm(f, total=line_count, desc="Training"):
                yield line

# Count the lines of the sample file up front; the count is passed to
# parallel_preprocess as the total for its progress bar.
total_lines = len(TextCorpus(sample_file))

# Tokenize the sample file in parallel with cpu_count() worker processes,
# writing the space-joined tokens to preprocessed_file.
parallel_preprocess(sample_file, total_lines, preprocessed_file, pool_size=cpu_count(), chunk_size=chunk_size)
