In [1]:
from tqdm import tqdm
import os
import lzma

In [2]:
def xz_files_in_dir(directory):
    """Return the names of the regular ``.xz`` files directly inside ``directory``.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        list[str]: Bare filenames (no directory prefix) ending in ``.xz``,
        sorted lexicographically. Sub-directories are skipped even when
        their name ends in ``.xz``.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    with os.scandir(directory) as entries:
        # The directory listing order is arbitrary; sorting keeps any
        # positional split built from this list reproducible between runs.
        return sorted(
            entry.name
            for entry in entries
            if entry.name.endswith(".xz") and entry.is_file()
        )

# Input: directory that holds the compressed .xz shards.
folder_path = "openwebtext"

# Outputs: the character vocabulary and the two concatenated text splits.
vocab_file = "vocab.txt"
output_file_train = "output_train.txt"
output_file_val = "output_val.txt"

In [3]:
# Split the shard list 90/10 (by position) into training and validation sets.
files = xz_files_in_dir(folder_path)

total_files = len(files)

split_index = int(total_files * 0.9)
files_train = files[:split_index]
files_val = files[split_index:]

# Every distinct character seen in either split.
vocab = set()

# Same procedure for both splits: decompress each shard, append its text to
# the split's output file, and fold its characters into the vocabulary.
for split_output, split_filenames in (
    (output_file_train, files_train),
    (output_file_val, files_val),
):
    with open(split_output, "w", encoding="utf-8") as outfile:
        for filename in tqdm(split_filenames, total=len(split_filenames)):
            file_path = os.path.join(folder_path, filename)
            with lzma.open(file_path, "rt", encoding="utf-8") as infile:
                text = infile.read()
            outfile.write(text)
            # Updating a set from a str adds each character individually.
            vocab.update(text)

# Write one character per line; sorted so the vocabulary file is identical
# from run to run (set iteration order is not stable across interpreter runs).
with open(vocab_file, "w", encoding="utf-8") as vfile:
    for char in sorted(vocab):
        vfile.write(char + '\n')

100%|██████████| 775/775 [01:16<00:00, 10.07it/s]
100%|██████████| 87/87 [00:08<00:00,  9.77it/s]


In [4]:
def process_files_in_parallel(files, folder_path, output_file):
    """Run ``process_file`` over ``files`` in a process pool and merge the results.

    Args:
        files: Filenames to process; one task is created per entry.
        folder_path: Forwarded to every ``process_file`` call.
        output_file: Forwarded to every ``process_file`` call.

    Returns:
        set: Union of the iterables returned by the ``process_file`` calls.

    NOTE(review): ``process_file`` is not defined in the visible part of this
    notebook; it is called with a single 4-tuple
    ``(folder_path, filename, output_file, vocab)`` and is presumably expected
    to return the characters found in that file -- confirm against its
    definition.
    """
    # Imported locally so the function does not rely on another cell having
    # imported it first.
    import concurrent.futures

    vocab = set()
    # The tuple shape is kept exactly as process_file receives it today.
    # NOTE(review): arguments are pickled when sent to a worker process, so
    # the `vocab` element presumably reaches workers as a copy -- the merge
    # that matters is the `vocab.update` below.
    args = [(folder_path, filename, output_file, vocab) for filename in files]
    # max_workers is left at its default, which is the machine's processor
    # count (capped by the library where the platform requires it).
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for characters in tqdm(executor.map(process_file, args), total=len(files)):
            vocab.update(characters)
    return vocab

In [6]:
# Paths: input shard directory, the two split outputs, and the vocabulary file.
folder_path = "openwebtext"
output_file_train = "output_train.txt"
output_file_val = "output_val.txt"
vocab_file = "vocab.txt"

files = xz_files_in_dir(folder_path)
total_files = len(files)

# The first 90% of the listing is used for training, the remainder for validation.
split_index = int(total_files * 0.9)
files_train, files_val = files[:split_index], files[split_index:]

# Opening in write mode and closing straight away leaves each output file empty.
for stale_output in (output_file_train, output_file_val):
    open(stale_output, 'w').close()

# Each call returns the set of characters collected for that split.
vocab_train = process_files_in_parallel(files_train, folder_path, output_file_train)
vocab_val = process_files_in_parallel(files_val, folder_path, output_file_val)

# Merge both vocabularies and store them one character per line, in sorted order.
vocab = vocab_train | vocab_val
with open(vocab_file, "w", encoding="utf-8") as vfile:
    vfile.writelines(char + '\n' for char in sorted(vocab))

0it [00:00, ?it/s]
0it [00:00, ?it/s]
