In [4]:
import argparse
import os
import re
from tqdm import tqdm

In [12]:
from utils import strip_headers

In [1]:
def is_english(text, threshold=0.9):
    ascii_chars = sum(1 for c in text if ord(c) < 128)
    return ascii_chars / len(text) > threshold


In [28]:
data_dir = "/home/david/Documents/data_science/datasets/gutenberg/gutenberg_dataset"
output_dir = "/home/david/Documents/data_science/datasets/gutenberg/gutenberg_preprocessed"
max_size_mb = 500
separator="<|endoftext|>"
fallback_encoding="latin1"

In [5]:
all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(data_dir)
                 for name in files if name.endswith((".txt", ".txt.utf8"))]

In [6]:
all_files

['/home/david/Documents/data_science/datasets/gutenberg_dataset/1_The_Declaration_of_Independence_of_the_United_States_of_America/1-0.txt',
 '/home/david/Documents/data_science/datasets/gutenberg_dataset/10_The_King_James_Version_of_the_Bible/10-0.txt',
 "/home/david/Documents/data_science/datasets/gutenberg_dataset/4_Lincoln's_Gettysburg_Address\r\nGiven_November_19,_1863_on_the_battlefield_near_Gettysburg,_Pennsylvania,_USA/4.txt",
 "/home/david/Documents/data_science/datasets/gutenberg_dataset/8_Abraham_Lincoln's_Second_Inaugural_Address/8.txt",
 '/home/david/Documents/data_science/datasets/gutenberg_dataset/6_Give_Me_Liberty_or_Give_Me_Death/6.txt',
 '/home/david/Documents/data_science/datasets/gutenberg_dataset/2_The_United_States_Bill_of_Rights\r\nThe_Ten_Original_Amendments_to_the_Constitution_of_the_United_States/2.txt',
 '/home/david/Documents/data_science/datasets/gutenberg_dataset/5_The_United_States_Constitution/5.txt']

In [7]:
print(f"{len(all_files)} file(s) to process.")

7 file(s) to process.


In [11]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

current_content = []
current_size = 0
file_counter = 1

## Try one file

In [14]:
file_path = all_files[0]

with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

In [16]:
len(content)

118485

In [17]:
is_english(content)

True

In [18]:
content_stripped = strip_headers(content)

In [20]:
content_stripped[:30]

'\n\nThis is a retranscription of'

In [21]:
content[:30]

'\n*** START OF THE PROJECT GUTE'

In [22]:
# Regular expression to replace multiple blank lines with a single blank line
content = re.sub(r'\n\s*\n', '\n\n', content)
estimated_size = len(content.encode("utf-8"))

In [23]:
estimated_size

118353

In [24]:
current_size + estimated_size > max_size_mb * 1024 * 1024

False

In [25]:
target_file_path = os.path.join(output_dir, f"combined_{file_counter}.txt")
target_file_path

'/home/david/Documents/data_science/datasets/gutenberg_preprocessed/combined_1.txt'

# Pre-process all files

In [26]:
for file_path in tqdm(all_files):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
    except UnicodeDecodeError:
        # Attempt to read the file with a fallback encoding
        tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
        with open(file_path, "r", encoding=fallback_encoding) as file:
            content = file.read()

    if not is_english(content):
        tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
        continue
    content = strip_headers(content)

    # Regular expression to replace multiple blank lines with a single blank line
    content = re.sub(r'\n\s*\n', '\n\n', content)
    estimated_size = len(content.encode("utf-8"))

    if current_size + estimated_size > max_size_mb * 1024 * 1024:
        target_file_path = os.path.join(output_dir, f"combined_{file_counter}.txt")
        with open(target_file_path, "w", encoding="utf-8") as target_file:
            target_file.write(separator.join(current_content))
        file_counter += 1
        current_content = [content]
        current_size = estimated_size
    else:
        current_content.append(content)
        current_size += estimated_size

if current_content:
    target_file_path = os.path.join(output_dir, f"combined_{file_counter}.txt")
    with open(target_file_path, "w", encoding="utf-8") as target_file:
        target_file.write(separator.join(current_content))

100%|██████████| 7/7 [00:00<00:00, 24.91it/s]


In [29]:
# The resulting files are saved in:
output_dir

'/home/david/Documents/data_science/datasets/gutenberg/gutenberg_preprocessed'

In [33]:
!ls -lh '/home/david/Documents/data_science/datasets/gutenberg/gutenberg_preprocessed'


total 4.3M
-rw-rw-r-- 1 david david 4.3M Dec 16 19:50 combined_1.txt


In [34]:
print(f"{file_counter} file(s) saved in {os.path.abspath(output_dir)}")

1 file(s) saved in /home/david/Documents/data_science/datasets/gutenberg/gutenberg_preprocessed
