In [1]:
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

# Make sure NLTK's sentence and word tokenizers are downloaded
nltk.download('punkt')

# Function to read the content of a text file
def read_file(file_path):
    """Return the entire contents of a text file as one string.

    Args:
        file_path: Path of the file to open; its bytes are decoded as UTF-8.

    Returns:
        str: Everything read from the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Function to split text into sentence-based chunks with word count limit and overlap
def shard_text_by_sentences(text, max_words_per_chunk=1024, overlap_sentences=10):
    """Split text into chunks of whole sentences under a word budget, with overlap.

    Sentences come from ``sent_tokenize`` and are sized with ``word_tokenize``.
    A chunk is closed as soon as the next sentence would push it past
    ``max_words_per_chunk``. The following chunk starts with up to
    ``overlap_sentences`` of the sentences immediately preceding the one that
    triggered the split, followed by that sentence itself.

    Args:
        text: Raw text to shard.
        max_words_per_chunk: Word budget per chunk, counted in
            ``word_tokenize`` tokens.
        overlap_sentences: Maximum number of preceding sentences repeated at
            the start of the next chunk. Values <= 0 disable the overlap.

    Returns:
        list[list[str]]: Chunks in document order, each a non-empty list of
        sentences. A chunk exceeds the budget only when it is a single
        sentence that is itself longer than ``max_words_per_chunk``.
    """
    sentences = sent_tokenize(text)
    # Tokenise every sentence once; the counts are reused to size the overlap.
    word_counts = [len(word_tokenize(sentence)) for sentence in sentences]
    overlap_sentences = max(0, overlap_sentences)

    chunks = []
    current_chunk = []
    current_word_count = 0

    for i, sentence in enumerate(sentences):
        sentence_word_count = word_counts[i]

        # Only close a chunk that has content, so an oversized first sentence
        # never produces an empty shard.
        if current_chunk and current_word_count + sentence_word_count > max_words_per_chunk:
            chunks.append(current_chunk)

            # The overlap is taken strictly from sentences *before* i. Looking
            # ahead would duplicate sentences the loop is about to append.
            overlap_start = max(0, i - overlap_sentences)
            overlap_word_count = sum(word_counts[overlap_start:i])

            # Drop the oldest overlap sentences until the new sentence fits,
            # so the overlap never pushes the fresh chunk over the budget.
            while overlap_start < i and overlap_word_count + sentence_word_count > max_words_per_chunk:
                overlap_word_count -= word_counts[overlap_start]
                overlap_start += 1

            current_chunk = sentences[overlap_start:i]
            current_word_count = overlap_word_count

        # The current sentence always lands in exactly one "new" position.
        current_chunk.append(sentence)
        current_word_count += sentence_word_count

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Function to save each chunk into a separate text file
def save_shard(chunk, output_dir, file_name, shard_index):
    """Write one shard to ``<output_dir>/<file_name>-<shard_index>.txt``.

    Args:
        chunk: Iterable of sentence strings; they are joined with single spaces.
        output_dir: Directory that receives the shard file.
        file_name: Base name used as the shard file's prefix.
        shard_index: Position of the shard, appended to the file name.
    """
    destination = os.path.join(output_dir, f"{file_name}-{shard_index}.txt")

    # The target is opened (and truncated) first, then the joined text is written.
    with open(destination, mode='w', encoding='utf-8') as handle:
        shard_text = ' '.join(chunk)
        handle.write(shard_text)

# Function to process all text files in a directory
def process_directory(input_dir, output_dir, max_words_per_chunk=1024, overlap_sentences=10):
    """Shard every ``.txt`` entry of ``input_dir`` into files under ``output_dir``.

    ``output_dir`` is created when it does not exist. Each matching file is
    read with ``read_file``, split by ``shard_text_by_sentences`` and every
    resulting shard is handed to ``save_shard`` together with its index.
    Progress is shown through ``tqdm`` and a completion message is printed.

    Args:
        input_dir: Directory whose entries are listed with ``os.listdir``.
        output_dir: Directory that receives the shard files.
        max_words_per_chunk: Forwarded to ``shard_text_by_sentences``.
        overlap_sentences: Forwarded to ``shard_text_by_sentences``.
    """
    output_missing = not os.path.exists(output_dir)
    if output_missing:
        os.makedirs(output_dir)

    for entry in tqdm(os.listdir(input_dir)):
        # Entries without the .txt suffix are skipped.
        if not entry.endswith(".txt"):
            continue

        source_path = os.path.join(input_dir, entry)
        stem, _extension = os.path.splitext(entry)

        shards = shard_text_by_sentences(read_file(source_path), max_words_per_chunk, overlap_sentences)

        # One output file per shard, numbered from zero.
        for shard_number, shard in enumerate(shards):
            save_shard(shard, output_dir, stem, shard_number)

    print("All files processed.")

# Destination for the generated shards, and the folder of source text files
output_directory = '../../data/crawled/crawled_text_data_test'
input_directory = '../../data/crawled/crawled_text_data'

# Run the sentence-based sharding with the default chunk size and overlap
process_directory(input_dir=input_directory, output_dir=output_directory)

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 161/161 [00:14<00:00, 11.06it/s]

All files processed.





In [2]:
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

# Make sure NLTK's sentence and word tokenizers are downloaded
nltk.download('punkt')

# Function to read the content of a text file
def read_file(file_path):
    """Read a UTF-8 text file and hand back its full content.

    Args:
        file_path: Location of the file to read.

    Returns:
        str: The decoded text of the whole file.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text

# Function to split text into word-based chunks with overlap
def shard_text_by_words(text, max_words_per_chunk=1024, overlap_words=200):
    """Split text into overlapping, fixed-size windows of word tokens.

    The text is tokenised with ``word_tokenize`` and cut by a sliding window
    of ``max_words_per_chunk`` tokens that advances by
    ``max_words_per_chunk - overlap_words`` tokens. Consecutive chunks share
    exactly ``overlap_words`` tokens and no chunk is longer than
    ``max_words_per_chunk``.

    Args:
        text: Raw text to shard.
        max_words_per_chunk: Maximum number of tokens per chunk; must be > 0.
        overlap_words: Number of tokens shared by neighbouring chunks; must
            satisfy ``0 <= overlap_words < max_words_per_chunk``.

    Returns:
        list[list[str]]: Token lists in document order; empty when the text
        yields no tokens.

    Raises:
        ValueError: If the size/overlap combination cannot advance the window.
    """
    if max_words_per_chunk <= 0:
        raise ValueError("max_words_per_chunk must be a positive integer")
    if not 0 <= overlap_words < max_words_per_chunk:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError("overlap_words must be >= 0 and smaller than max_words_per_chunk")

    words = word_tokenize(text)
    step_size = max_words_per_chunk - overlap_words
    chunks = []

    for start in range(0, len(words), step_size):
        end = start + max_words_per_chunk
        chunks.append(words[start:end])

        # Once a window reaches the end of the text, any further window would
        # be fully contained in this one, so stop here.
        if end >= len(words):
            break

    return chunks

# Function to save each chunk into a separate text file
def save_shard(chunk, output_dir, file_name, shard_index):
    """Persist a shard as ``<file_name>-<shard_index>.txt`` inside ``output_dir``.

    Args:
        chunk: Iterable of word strings, written separated by single spaces.
        output_dir: Directory in which the shard file is created.
        file_name: Prefix of the shard file name.
        shard_index: Number appended after the prefix.
    """
    shard_path = os.path.join(output_dir, f"{file_name}-{shard_index}.txt")

    # Open for writing first (this truncates an existing file), then emit the text.
    with open(shard_path, mode='w', encoding='utf-8') as sink:
        joined_words = ' '.join(chunk)
        sink.write(joined_words)

# Function to process all text files in a directory
def process_directory(input_dir, output_dir, max_words_per_chunk=1024, overlap_words=200):
    """Cut every ``.txt`` entry of ``input_dir`` into word-based shards.

    The output directory is created if absent. Each matching file is loaded
    via ``read_file``, split with ``shard_text_by_words`` and each shard is
    written through ``save_shard`` under the source file's base name.

    Args:
        input_dir: Directory whose entries are listed with ``os.listdir``.
        output_dir: Directory that receives the shard files.
        max_words_per_chunk: Forwarded to ``shard_text_by_words``.
        overlap_words: Forwarded to ``shard_text_by_words``.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for candidate in tqdm(os.listdir(input_dir)):
        is_text_file = candidate.endswith(".txt")
        if not is_text_file:
            continue

        candidate_path = os.path.join(input_dir, candidate)
        base_name = os.path.splitext(candidate)[0]

        content = read_file(candidate_path)
        word_shards = shard_text_by_words(content, max_words_per_chunk, overlap_words)

        # Shards are numbered in the order the splitter returned them.
        for position, word_shard in enumerate(word_shards):
            save_shard(word_shard, output_dir, base_name, position)

    print("All files processed.")

# Destination for the generated shards, and the folder of source text files.
# NOTE(review): this is the same output directory the sentence-based cell (In [1])
# writes to; shard files with matching names are overwritten by this run.
output_directory = '../../data/crawled/crawled_text_data_test'
input_directory = '../../data/crawled/crawled_text_data'

# Run the word-based sharding with the default chunk size and overlap
process_directory(input_dir=input_directory, output_dir=output_directory)

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 161/161 [00:02<00:00, 61.56it/s]

All files processed.





In [5]:
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

# Make sure NLTK's sentence and word tokenizers are downloaded
nltk.download('punkt')

# Function to read the content of a text file
def read_file(file_path):
    """Load a text file in one go.

    Args:
        file_path: Path to the file; it is opened read-only and decoded as UTF-8.

    Returns:
        str: The complete file content.
    """
    with open(file_path, mode='r', encoding='utf-8') as reader:
        whole_text = reader.read()
    return whole_text

# Function to split text into sentence-based chunks with word-based overlap
def shard_text_by_sentences(text, max_words_per_chunk=1024, overlap_words=200):
    """Split text at sentence boundaries into word chunks with a word overlap.

    Sentences come from ``sent_tokenize`` and are tokenised with
    ``word_tokenize``. Tokens are accumulated sentence by sentence. When the
    next sentence would push the chunk past ``max_words_per_chunk`` the chunk
    is closed, and the new chunk starts with the trailing tokens of the closed
    one followed by the tokens of that sentence.

    The carried-over tail is at most ``overlap_words`` tokens and is shortened
    when necessary so that tail + sentence still fits the budget.

    Args:
        text: Raw text to shard.
        max_words_per_chunk: Token budget per chunk.
        overlap_words: Maximum number of tokens repeated from the end of the
            previous chunk. Values <= 0 disable the overlap.

    Returns:
        list[list[str]]: Non-empty token lists in document order. A chunk is
        longer than ``max_words_per_chunk`` only when it holds a single
        sentence that is itself longer than the budget.
    """
    chunks = []
    current_chunk = []

    for sentence in sent_tokenize(text):
        sentence_words = word_tokenize(sentence)

        # Only close a chunk that has content, so an oversized first sentence
        # never produces an empty shard.
        if current_chunk and len(current_chunk) + len(sentence_words) > max_words_per_chunk:
            chunks.append(current_chunk)

            # Cap the overlap by the room the incoming sentence leaves. Because
            # the chunk just overflowed, this is always fewer tokens than the
            # closed chunk holds, so a chunk is never repeated wholesale.
            room = max_words_per_chunk - len(sentence_words)
            carry = max(0, min(overlap_words, room))

            # Slice from an explicit start index: ``chunk[-0:]`` would copy the
            # whole chunk instead of nothing when the overlap is zero.
            current_chunk = current_chunk[len(current_chunk) - carry:]

        current_chunk.extend(sentence_words)

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Function to save each chunk into a separate text file
def save_shard(chunk, output_dir, file_name, shard_index):
    """Store one shard on disk as ``<file_name>-<shard_index>.txt``.

    Args:
        chunk: Iterable of word strings; joined with single spaces before writing.
        output_dir: Folder the shard file is placed in.
        file_name: Leading part of the shard file name.
        shard_index: Index that follows the dash in the file name.
    """
    shard_file = os.path.join(output_dir, f"{file_name}-{shard_index}.txt")

    # Existing files with the same name are replaced ('w' mode).
    with open(shard_file, mode='w', encoding='utf-8') as out:
        payload = ' '.join(chunk)
        out.write(payload)

# Function to process all text files in a directory
def process_directory(input_dir, output_dir, max_words_per_chunk=1024, overlap_words=200):
    """Shard all ``.txt`` entries of ``input_dir`` at sentence boundaries.

    Creates ``output_dir`` when needed, then for each matching entry reads the
    text with ``read_file``, splits it with ``shard_text_by_sentences`` and
    writes every shard through ``save_shard``. A ``tqdm`` bar tracks the
    directory listing and a final message is printed.

    Args:
        input_dir: Directory whose entries are listed with ``os.listdir``.
        output_dir: Directory that receives the shard files.
        max_words_per_chunk: Forwarded to ``shard_text_by_sentences``.
        overlap_words: Forwarded to ``shard_text_by_sentences``.
    """
    target_exists = os.path.exists(output_dir)
    if not target_exists:
        os.makedirs(output_dir)

    for listed_name in tqdm(os.listdir(input_dir)):
        # Only names carrying the .txt suffix are processed.
        if not listed_name.endswith(".txt"):
            continue

        listed_path = os.path.join(input_dir, listed_name)
        name_root = os.path.splitext(listed_name)[0]

        raw_text = read_file(listed_path)
        pieces = shard_text_by_sentences(raw_text, max_words_per_chunk, overlap_words)

        for piece_index, piece in enumerate(pieces):
            save_shard(piece, output_dir, name_root, piece_index)

    print("All files processed.")

# Destination for the generated shards, and the folder of source text files
output_directory = '../../data/crawled/crawled_text_data_50_sentence'
input_directory = '../../data/crawled/crawled_text_data'

# Run the sentence-boundary sharding with the default chunk size and word overlap
process_directory(input_dir=input_directory, output_dir=output_directory)

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 161/161 [00:03<00:00, 50.10it/s]

All files processed.





In [6]:
import os
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Make sure NLTK's word tokenizer is downloaded
import nltk
nltk.download('punkt')

# Function to read the content of a text file
def read_file(file_path):
    """Fetch the text stored in a file.

    Args:
        file_path: Path of the file to read; decoded as UTF-8.

    Returns:
        str: The file's full content.
    """
    with open(file_path, mode='r', encoding='utf-8') as stream:
        data = stream.read()
    return data

# Function to split a large chunk into smaller chunks if it exceeds 1024 words
def split_chunk_by_words(text, max_words_per_chunk=1024):
    """Cut text into consecutive, non-overlapping runs of word tokens.

    Args:
        text: Text to tokenise with ``word_tokenize``.
        max_words_per_chunk: Number of tokens per run; the last run may be shorter.

    Returns:
        list[list[str]]: Token runs in order; empty when there are no tokens.
    """
    tokens = word_tokenize(text)
    # Each window starts where the previous one ended.
    window_starts = range(0, len(tokens), max_words_per_chunk)
    return [tokens[begin:begin + max_words_per_chunk] for begin in window_starts]

# Function to save the new split chunks into separate text files
def save_shard(chunk, output_dir, file_name, shard_index):
    """Write a re-split chunk to ``<output_dir>/<file_name>-<shard_index>.txt``.

    Args:
        chunk: Iterable of word strings, written separated by single spaces.
        output_dir: Directory the file is written into.
        file_name: Prefix for the output file name.
        shard_index: Number placed after the dash in the file name.
    """
    out_path = os.path.join(output_dir, f"{file_name}-{shard_index}.txt")

    # 'w' mode replaces any file already present at that path.
    with open(out_path, mode='w', encoding='utf-8') as writer:
        line = ' '.join(chunk)
        writer.write(line)

# Function to process the directory of chunks and further split chunks larger than 1024 words
def process_chunk_directory(input_dir, output_dir, max_words_per_chunk=1024):
    """Re-split every ``.txt`` chunk file of ``input_dir`` into bounded pieces.

    ``output_dir`` is created when missing. Each matching file is read with
    ``read_file`` and passed through ``split_chunk_by_words``; every resulting
    piece is written by ``save_shard`` under the source file's base name plus
    the piece index. Progress is shown with ``tqdm`` and a completion message
    is printed at the end.

    Args:
        input_dir: Directory whose entries are listed with ``os.listdir``.
        output_dir: Directory that receives the re-split files.
        max_words_per_chunk: Forwarded to ``split_chunk_by_words``.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for chunk_file in tqdm(os.listdir(input_dir)):
        # Leave anything that is not a .txt entry alone.
        if not chunk_file.endswith(".txt"):
            continue

        chunk_path = os.path.join(input_dir, chunk_file)
        chunk_stem = os.path.splitext(chunk_file)[0]

        # Files are always re-tokenised and re-written, whether or not the
        # splitter returns more than one piece.
        pieces = split_chunk_by_words(read_file(chunk_path), max_words_per_chunk)

        for piece_number, piece in enumerate(pieces):
            save_shard(piece, output_dir, chunk_stem, piece_number)

    print("All files processed.")

# Destination for the re-split chunks, and the folder of existing chunk files.
# NOTE(review): no cell visible in this notebook writes to the input directory
# below; presumably it was produced by an earlier run — confirm before re-running.
output_directory = '../../data/crawled/crawled_text_data_max_1024'
input_directory = '../../data/crawled/crawled_text_data_sentence_50_5'

# Re-split every chunk file using the default word limit
process_chunk_directory(input_dir=input_directory, output_dir=output_directory)

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  0%|          | 0/688 [00:00<?, ?it/s]

100%|██████████| 688/688 [00:03<00:00, 223.24it/s]

All files processed.



