In [5]:
import os
import json
import re
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import spacy




In [6]:
# Load the spaCy pipeline named "en_core_web_sm" once and keep it in `nlp`.
# NOTE(review): `nlp` is not referenced by the cells visible here — confirm a
# later cell actually uses it before keeping this load.
nlp = spacy.load("en_core_web_sm")

In [11]:


# Root directory that is walked recursively for .txt files to chunk
dataset_path = '../data/data_txt1'
# Text file read by load_links; its lines containing '.pdf:' map a file name to a link.
# NOTE(review): this path is under 'data_txt' while dataset_path points at
# 'data_txt1' — confirm the two different directories are intentional.
dataset_links_path = '../data/data_txt/_dataset_links/dataset_links.txt'

# Accumulators filled by process_text_file: one dict per chunk, and the number
# of text files that produced chunks. Reset here so re-running the cell starts clean.
all_chunks = []
text_file_count = 0

# Function to clean file names
def clean_name(name):
    """Return ``name`` normalized for use as a lookup key.

    Two substitutions are applied in order, then surrounding whitespace is
    stripped:

    1. every parenthesized group, together with any whitespace directly
       before it, is removed;
    2. a trailing ``.txt`` extension (any letter case) is removed.
    """
    # (pattern, regex flags) pairs, applied top to bottom.
    substitutions = (
        (r'\s*\([^)]*\)', 0),
        (r'\.txt$', re.IGNORECASE),
    )
    cleaned = name
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned.strip()

# Load dataset links into a dictionary
def load_links(file_path):
    """Parse the dataset-links file into a ``{cleaned file name: link}`` dict.

    Only lines containing the marker ``.pdf:`` are used; all other lines are
    ignored. The text before the first marker is the file name (normalized
    with ``clean_name``), and everything after it is the link.

    Args:
        file_path: Path to the links text file.

    Returns:
        dict mapping cleaned file names to their link strings. If two lines
        clean to the same name, the later line wins.
    """
    links_dict = {}
    # Explicit UTF-8 so parsing does not depend on the platform's default
    # encoding (the text files themselves are also read as UTF-8).
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # partition splits on the FIRST marker only, so a link that itself
            # contains '.pdf:' no longer breaks a two-value unpack.
            file_name, marker, link = line.partition('.pdf:')
            if not marker:
                continue
            links_dict[clean_name(file_name.strip())] = link.strip()
    return links_dict

# Build the {cleaned file name -> link} lookup that process_text_file reads
# through the module-level name `file_links`.
file_links = load_links(dataset_links_path)

# print(file_links)

# Function to process text files
def process_text_file(file_path, folder_name):
    """Chunk one text file and append its chunk records to ``all_chunks``.

    The file is read as UTF-8 and split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=700, chunk_overlap=50).
    Each chunk becomes a dict with the keys ``folder_name``, ``file_name``,
    ``chunk_id`` (1-based), ``uri`` and ``content``. The module-level
    ``text_file_count`` is incremented once per file that produced records.

    The function prints a warning and returns without recording anything when
    the file is empty or whitespace-only, when splitting yields no chunks, or
    when ``file_links`` has no entry for the cleaned file name. Any exception
    is caught and reported with ``print`` so one bad file does not stop the
    caller's loop.

    Args:
        file_path: Path of the text file to read.
        folder_name: Name of the containing folder; stored (after cleaning)
            in every chunk record.
    """
    global text_file_count
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        # Nothing to split if the file is empty or whitespace-only.
        if not raw_text.strip():
            print(f"Warning: No content found in {file_path}")
            return

        splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
        pieces = splitter.split_text(raw_text)
        if not pieces:
            print(f"Warning: No chunks created from {file_path}")
            return

        # Normalized names: used both as metadata and as the link lookup key.
        clean_folder_name = clean_name(folder_name)
        clean_file_name = clean_name(os.path.basename(file_path))

        # Files without a known link are skipped entirely.
        file_uri = file_links.get(clean_file_name)
        if file_uri is None:
            print(f"Warning: No link found for {clean_file_name}")
            return

        all_chunks.extend(
            {
                'folder_name': clean_folder_name,
                'file_name': clean_file_name,
                'chunk_id': position,
                'uri': file_uri,  # link carried along as metadata
                'content': piece,
            }
            for position, piece in enumerate(pieces, start=1)
        )
        text_file_count += 1
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

# # Process all text files in the dataset path
# for file_name in os.listdir(dataset_path):
#     if file_name.endswith('.txt'):
#         file_path = os.path.join(dataset_path, file_name)
#         folder_name = os.path.basename(root)
#         process_text_file(file_path, folder_name)

# Walk the dataset directory tree and chunk every text file found in it.
for root, dirs, files in os.walk(dataset_path):
    # Sort in place so the traversal order — and therefore the order of
    # records in the output JSON — is the same on every run and filesystem.
    dirs.sort()
    for file_name in sorted(files):
        # Case-insensitive match, consistent with clean_name, which strips the
        # '.txt' suffix regardless of letter case.
        if file_name.lower().endswith('.txt'):
            file_path = os.path.join(root, file_name)
            folder_name = os.path.basename(root)  # Get the folder name for indexing
            process_text_file(file_path, folder_name)

# Check if there are any chunks to save
if all_chunks:
    # Save all chunks to a JSON file
    output_path = '../data/data_json/chunked_data_all_text_files_with_links.json'
    # Create the output directory if needed; otherwise open(..., 'w') raises
    # FileNotFoundError after all the chunking work has already been done.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(all_chunks, json_file, indent=4, ensure_ascii=False)
    print(f"All text file chunks have been saved to {output_path}")
else:
    print("No chunks were created. Please check the input files.")

# Print the number of text files processed
print(f"Number of text files processed: {text_file_count}")


All text file chunks have been saved to chunked_data_all_text_files_with_links.json
Number of text files processed: 189
