In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The dataset path used later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pypdf


Collecting pypdf
  Downloading pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-5.0.1-py3-none-any.whl (294 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/294.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m286.7/294.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.5/294.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.0.1


In [None]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.4.0,>=0.3.4 (from langchain-community)
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.12 (from langchain-community)
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-community)
  Downloading langsmith-0.1.137-py3-none-any.whl.metadata (13 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.0-py3-none-any.whl.metadata (7.6 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [None]:
import os
import json
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
import os
import json
import re

# Root directory of the PDF dataset on the mounted Google Drive
dataset_path = '/content/drive/MyDrive/Policy_Bot_Data/dataset'

# Running totals reported at the end of the cell:
#   folder_count - directories visited that contained at least one file
#   pdf_count    - PDFs that were loaded and chunked without error
folder_count, pdf_count = 0, 0

# Accumulator for the chunk records produced from every processed PDF
all_chunks = list()

# Function to clean folder and file names
def clean_name(name):
    """Return ``name`` without parenthesised segments or a trailing ``.pdf``.

    Every ``(...)`` group is dropped together with the whitespace directly in
    front of it, then a final ``.pdf`` suffix (any letter case) is removed and
    the result is stripped of leading/trailing whitespace.
    """
    without_parens = re.sub(r'\s*\([^)]*\)', '', name)
    without_extension = re.sub(r'\.pdf$', '', without_parens, flags=re.IGNORECASE)
    return without_extension.strip()

# Function to process PDF
def process_pdf(pdf_path, folder_name, chunk_size=700, chunk_overlap=50):
    """Load one PDF, split it into text chunks and append them to ``all_chunks``.

    Each chunk is recorded as a dict holding the cleaned folder name, the
    cleaned file name, a 1-based ``chunk_id`` and the chunk text. When the
    file yields at least one chunk, the module-level ``pdf_count`` is
    incremented. Any exception raised while loading or splitting is caught
    and reported with ``print`` so that one bad file does not abort a batch.

    Args:
        pdf_path: Path of the PDF file to load.
        folder_name: Name of the folder containing the PDF; it is passed
            through ``clean_name`` before being stored.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``
            (defaults to the previously hard-coded 700).
        chunk_overlap: ``chunk_overlap`` handed to the splitter (defaults to
            the previously hard-coded 50).

    Returns:
        None. Results are accumulated in the module-level ``all_chunks`` list.
    """
    global pdf_count
    try:
        # Load the PDF into LangChain documents
        loader = PyPDFLoader(file_path=pdf_path)
        docs_before_split = loader.load()

        # Nothing could be read from the file: warn and skip it
        if len(docs_before_split) == 0:
            print(f"Warning: No content found in {pdf_path}")
            return

        # Splitter settings come from the arguments instead of magic numbers
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        # Split the loaded documents into chunks
        docs_after_split = text_splitter.split_documents(docs_before_split)

        # Loading succeeded but splitting produced nothing: warn and skip
        if len(docs_after_split) == 0:
            print(f"Warning: No chunks created from {pdf_path}")
            return

        # Clean the folder and file names once; they are shared by all chunks
        clean_folder_name = clean_name(folder_name)
        clean_file_name = clean_name(os.path.basename(pdf_path))

        # One record per chunk; chunk_id starts at 1 within each PDF
        all_chunks.extend(
            {
                'folder_name': clean_folder_name,
                'file_name': clean_file_name,
                'chunk_id': chunk_id,
                'content': doc.page_content,
            }
            for chunk_id, doc in enumerate(docs_after_split, start=1)
        )
        pdf_count += 1
        print(f"Processed {pdf_path}, created {len(docs_after_split)} chunks.")
    except Exception as e:
        # Deliberate best-effort: report the failure and continue with the next file
        print(f"Error processing {pdf_path}: {e}")

# Walk through the main dataset directory and process all PDF files
for root, dirs, files in os.walk(dataset_path):
    # Sorting `dirs` in place makes os.walk descend into sub-folders in a
    # stable order, so the chunk order in the output is the same on every run.
    dirs.sort()
    if files:
        folder_count += 1  # Count only directories that contain at least one file
    folder_name = os.path.basename(root)  # Folder name stored with every chunk
    for file_name in sorted(files):
        # Case-insensitive extension check so '.PDF' files are not skipped;
        # clean_name strips the extension case-insensitively as well.
        if file_name.lower().endswith('.pdf'):
            pdf_path = os.path.join(root, file_name)
            process_pdf(pdf_path, folder_name)

# Check if there are any chunks to save
if all_chunks:
    # Save all chunks to a JSON file (explicit encoding avoids depending on
    # the platform's default text encoding)
    output_path = '/content/chunked_data_all_folders_cleaned.json'
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(all_chunks, json_file, indent=4)
    print(f"All PDF chunks have been saved to {output_path}")
else:
    print("No chunks were created. Please check the input files.")

# Print the number of folders traversed and the number of PDFs processed
print(f"Number of folders traversed: {folder_count}")
print(f"Number of PDFs processed: {pdf_count}")


Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-01 Wellness Policy .pdf, created 220 chunks.
Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-02 Phys Ed & Physical Activity.pdf, created 45 chunks.
Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-03 Comprehensive Health Ed.pdf, created 27 chunks.
Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-04 Healthy School Environment Policy.pdf, created 19 chunks.
Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Health & Wellness (HWD)/HWD-06 Tobacco-Nicotine Policy.pdf, created 35 chunks.
Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-01 Exam School Application and Admissions.pdf, created 38 chunks.
Processed /content/drive/MyDrive/Policy_Bot_Data/dataset/Enrollment Planning and Support (AMT)/AMT-03 DYS Committed Students.pdf, created 26 chunks