In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib  import Path



In [4]:
def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory into a single list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader`` and every document it returns is tagged with two
    extra metadata keys: ``source_file`` (the PDF's file name) and
    ``file_type`` (always ``'pdf'``).

    Files are processed in sorted path order so repeated runs yield the
    documents in the same order regardless of how the filesystem lists them.
    A file that fails to load is reported and skipped; it does not stop the
    remaining files from being processed.

    Args:
        pdf_directory: Path (``str`` or ``Path``) of the directory to search.

    Returns:
        A list of the documents loaded from all readable PDFs. The list is
        empty when the directory is missing or contains no PDFs.
    """
    pdf_dir = Path(pdf_directory)
    if pdf_dir.is_dir():
        # Sorted so the document order (and everything derived from it) is reproducible.
        pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    else:
        # A mistyped relative path would otherwise just look like "0 documents".
        print(f"Warning: {pdf_dir} is not an existing directory; no PDFs to load")
        pdf_files = []

    all_documents = []
    for pdf_file in pdf_files:
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            all_documents.extend(documents)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Error processing {pdf_file}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents


# Load every PDF under ../data (resolved relative to the current working directory).
all_pdf_documents = process_all_pdfs("../data")



Total documents loaded: 75


In [6]:
# Text splitting: break the loaded documents into smaller chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Break documents into smaller chunks for better RAG performance.

    A ``RecursiveCharacterTextSplitter`` is built from the given sizes, using
    ``len`` as the length function and the separators ``"\\n\\n"``, ``"\\n"``,
    ``" "`` and ``""``. When at least one chunk is produced, a short preview
    of the first chunk (first 200 characters plus its metadata) is printed.

    Args:
        documents: Documents to split; handed to the splitter's
            ``split_documents`` method.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks returned by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(documents)

    # Nothing to preview when the splitter produced no chunks.
    if not pieces:
        return pieces

    first = pieces[0]
    print(f"\nExample chunk:")
    print(f"Content: {first.page_content[:200]}...")
    print(f"Metadata: {first.metadata}")

    return pieces
# Split the loaded documents into chunks.
chunks = split_documents(all_pdf_documents)
# Preview only the first few chunks; displaying the whole list floods the
# cell output with every chunk's full text and metadata.
chunks[:3]





Example chunk:
Content: Customer Segmentation Using Clustering
Algorithms on Online Retail Data
Dilhara Disanayaka
Department of Computer Science & Engineering
University of Moratuwa, Sri Lanka
Email: dilhara.22@cse.mrt.ac.l...
Metadata: {'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-25T18:16:09+00:00', 'author': '', 'keywords': '', 'moddate': '2025-08-25T18:16:09+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf_files\\220131A_Clusturing.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'source_file': '220131A_Clusturing.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-08-25T18:16:09+00:00', 'author': '', 'keywords': '', 'moddate': '2025-08-25T18:16:09+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf_files\\220131A_Clusturing.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'source_file': '220131A_Clusturing.pdf', 'file_type': 'pdf'}, page_content='Customer Segmentation Using Clustering\nAlgorithms on Online Retail Data\nDilhara Disanayaka\nDepartment of Computer Science & Engineering\nUniversity of Moratuwa, Sri Lanka\nEmail: dilhara.22@cse.mrt.ac.lk\nAbstract—This study explores customer segmentation in online\nshopping using clustering algorithms to be applied on purchase\nbehavior data. We experimented with three algorithms—K-\nmeans, hierarchical clustering, and DBSCAN—on RFM (Re-\ncency, Frequenc