In [2]:
import os
import pandas as pd
from  dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The Groq credential is conventionally exported as GROQ_API_KEY. The previously
# used (misspelled) name GORQ_API_KEY is still honoured as a fallback so that
# existing .env files keep working. `api_key` is None when neither is set.
api_key = os.getenv("GROQ_API_KEY") or os.getenv("GORQ_API_KEY")

In [6]:
## setup LLM and Embedding models

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq

# Embedding model: a HuggingFace sentence-transformers checkpoint.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Chat LLM served through Groq, authenticated with the key read from the environment.
llm = ChatGroq(
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    groq_api_key=api_key,
    temperature=0.7,
)


In [None]:
import os
from pathlib import Path
from typing import List, Dict, Any
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

class ProcessAndChunkPDFs:
    """
    Extract text from PDF files and split it into metadata-tagged chunks.

    Typical flow: ``read_pdf`` -> ``chunk_with_metadata``, or
    ``process_pdf_directory`` to run both steps over every PDF in a folder.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize PDF processor with chunking parameters.

        Args:
            chunk_size: Size of each text chunk (forwarded to the text splitter)
            chunk_overlap: Overlap between chunks for context preservation
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Separators are tried in order: paragraph, line, word, then character.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )

    def read_pdf(self, file_path: str) -> Dict[str, Any]:
        """
        Read PDF file and extract text with metadata.

        Every page is prefixed with a ``--- Page N ---`` marker line. When no
        page yields any non-whitespace text (e.g. an image-only scan), the
        returned text is the empty string so callers can detect the failure
        with a simple truthiness check instead of indexing marker-only chunks.

        Args:
            file_path: Path to the PDF file

        Returns:
            Dictionary with keys ``"text"`` and ``"metadata"``. On a read error
            the error is printed and ``{"text": "", "metadata": {}}`` is
            returned; no exception propagates to the caller.
        """
        try:
            pdf_reader = PdfReader(file_path)
            # Guard against a falsy (None / empty) extraction result so that a
            # single text-less page cannot break the whole document.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

            # Extract PDF metadata
            metadata = {
                "source": file_path,
                "file_name": os.path.basename(file_path),
                "page_count": len(page_texts),
                "file_size": os.path.getsize(file_path),
                "document_type": "PDF"
            }
        except Exception as e:
            # Deliberately best-effort: report and let the caller skip this file.
            print(f"Error reading PDF {file_path}: {str(e)}")
            return {"text": "", "metadata": {}}

        # Page markers alone must not count as extracted content.
        if not any(page_text.strip() for page_text in page_texts):
            return {"text": "", "metadata": metadata}

        text = "".join(
            f"\n--- Page {page_num} ---\n{page_text}"
            for page_num, page_text in enumerate(page_texts, start=1)
        )
        return {"text": text, "metadata": metadata}

    def chunk_with_metadata(self, text: str, base_metadata: Dict[str, Any]) -> List["Document"]:
        """
        Chunk text and add metadata to each chunk.

        Args:
            text: Full text to be chunked
            base_metadata: Base metadata dictionary; copied into every chunk
                (the input dictionary itself is not modified)

        Returns:
            List of Document objects. Each carries ``base_metadata`` plus
            ``chunk_index`` (0-based), ``chunk_size`` (length of the chunk
            text) and ``total_chunks``.
        """
        chunks = self.text_splitter.split_text(text)
        total_chunks = len(chunks)

        return [
            Document(
                page_content=chunk,
                metadata={
                    **base_metadata,
                    "chunk_index": chunk_num,
                    "chunk_size": len(chunk),
                    "total_chunks": total_chunks
                }
            )
            for chunk_num, chunk in enumerate(chunks)
        ]

    def process_pdf_directory(self, directory_path: str) -> List["Document"]:
        """
        Process all PDFs in a directory.

        Only the directory's direct children are considered (no recursion).
        Files are matched on a case-insensitive ``.pdf`` extension and
        processed in sorted order so repeated runs yield the same sequence.

        Args:
            directory_path: Path to directory containing PDFs

        Returns:
            List of chunked documents with metadata; empty when the directory
            does not exist or no PDF produced any text.
        """
        directory = Path(directory_path)
        if not directory.is_dir():
            print(f"Directory not found: {directory_path}")
            return []

        pdf_files = sorted(
            entry for entry in directory.iterdir()
            if entry.is_file() and entry.name.lower().endswith(".pdf")
        )

        all_documents = []
        for pdf_file in pdf_files:
            print(f"Processing: {pdf_file.name}")

            # Read PDF
            pdf_data = self.read_pdf(str(pdf_file))

            if pdf_data["text"]:
                # Chunk with metadata
                documents = self.chunk_with_metadata(
                    pdf_data["text"],
                    pdf_data["metadata"]
                )
                all_documents.extend(documents)
                print(f"  ✓ Extracted {len(documents)} chunks")
            else:
                print("  ✗ Failed to extract text")

        return all_documents



In [None]:

# Example usage
# if __name__ == "__main__":
#     # Initialize processor
#     processor = ProcessAndChunkPDFs(chunk_size=1000, chunk_overlap=200)
    
#     # Process single PDF
#     single_pdf_path = "path/to/your/document.pdf"
#     if os.path.exists(single_pdf_path):
#         pdf_data = processor.read_pdf(single_pdf_path)
#         documents = processor.chunk_with_metadata(pdf_data["text"], pdf_data["metadata"])
        
#         print(f"\nProcessed {len(documents)} chunks:")
#         for doc in documents[:2]:  # Print first 2 chunks
#             print(f"\nMetadata: {doc.metadata}")
#             print(f"Content: {doc.page_content[:200]}...")
    
#     # Process entire directory
#     pdf_directory = "path/to/pdf/directory"
#     if os.path.exists(pdf_directory):
#         all_docs = processor.process_pdf_directory(pdf_directory)
#         print(f"\n\nTotal documents processed: {len(all_docs)}")