### 1. Data Ingestion into the Vector Database

1.1. Read documents 

In [5]:
import os
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
def process_all_documents(pdf_dir):
    """Load every PDF found directly inside ``pdf_dir`` into one flat list.

    Args:
        pdf_dir: Directory to scan, as a string or ``Path``. Only entries
            matching ``*.pdf`` at the top level are read; sub-directories
            are not searched.

    Returns:
        A list holding every document returned by ``PyMuPDFLoader.load()``
        for each file, visited in sorted path order. Each document's
        ``metadata['source']`` is set to the file path as a string.
        A file that fails to load is reported on stdout and skipped, so the
        result may cover fewer files than were found. An empty list is
        returned when the directory does not exist.
    """
    all_documents = []
    pdf_dir = Path(pdf_dir)

    # A mistyped relative path would otherwise be indistinguishable from
    # an existing folder that simply contains no PDFs.
    if not pdf_dir.is_dir():
        print(f"Directory not found: {pdf_dir}")
        return all_documents

    # glob() yields entries in arbitrary filesystem order; sorting keeps the
    # document order (and everything derived from it) stable across runs.
    pdf_files = sorted(pdf_dir.glob('*.pdf'))
    print(f"Found {len(pdf_files)} PDF files in {pdf_dir}")

    for pdf_file in pdf_files:
        print(f'Processing file {pdf_file}')
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            for document in documents:
                # Record which file each document came from.
                document.metadata['source'] = str(pdf_file)
                all_documents.append(document)
            print(f'Loaded {len(documents)} pages')
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error processing file {pdf_file}: {e}")

    print(f'Total documents: {len(all_documents)}')

    return all_documents

In [14]:
# Folder holding the source PDFs; the path is relative, so it resolves
# against the notebook's current working directory.
pdf_dir = '../data/pdf/'
# Load every PDF in that folder into one flat list of documents.
all_documents = process_all_documents(pdf_dir)

Found 3 PDF files in ..\data\pdf
Processing file ..\data\pdf\attention-is-all-you-need.pdf
Loaded 11 pages
Processing file ..\data\pdf\Deep_Residual_Learning_CVPR_2016_paper.pdf
Loaded 9 pages
Processing file ..\data\pdf\Generic Algorithm.pdf
Loaded 4 pages
Total documents: 24


In [17]:
# Sanity check: show the class of the first loaded item.
print(type(all_documents[0]))

<class 'langchain_core.documents.base.Document'>


1.2. Chunking

In [22]:
def split_documents(documents, chunk_size = 1000, chunk_overlap = 100):
    """Split documents into smaller chunks and print a short summary.

    Args:
        documents: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Length is
            measured with ``len``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter returns. When it is non-empty, the chunk count
        and the first 50 characters of the first chunk are printed;
        otherwise a warning is printed.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = text_splitter.split_documents(documents)

    # Nothing came back: warn and hand the empty result to the caller.
    if not pieces:
        print('No documents were splitted. Please check the input documents.')
        return pieces

    preview = pieces[0].page_content[:50]
    print(f'Total splitted documents splitted from {len(documents)} documents: {len(pieces)} chunks')
    print(f'First splitted document content:\n{preview}\n')
    return pieces

# Chunk all loaded documents using the default chunk_size / chunk_overlap.
chunks = split_documents(all_documents)

Total splitted documents splitted from 24 documents: 104 chunks
First splitted document content:
Attention Is All You Need
Ashish Vaswani∗
Google B



1.3. Embedding all documents

In [23]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm
