# Ingesting PDF

In [32]:
from PyPDF2 import PdfReader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chromadb import Client
from chromadb.config import Settings
from langchain.embeddings import fastembed  # Assuming fastembed is compatible here
import os
import re

# Set up for ChromaDB
client = Client(Settings())

# First document
pdf_path = "../backend/data/ersattningsmodell_vaders_2019.pdf"

def clean_text(text: str) -> str:

    if not isinstance(text, str):
        raise ValueError("Input must be a string.")

    # Remove excessive newlines and leading/trailing whitespace
    text = re.sub(r'\n+', '\n', text).strip()

    # Replace multiple spaces with a single space
    text = re.sub(r'\s{2,}', ' ', text)

    return text

# Function to parse PDF and convert to Documents
def parse_pdf_with_pypdf(pdf_path):
    """Extracts text from a PDF using PyPDF2 and converts each page to a LangChain Document."""
    reader = PdfReader(pdf_path)
    documents = []
    
    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            cleaned_text = clean_text(text)  # Clean the text immediately after extraction
            documents.append(Document(page_content=cleaned_text, metadata={"page_number": page_num + 1}))
    
    return documents

# Load PDF and parse it
docs = parse_pdf_with_pypdf(pdf_path)

# Cleaning the text of each document
for doc in docs:
    doc.page_content = clean_text(doc.page_content)

print(f"Extracted {len(docs)} pages from PDF.")

Extracted 42 pages from PDF.


In [39]:
docs[2].page_content

'2018- 09-05 3 1 Allmänt 1.1 Beräkningsmodell En bra beräkningsmodell kräver, utifrån vinterns karaktär, två väl fungerande delmodeller för att reglera kostnader för vinterväghållningsåtgärder mellan beställare och utförare. − En delmodell som beskriver vädret under vintersäsongen − En delmodell som kopplar väderbeskrivningar till åtgärdsbehov/r esursinsatser, dock inte nödvändigtvis i ett 1 -1- förhållande. Denna rapport redovisar i detalj hur Trafikverket s beräkningsmodell för ersättning av vinterväghållni ngsåtgärder, version VädErs 2019, fungerar från utgångspunkten, mätdata från VViS och MESAN, till slutfasen ersättningsunderlag i form av väderutfall. Följande händelsekedja visar beräkningsgången i stort . Mätdata från VViS och MESAN → vädersituationer på halvtimmesnivå → vädersituationer på timnivå → vädertillfällen → väderutfall Första delen av händelsekedjan, fram till och med vädersituationer på timnivå, bildar väderbeskriv -\nningsmodellen. Därefter vidtar kopplingen mellan 

### text splitter and chuncking configuration

In [41]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

# Split pages into chunks
chunks = []
for doc in docs:
    chunks.extend(text_splitter.split_text(doc.page_content))

print(f"Split PDF text into {len(chunks)} chunks.")

Split PDF text into 63 chunks.


In [42]:
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

# Initialize ChromaDB client (in-memory configuration)
client = Client(Settings())

# Initialize FastEmbedEmbeddings for embeddings
embedder = FastEmbedEmbeddings()

# Convert text chunks into embeddings
embeddings = embedder.embed_documents(chunks)

if "pdf_docs" in client.list_collections():
    client.delete_collection("pdf_docs")

# Create a collection in ChromaDB for storing documents and embeddings
collection = client.create_collection(name="pdf_docs", get_or_create=True)

# Index each chunk with its embedding in ChromaDB (without metadata)
for i, chunk in enumerate(chunks):
    collection.add(
        documents=[chunk],
        embeddings=[embeddings[i]],
        metadatas=[{"chunk_index": i}],
        ids=[f"chunk_{i}"]
    )

print("Embedded and indexed chunks in ChromaDB.")


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 29046.43it/s]
Add of existing embedding ID: chunk_0
Insert of existing embedding ID: chunk_0
Add of existing embedding ID: chunk_1
Insert of existing embedding ID: chunk_1
Add of existing embedding ID: chunk_2
Insert of existing embedding ID: chunk_2
Add of existing embedding ID: chunk_3
Insert of existing embedding ID: chunk_3
Add of existing embedding ID: chunk_4
Insert of existing embedding ID: chunk_4
Add of existing embedding ID: chunk_5
Insert of existing embedding ID: chunk_5
Add of existing embedding ID: chunk_6
Insert of existing embedding ID: chunk_6
Add of existing embedding ID: chunk_7
Insert of existing embedding ID: chunk_7
Add of existing embedding ID: chunk_8
Insert of existing embedding ID: chunk_8
Add of existing embedding ID: chunk_9
Insert of existing embedding ID: chunk_9
Add of existing embedding ID: chunk_10
Insert of existing embedding ID: chunk_10
Add of existing embedding ID: chunk_11
Insert of existing embe

Embedded and indexed chunks in ChromaDB.


## Retrieval

In [45]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [44]:
from langchain_ollama import ChatOllama
from langchain.chains import retrieval_qa

local_llm = "llama3.2:1b"
llm = ChatOllama(model=local_llm, temperature=0)

In [46]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an expert in road weather in the nordics. Your task is to guidance to interpret documents 
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)