In [1]:
import glob
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide settings; edit these before running the cells below.

# Despite the name, this currently points at a single PDF file. load_pdfs()
# accepts either one file or a folder containing PDFs.  <-- change this
PDF_FOLDER = Path("../data/The Godfather Summary.pdf")

# Model identifier handed to HuggingFaceEmbeddings.
EMBED_MODEL = "intfloat/e5-mistral-7b-instruct"

# Model identifier handed to AutoTokenizer / AutoModelForCausalLM.
# Alternative kept for reference: "mistralai/Mistral-7B-Instruct-v0.3"
GEN_MODEL = "mistral-3b"

# Text-splitter settings: chunk length and overlap between neighbouring
# chunks, both measured with len().
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

In [3]:
def load_pdfs(pdf_path: Path):
    """Load the pages of one PDF file, or of every PDF directly inside a folder.

    Args:
        pdf_path: Path to a single PDF file or to a directory. For a
            directory, only entries matching ``*.pdf`` at its top level are
            read (no recursion into sub-folders).

    Returns:
        A flat list holding whatever ``PyPDFLoader(...).load()`` returns for
        each file, concatenated in sorted path order. The list is empty when
        a directory contains no ``*.pdf`` entries.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not pdf_path.exists():
        # A mistyped path is neither a file nor a globbable folder; without
        # this check it would quietly yield an empty document list.
        raise FileNotFoundError(f"PDF path does not exist: {pdf_path}")

    # Accept a single file or a folder
    if pdf_path.is_file():
        paths = [pdf_path]
    else:
        # glob() yields entries in filesystem order; sorting keeps the
        # document order identical from run to run.
        paths = sorted(pdf_path.glob("*.pdf"))

    docs = []
    for p in paths:
        docs.extend(PyPDFLoader(str(p)).load())
    return docs

# Load the source PDF(s); the splitter cell below consumes raw_docs.
raw_docs = load_pdfs(PDF_FOLDER)
if not raw_docs:
    # Stop here with a clear message instead of letting an empty list flow
    # into the splitter and vector-store cells.
    raise ValueError(f"No PDF pages were loaded from {PDF_FOLDER}")
print(f"Loaded {len(raw_docs)} pages from {PDF_FOLDER}")

Loaded 3 pages from ../data/The Godfather Summary.pdf


In [4]:
# Cut the loaded pages into overlapping chunks. Chunk length and overlap come
# from the config cell and are measured with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# One input page may produce several chunks; report how many we ended up with.
chunks = text_splitter.split_documents(raw_docs)
print(f"Created {len(chunks)} chunks")

Created 15 chunks


In [None]:
# Embed every chunk with the configured embedding model and index the
# resulting vectors in a FAISS store.
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Retriever over the store, configured with k=6 (top-6 chunks per query).
retriever = vectorstore.as_retriever(search_kwargs=dict(k=6))

Fetching 2 files: 100%|██████████| 2/2 [03:35<00:00, 108.00s/it]
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# Generation side of the pipeline: tokenizer + causal LM, wrapped in a
# transformers text-generation pipeline and exposed to LangChain as `llm`.
#
# NOTE(review): the saved output of this cell ends in an MPS out-of-memory
# RuntimeError, so `llm` was never created in that run. Confirm the model fits
# on the target device before relying on the cells below.
# NOTE(review): trust_remote_code=True is passed to both loaders; confirm the
# checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, trust_remote_code=True)

# No device_map is passed (device_map="auto" was left disabled in this
# notebook); the dtype is left for the library to choose via "auto".
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    trust_remote_code=True,
    torch_dtype="auto",
)

# Sampling is switched on with a low temperature and nucleus cut-off; each
# call may generate at most 300 new tokens.
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    max_new_tokens=300,
)

llm = HuggingFacePipeline(pipeline=pipe)


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 38.79it/s]
Device set to use mps:0


RuntimeError: MPS backend out of memory (MPS allocated: 9.04 GiB, other allocations: 656.00 KiB, max allowed: 9.07 GiB). Tried to allocate 1.02 GiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [23]:
# Prompt text for the answer step. It contains two placeholders, {context}
# and {question}, and tells the model to answer only from the supplied
# context, replying "I don't know" when the context lacks the answer.
# The string is runtime text: edit it deliberately, not cosmetically.
template = """You are a helpful assistant. Answer the question using ONLY the provided context.
If the context does not contain the answer, say "I don't know".

Context:
{context}

Question: {question}
Answer:"""

# Wrap the raw string in a LangChain PromptTemplate so it can be used as a
# step in the chain built further down.
prompt = PromptTemplate.from_template(template)

In [24]:
def format_context(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by one
        blank line; an empty string when ``docs`` is empty.
    """
    passages = []
    for document in docs:
        passages.append(document.page_content)
    return "\n\n".join(passages)

# Assemble the question-answering chain by piping four stages together:
#   1. a dict that fills the prompt's two inputs: "context" comes from
#      `retriever | format_context`, "question" from RunnablePassthrough()
#      (presumably the caller's input forwarded unchanged; confirm against
#      the LangChain docs);
#   2. `prompt`, the PromptTemplate built from the template string;
#   3. `llm`, the HuggingFacePipeline wrapper;
#   4. StrOutputParser() as the final stage.
# This cell needs `retriever`, `prompt` and `llm` from earlier cells; the
# saved run shows a NameError here because `llm` had not been defined.
rag_chain = (
    {"context": retriever | format_context, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

NameError: name 'llm' is not defined

In [None]:
# Ask something the indexed document can answer. The only PDF configured in
# this notebook is a summary of The Godfather, and the prompt tells the model
# to reply "I don't know" to anything the retrieved context does not cover.
question = "What is the main plot of The Godfather?"

# Run retrieval + generation end to end and show the resulting text.
answer = rag_chain.invoke(question)
print("\n--- ANSWER ---")
print(answer)