In [1]:
import os
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Hugging Face API key from environment variable
HUGGING_FACE_API = os.environ.get("HUGGING_FACE_API", None)
if not HUGGING_FACE_API:
    print("Warning: HUGGING_FACE_API environment variable not set")

In [3]:
PDF_FOLDER = Path("../data/The Godfather Summary.pdf")      # <-- change this
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

In [4]:
def load_pdfs(pdf_path: Path):
    docs = []
    # Accept a single file or a folder
    if pdf_path.is_file():
        paths = [pdf_path]
    else:
        paths = list(pdf_path.glob("*.pdf"))

    for p in paths:
        loader = PyPDFLoader(str(p))          # <-- PyPDFLoader in action
        docs.extend(loader.load())
    return docs


raw_docs = load_pdfs(PDF_FOLDER)
print(f"Loaded {len(raw_docs)} pages from {PDF_FOLDER}")

Loaded 3 pages from ../data/The Godfather Summary.pdf


In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
chunks = text_splitter.split_documents(raw_docs)
print(f"Created {len(chunks)} chunks")

Created 15 chunks


In [6]:
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})  # top-6 chunks

In [7]:
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    device_map="auto",          # uses GPU if available, else CPU
    torch_dtype="float32",
    trust_remote_code=True,
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    temperature=0.2,
    top_p=0.95,
    do_sample=True,
)
llm = HuggingFacePipeline(pipeline=pipe)

`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 3 files: 100%|██████████| 3/3 [03:52<00:00, 77.54s/it] 
Loading checkpoint shards: 100%|██████████| 3/3 [00:14<00:00,  4.95s/it]
Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps


In [8]:
template = """You are a helpful assistant. Answer the question using ONLY the provided context.
If the context does not contain the answer, say "I don't know".

Context:
{context}

Question: {question}
Answer:"""

prompt = PromptTemplate.from_template(template)

In [9]:
def format_context(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_context, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [10]:
question = "What is the main conclusion of the paper about climate change?"
answer = rag_chain.invoke(question)
print("\n--- ANSWER ---")
print(answer)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- ANSWER ---
You are a helpful assistant. Answer the question using ONLY the provided context.
If the context does not contain the answer, say "I don't know".

Context:
Michael Sragow is a film critic and columnist who has 
written for “The Orange County Register,” “The Baltimore 
Sun,” “The San Francisco Examiner,” “The New Times,” “The 
New  Yorker” (where he worked with Pauline Kael), “The 
Atlantic” and salon.com. Sragow also edited James Agee's 
film essays (for the book “Agee on Film”), and has written or 
contributed to several other cinema-related books.  
The views expressed in these essays are those of the author and do 
not necessarily represent the views of the Library of Congress.  
Robert De Niro’s young Vito has the same careful intelli-
gence, focused warmth, and regal bearing as Brando’s 
Don. But he’s a lithe young man with a smidgen of naïve 
enthusiasm. When he and his partner take proprietary 
pride in their olive-oil company front, they look as de-
lighted as a