This notebook provides example code for building a basic retrieval-augmented generation (RAG) system with the LangChain framework.

In [29]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

import re

In [22]:

# Step 1: Load a PDF and extract text
pdf_path = "data/div-class-title-relaxing-assumptions-improving-inference-integrating-machine-learning-and-the-linear-regression-div.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load()

# Step 2: Split text into manageable chunks
# (up to 1000 characters per chunk, with 100 characters of overlap between neighbours)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.split_documents(pages)
if not documents:
    # Fail early with a clear message instead of indexing nothing and
    # getting empty retrieval results later on.
    raise ValueError(f"No text chunks could be extracted from {pdf_path!r}")

# Step 3: Embed the documents using a HuggingFace model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 4: Store them in a Chroma vector store for retrieval
vectorstore = Chroma(
    collection_name="collection",
    embedding_function=embedding_model,
    persist_directory="./data/chroma_langchain_db",  # Where to save data locally, remove if not necessary
)
# The collection is persisted on disk, so re-running this cell without explicit
# ids would append a fresh copy of every chunk and retrieval would start
# returning duplicates. Deterministic ids make the insert idempotent: the same
# chunk overwrites its previous entry instead of being added again.
# NOTE: if you change the chunking parameters, delete the persist directory
# first so that stale chunks from the old split do not linger in the collection.
chunk_ids = [f"{pdf_path}::chunk-{index}" for index in range(len(documents))]
vectorstore.add_documents(documents=documents, ids=chunk_ids)

# Step 5: Load a local LLM from Hugging Face
model_id = "Gensyn/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use half precision only when a GPU is available. On a CPU-only machine
# float16 is slow and some operations may not be supported, so fall back to
# float32 there.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=model_dtype)

# Create a generation pipeline (each response is capped at 300 newly generated tokens)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)

# Wrap in LangChain's HuggingFacePipeline so the chain below can use it as its LLM
llm = HuggingFacePipeline(pipeline=generator)

# Step 6: Set up Retrieval-Augmented Generation (RAG)
# The prompt exposes two placeholders: {context} receives the retrieved
# passages and {question} receives the user's query.
prompt_template = """You are an academic assistant. Use the following context to answer the user's question in a concise and helpful way.
If you don't know the answer, say "I don't know."

Context:
{context}

Question:
{question}

Answer:"""

prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# Retriever over the Chroma store built above, with default search settings.
retriever = vectorstore.as_retriever()

# Assemble the question-answering chain from the LLM, the retriever and the
# custom prompt; the source documents are requested alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)


Device set to use cuda:0


In [None]:
# Step 7: Ask questions based on PDF content
query = "Summarize PLCE"
print("Question:", query)
result = qa_chain.invoke({"query": query})

# Parse the actual answer out of the LLM response with a regex.
# The raw text appears to echo the prompt before the completion (TODO confirm
# for your pipeline settings), so keep only what follows the "Answer:" marker.
raw_output = result["result"]
match = re.search(r"Answer:\s*(.*)", raw_output, re.DOTALL)
# re.search returns None when the marker is absent (for example if the
# pipeline is configured to return only the completion); fall back to the
# whole output instead of raising AttributeError on None.
answer_text = match.group(1).strip() if match else raw_output.strip()

# Output result
print("Answer:", answer_text)

Question: Summarize PLCE
Answer: The Partially Linear Causal Effect (PLCE) model is an extension of the partially linear model, incorporating exogenous interference, heteroskedasticity in the treatment assignment mechanism, and random effects. Under causal assumptions, it provides a causal estimate of the treatment on the outcome. The model includes equations for the treatment and outcome models, with specific conditions on the error terms. It differs from other econometric methods, such as the Generalized Method of Moments (GMM) or the Generalized Empirical Likelihood (GEL). The proposed model has been used to analyze racial threat distance turnout data. Overall, the PLCE model offers a flexible framework for causal analysis in situations involving complex interactions between variables.
