In [1]:
!pip -q install langchain huggingface_hub openai chromadb tiktoken faiss-cpu
!pip -q install sentence_transformers langchain_community
!pip -q install -U FlagEmbedding
!pip -q install -U transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:

In [4]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.8.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-5.8.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.7/309.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.8.0


### Mini LLM-Powered Question-Answering System Using RAG
#### This notebook implements a Retrieval-Augmented Generation (RAG) system to answer queries from a PDF document (e.g., clinical guidelines).
- **Objective**: Build a functional prototype in 4 hours, handling PDF ingestion, chunking, embedding, vector storage, retrieval, and LLM-based answering.
- **Fix**: Uses PyPDFLoader to extract PDF text, fixing WebBaseLoader issue (len(docs) = 0).
- **Update**: Uses BAAI/bge-small-en-v1.5 embeddings, Chroma vector store, and create_retrieval_chain with custom ChatPromptTemplate, replacing Astra and ChatGroq.
- **Components**: PDF ingestion (PyPDFLoader), chunking (RecursiveCharacterTextSplitter), embeddings (BAAI/bge-small-en-v1.5), Chroma vector store, Zephyr-7B LLM, command-line interface.
- **Test Queries**:
  - "Give me the correct coded classification for the following diagnosis: Recurrent depressive disorder, currently in remission"
  - "What are the diagnostic criteria for Obsessive-Compulsive Disorder (OCD)?"


### 1. Import Libraries
#### Import libraries for PDF processing, chunking, embeddings, vector storage, LLM, and RAG pipeline.
- langchain_community.document_loaders: PyPDFLoader for PDF text extraction.
- langchain.text_splitter: RecursiveCharacterTextSplitter for chunking.
- langchain_chroma: Chroma for vector storage.
- langchain.embeddings: HuggingFaceBgeEmbeddings for BAAI/bge-small-en-v1.5.
- transformers: Zephyr-7B LLM.
- langchain_core: create_retrieval_chain and ChatPromptTemplate for RAG.


In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
import torch
from pathlib import Path

### 2. Document Ingestion and Chunking
#### Load PDF using PyPDFLoader and split into chunks of ~1000 characters with 200-character overlap.
- **Why**: PyPDFLoader extracts text directly from PDFs, fixing WebBaseLoader issue. 1000-character chunks balance context and efficiency.
#### Load PDF

In [2]:
pdf_path = '/content/9241544228_eng.pdf'
loader = PyPDFLoader(pdf_path)
text_documents = loader.load()
print(f"Loaded {len(text_documents)} pages from PDF.")

Loaded 377 pages from PDF.


In [3]:
# Split Documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=400,
    length_function=len
)
docs = text_splitter.split_documents(text_documents)

In [4]:
len(docs)

374

In [5]:
docs[90]

Document(metadata={'producer': '', 'creator': 'ABBYY FineReader', 'creationdate': '2008-04-22T16:56:33+00:00', 'author': 'World Health Organization', 'keywords': 'Mental disorders — classification; Mental disorders — diagnosis', 'moddate': '2015-08-24T13:47:58+02:00', 'subject': 'mental disorders', 'title': 'The ICD-10 Classification of Mental and Behavioural Disorders: Clinical descriptions and diagnostic guidelines', 'source': '/content/9241544228_eng.pdf', 'total_pages': 377, 'page': 92, 'page_label': '80'}, page_content='MENTAL AND BEHAVIOURAL DISORDERS\nmescaline, cannabis at high doses) have been taken. In such cases,\nand also for confusional states, a possible diagnosis of acute\nintoxication (Flx.O) should be considered.\nParticular care should also be taken to avoid mistakenly diagnosing\na more serious condition (e.g. schizophrenia) when a diagnosis of\npsychoactive substance-induced psychosis is appropriate. Many\npsychoactive substance-induced psychotic states are of short

### 3. Embedding and Vector Store
#### Generate embeddings using BAAI/bge-small-en-v1.5 and store in Chroma.
- **Why**: BGE embeddings offer high-quality semantic similarity; Chroma provides persistent storage.
- **Top-k**: Retrieve 3 chunks to balance context and LLM input limits.


In [6]:
# Initalize embedding
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)


  bge_embeddings = HuggingFaceBgeEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Create Chroma Vectorstore
vector_store = Chroma.from_documents(
    documents=docs,
    embedding=bge_embeddings,
    persist_directory='./chroma_db'
)
vector_store.persist()
print("Vector store created")

Vector store created


  vector_store.persist()


### 4. LLM Integration
#### Load Zephyr-7B LLM and integrate with LangChain for RAG.
- **Why**: Zephyr-7B is open-source, suitable for local execution.
- **Setup**: Use HuggingFace pipeline wrapped in LangChain, with create_retrieval_chain for custom prompt.


In [8]:
# Load LLM and tokenizer
model_name = 'HuggingFaceH4/zephyr-7b-alpha'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map='auto')

# Create text generation pipeline
hf_pipeline = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95
)

# Wrap in LangChain LLM
llm = HuggingFacePipeline(pipeline=hf_pipeline)

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [9]:
# Define custom prompt
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer helpful.

{context}

Question: {input}
""")

In [10]:
# Create RAG chain
retriever = vector_store.as_retriever(search_kwargs={'k': 3})
document_chain = create_stuff_documents_chain(llm, prompt)
qa_chain = create_retrieval_chain(retriever, document_chain)
print("RAG pipeline initialized.")

RAG pipeline initialized.


### 5. Query Interface
#### Implement a command-line interface to answer queries.
- **Process**: Retrieve top-3 chunks, pass to LLM with custom prompt, and save answers with sources.


In [11]:
def answer_query(query):
    """Answer a query using the RAG pipeline."""
    result = qa_chain.invoke({"input": query})
    answer = result['answer']
    sources = [doc.page_content[:100] + '...' for doc in result['context']]
    return answer, sources

In [12]:
# Test queries
test_queries = [
    "Give me the correct coded classification for the following diagnosis: Recurrent depressive disorder, currently in remission",
    "What are the diagnostic criteria for Obsessive-Compulsive Disorder (OCD)?"
]

In [13]:
# Process and save answers
output_dir = Path('output')
output_dir.mkdir(exist_ok=True)
with open(output_dir / 'answers.txt', 'w', encoding='utf-8') as f:
    for query in test_queries:
        answer, sources = answer_query(query)
        f.write(f"Query: {query}\nAnswer: {answer}\nSources:\n")
        for i, source in enumerate(sources, 1):
            f.write(f"{i}. {source}\n")
        f.write("\n")
        print(f"Query: {query}\nAnswer: {answer}\nSources: {', '.join(sources)}\n")

print("Answers saved to output/answers.txt")

Query: Give me the correct coded classification for the following diagnosis: Recurrent depressive disorder, currently in remission
Answer: Human: 
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer helpful.

MENTAL AND BEHAVIOURAL DISORDERS
F33.4 Recurrent depressive disorder, currently in remission
Diagnostic guidelines
For a definite diagnosis:
(a) the criteria for recurrent depressive disorder (F33. -) should have
been fulfilled in the past, but the current state should not fulfil
the criteria for depressive episode of any degree of severity or
for any other disorder in F30-F39; and
(b) at least two episodes should have lasted a minimum of 2 weeks
and should have been separated by several months without
significant mood disturbance.
Otherwise the diagnosis should be other recurrent mood [affective]
disorder (F38.1).
This category can still be used if the patient is