### **STEP A — Install Dependencies**

In [4]:
!pip install -q langchain langchain-community langchain-text-splitters pypdf sentence-transformers faiss-cpu


### **STEP B — Upload your PDF rulebook**

In [5]:
from google.colab import files

print("Upload your Medical PDF:")
uploaded = files.upload()

pdf_path = list(uploaded.keys())[0]
pdf_path


Upload your Medical PDF:


Saving guideline-170-en.pdf to guideline-170-en (2).pdf


'guideline-170-en (2).pdf'

### **STEP C — Load the PDF (Auto-OCR if needed)**

In [6]:
!pip install unstructured
!pip install "unstructured[pdf]"
!pip install pdfminer.six
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [7]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(pdf_path)
pages = loader.load()

print(len(pages))
print(pages[0].page_content[:1500])


409
Page 1 / 409
Exported on: 30/01/2025
Clinical guidelines - Diagnosis and treatmentmanual
For curative programmes in hospitals and dispensariesGuidance for prescribing
 
 
© Médecins Sans Frontières
All rights reserved for all countries. No reproduction, translation and adaptation may be done without the prior
permission of the Copyright owner.
We're working to ﬁx some technical issues in the MSF Guidelines app affecting ofﬂine access ×


### **STEP D — Create Chunks Using Latest LangChain Splitter**

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separators=["\n\n", "\n", ".", " "]
)

chunks = text_splitter.split_documents(pages)

print("Total chunks generated:", len(chunks))


Total chunks generated: 978


### **STEP E — Save Chunks to Google Drive**

In [9]:
import os

kb_dir = "/content/drive/MyDrive/medical_kb_chunks"
os.makedirs(kb_dir, exist_ok=True)

for i, chunk in enumerate(chunks):
    with open(f"/content/drive/MyDrive/medical_kb_chunks/chunk_{i}.txt", "w", encoding="utf-8") as f:
        f.write(chunk.page_content)


print("Chunks saved to:", kb_dir)


Chunks saved to: /content/drive/MyDrive/medical_kb_chunks


### **STEP F — Save All Chunks into One JSON File**

In [10]:
import json

json_path = "/content/drive/MyDrive/medical_kb.json"
json.dump([chunk.page_content for chunk in chunks], open(json_path, "w"))

print("All chunks saved in:", json_path)


All chunks saved in: /content/drive/MyDrive/medical_kb.json


### **STEP G — Verify KB Loaded Correctly**

In [19]:
import re
import glob

def natural_sort(file_list):
    return sorted(file_list, key=lambda x: int(re.findall(r'\d+', x)[-1]))

files = natural_sort(glob.glob("/content/drive/MyDrive/medical_kb_chunks/*.txt"))
files[:10]   # show first 10 files


['/content/drive/MyDrive/medical_kb_chunks/chunk_0.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_1.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_2.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_3.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_4.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_5.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_6.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_7.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_8.txt',
 '/content/drive/MyDrive/medical_kb_chunks/chunk_9.txt']

In [18]:
from google.colab import drive
drive.mount('/content/drive')


ValueError: Mountpoint must not already contain files

In [21]:
!mkdir -p /content/drive/MyDrive/medical_kb_v2


In [22]:
for i, chunk in enumerate(chunks):
    path = f"/content/drive/MyDrive/medical_kb_v2/chunk_{i}.txt"
    with open(path, "w", encoding="utf-8") as f:
        f.write(chunk.page_content)


In [23]:
!ls -l /content/drive/MyDrive/medical_kb_v2 | head


total 3912
-rw-r--r-- 1 root root  449 Dec  2 12:02 chunk_0.txt
-rw-r--r-- 1 root root  992 Dec  2 12:02 chunk_100.txt
-rw-r--r-- 1 root root  932 Dec  2 12:02 chunk_101.txt
-rw-r--r-- 1 root root  572 Dec  2 12:02 chunk_102.txt
-rw-r--r-- 1 root root  927 Dec  2 12:02 chunk_103.txt
-rw-r--r-- 1 root root  532 Dec  2 12:02 chunk_104.txt
-rw-r--r-- 1 root root  990 Dec  2 12:02 chunk_105.txt
-rw-r--r-- 1 root root  976 Dec  2 12:02 chunk_106.txt
-rw-r--r-- 1 root root 1018 Dec  2 12:02 chunk_107.txt


In [None]:
!mkdir -p /content/drive/MyDrive/medical_kb_chunks


In [14]:
for i, chunk in enumerate(chunks):
    path = f"/content/drive/MyDrive/medical_kb_chunks/chunk_{i}.txt"
    with open(path, "w", encoding="utf-8") as f:
        f.write(chunk.page_content)


In [20]:
!ls -l /content/drive/MyDrive/medical_kb_chunks


total 3912
-rw-r--r-- 1 root root  449 Dec  2 11:36 chunk_0.txt
-rw-r--r-- 1 root root  992 Dec  2 11:36 chunk_100.txt
-rw-r--r-- 1 root root  932 Dec  2 11:36 chunk_101.txt
-rw-r--r-- 1 root root  572 Dec  2 11:36 chunk_102.txt
-rw-r--r-- 1 root root  927 Dec  2 11:36 chunk_103.txt
-rw-r--r-- 1 root root  532 Dec  2 11:36 chunk_104.txt
-rw-r--r-- 1 root root  990 Dec  2 11:36 chunk_105.txt
-rw-r--r-- 1 root root  976 Dec  2 11:36 chunk_106.txt
-rw-r--r-- 1 root root 1018 Dec  2 11:36 chunk_107.txt
-rw-r--r-- 1 root root  771 Dec  2 11:36 chunk_108.txt
-rw-r--r-- 1 root root 1009 Dec  2 11:36 chunk_109.txt
-rw-r--r-- 1 root root  228 Dec  2 11:36 chunk_10.txt
-rw-r--r-- 1 root root  955 Dec  2 11:36 chunk_110.txt
-rw-r--r-- 1 root root  682 Dec  2 11:36 chunk_111.txt
-rw-r--r-- 1 root root  906 Dec  2 11:36 chunk_112.txt
-rw-r--r-- 1 root root  992 Dec  2 11:36 chunk_113.txt
-rw-r--r-- 1 root root  751 Dec  2 11:36 chunk_114.txt
-rw-r--r-- 1 root root  696 Dec  2 11:36 chunk_115.txt
-r