In [3]:
import pathlib

# Directory that holds the raw PDF corpus (relative to the working directory).
PDF_DIR = pathlib.Path("data/raw")

# Gather the PDFs sitting directly in PDF_DIR (non-recursive) and sort them so
# the processing order is the same on every run.
pdf_paths = list(PDF_DIR.glob("*.pdf"))
pdf_paths.sort()

# Stop right away when nothing matched the pattern.
assert len(pdf_paths) > 0, "No PDFs found - check path"

from langchain_community.document_loaders import UnstructuredPDFLoader, PyPDFLoader
from langchain.docstore.document import Document
from tqdm import tqdm

# Read every PDF into LangChain Documents and tag each one with a short source
# name and its page index.
source_docs = []   # Documents from every PDF that could be read
failed_pdfs = []   # (file name, error text) for every PDF that could not be read

for path in tqdm(pdf_paths, desc="Reading PDFs"):
    try:
        loader = PyPDFLoader(str(path))
        pages = list(loader.lazy_load())
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run, but
        # remember it so the failure is not lost in the progress-bar output.
        print(f"Failed at {path.name}: {e}")
        failed_pdfs.append((path.name, str(e)))
        continue
    for p in pages:
        # The Documents come from PyPDFLoader; its "page" metadata entry is the
        # 0-based page index (page_label holds the printed page number).
        p.metadata["source"] = path.stem                 # short source name
        p.metadata["page_num"] = p.metadata.get("page")  # keep original page index
    source_docs.extend(pages)

# Summarise skipped files once, after the progress bar has finished.
if failed_pdfs:
    skipped_names = [name for name, _ in failed_pdfs]
    print(f"Skipped {len(failed_pdfs)} of {len(pdf_paths)} PDFs: {skipped_names}")

# Without any pages the later splitting / indexing cells cannot do useful work.
assert source_docs, "No pages could be loaded from any PDF - check the files in PDF_DIR"

Reading PDFs: 100%|██████████| 13/13 [00:11<00:00,  1.17it/s]


In [4]:
import re

def _normalize_whitespace(text):
    """Tidy the whitespace of one page of extracted text.

    - Removes trailing spaces/tabs (any whitespace except a newline) at the end
      of each line.
    - Collapses runs of three or more newlines into a single blank line.
    - Strips leading and trailing whitespace from the whole text.

    Paragraph breaks ("\\n\\n") are preserved. The earlier pattern ``\\s+\\n``
    also matched newlines, so it collapsed every blank line into a single
    newline and made the ``\\n{3,}`` rule unreachable.
    """
    # [^\S\n] = any whitespace character that is not a newline.
    text = re.sub(r"[^\S\n]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# Clean every loaded page in place.
for d in source_docs:
    d.page_content = _normalize_whitespace(d.page_content)

In [11]:
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive text splitter built from the "thenlper/gte-small" Hugging Face
# tokenizer, configured with chunk_size=200 and chunk_overlap=20.
# add_start_index=True is what puts a "start_index" entry into each chunk's metadata.
# NOTE(review): the name "spliiter" is misspelled; it is kept as-is because the
# splitting loop refers to it by this name.
spliiter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained("thenlper/gte-small"),
    strip_whitespace=True,
    add_start_index=True,
    chunk_overlap=20,
    chunk_size=200,
)

print("Splitting and deduplicating...")

seen = set()          # chunk texts that have already been kept
docs_processed = []   # unique chunks, in first-seen order

# Split one page at a time so the progress bar advances per page; a chunk is
# kept only the first time its exact text appears anywhere in the corpus.
for page in tqdm(source_docs):
    page_chunks = spliiter.split_documents([page])
    for piece in page_chunks:
        body = piece.page_content
        if body in seen:
            continue
        seen.add(body)
        docs_processed.append(piece)

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Splitting and deduplicating...


100%|██████████| 432/432 [00:03<00:00, 137.62it/s]


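The cell above splits each cleaned page into overlapping chunks (chunk size 200, overlap 20, measured with the `thenlper/gte-small` tokenizer) and drops chunks whose text is an exact duplicate of one already seen. The resulting list of LangChain `Document` objects, `docs_processed`, is what gets embedded and indexed with FAISS further below.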

In [None]:
## SANITY CHECK - MAKE SURE THAT OUR CODE WORKED AND DOCUMENTS WERE CREATED

# Fail with a readable message (instead of a bare IndexError below) when the
# splitting step produced nothing.
assert docs_processed, "No chunks were created - re-run the loading and splitting cells"

# Inspect chunk 15 when there are enough chunks, otherwise fall back to the last one.
sample_index = min(15, len(docs_processed) - 1)
doc = docs_processed[sample_index]
print(type(doc))
print(doc.metadata)
print(doc.page_content[:1000])   # first 1000 characters are enough for a visual check

<class 'langchain_core.documents.base.Document'>
{'producer': 'iLovePDF', 'creator': 'Acrobat PDFMaker 11 for Word', 'creationdate': '2023-12-06T15:01:50+05:30', 'author': 'Amol Dighe', 'company': '', 'sourcemodified': 'D:20231206092752', 'subject': 'A Roadmap prepared by the Indian Nuclear Physics Communitywith TIFR, Mumbai as the Nodal Scientific Institution', 'title': 'Mega Science Vision – 2035   Nuclear Physics', 'moddate': '2024-01-24T10:12:18+00:00', 'source': 'DST - MSV2035-NP-Final', 'total_pages': 140, 'page': 14, 'page_label': '15', 'page_num': 14, 'start_index': 0}
MEGA SCIENCE VISION – 2035   NUCLEAR PHYSICS
3
THE DRAFTING AND WO RKING GROUPS
Director TIFR, Mumbai –
Dr. Jayaram Chengalur / Dr. S. Ramakrishnan / Dr. Sandip Trivedi Chairperson
Members from the D rafting Group
Dr. Alphonsa Joseph Palakkel, IPR, Gandhinagar Member
Dr. Aradhana Srivastava, BARC, Mumbai Member
Dr. Bedangadas Mohanty, NISER, Bhubaneswar Member
Dr. Rudrajyoti Palit, TIFR, Mumbai Member
Other exper

In [13]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import DistanceStrategy

# Embedding model used for both indexing and querying.
# Requires the `sentence-transformers` package in the kernel environment;
# constructing HuggingFaceEmbeddings raises ImportError without it.
# normalize_embeddings=True asks for unit-length vectors.
# NOTE(review): LangChain's FAISS wrapper appears to back the COSINE strategy
# with an L2 index, in which case unit-length vectors are what make the
# distances rank like cosine similarity - confirm against the installed version.
embed = HuggingFaceEmbeddings(
    model_name="thenlper/gte-small",
    encode_kwargs={"normalize_embeddings": True},
)

# Embed every chunk and build the in-memory FAISS index.
vector_db = FAISS.from_documents(
    docs_processed,
    embedding=embed,
    distance_strategy=DistanceStrategy.COSINE,
)

# Persist the index; save_local is an instance method, so call it on the store
# itself rather than through the class.
vector_db.save_local("data/processed/faiss_index")

  embed = HuggingFaceEmbeddings(model_name="thenlper/gte-small")


ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.