In [1]:
import json
import os
from typing import List

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# CONFIG
# Input and output locations. Each one can be overridden with an environment
# variable so the notebook also runs outside the original checkout; when the
# variable is unset the value below is used.
JSONL_PATH = os.environ.get(
    "TAXONOMY_JSONL_PATH",
    r"/home/abk/abk/projects/Major-project-basic-ui/backend/taxonomy_data/merged_taxonomic_chunks.jsonl",
)
DB_FAISS_PATH = os.environ.get(
    "TAXONOMY_FAISS_PATH",
    "/home/abk/abk/projects/Major-project-basic-ui/backend/vectorstore",
)

In [3]:
def load_jsonl_documents(jsonl_path: str) -> List[Document]:
    """Read a JSONL file of taxonomic chunks and wrap each record in a ``Document``.

    Every non-blank line is parsed as one JSON object. Its ``text`` field is
    combined with the species, genus, section and authority into an enriched
    ``page_content`` string, and the descriptive fields are copied into the
    document metadata.

    Args:
        jsonl_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        One ``Document`` per record that has non-empty ``text``, in file order.
        Blank lines and records whose ``text`` is missing, null or only
        whitespace are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message includes the file path and line number.
    """
    documents = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank separator lines are not records; json.loads would reject them.
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as exc:
                # Same exception type as before, with enough context to find the bad line.
                raise json.JSONDecodeError(
                    f"{exc.msg} (line {line_no} of {jsonl_path})", exc.doc, exc.pos
                ) from exc

            # `or` rather than a .get() default: a key that is present but null
            # (or empty) must also fall back, otherwise "None" ends up in the
            # text that gets embedded.
            species = data.get("species_name") or "Unknown Species"
            genus = data.get("genus") or "Unknown Genus"
            section = data.get("section") or "General Info"
            authority = data.get("authority") or ""
            raw_text = str(data.get("text") or "").strip()

            if not raw_text:
                continue

            enriched_text = (
                f"TAXONOMIC DATA FOR: {species} (Genus: {genus})\n"
                f"SECTION: {section}\n"
                f"AUTHORITY: {authority}\n"
                f"DESCRIPTION AND KEY FEATURES:\n{raw_text}\n"
                f"IDENTIFICATION SUMMARY: This chunk describes the morphological characters of {species}."
            )

            metadata = {
                "chunk_type": data.get("chunk_type"),
                "species_name": species,
                "genus": genus,
                "section": section,
                "title": data.get("title"),
                "authority": authority,
                "year": data.get("year"),
                "source_file": data.get("source_file"),
                "paragraph_span": data.get("paragraph_span"),
            }

            documents.append(Document(page_content=enriched_text, metadata=metadata))
    return documents

In [4]:
def create_vector_db(jsonl_path=None, db_path=None, device=None) -> None:
    """Embed the taxonomic JSONL chunks and save them as a FAISS vector store.

    Args:
        jsonl_path: JSONL file to index. Defaults to ``JSONL_PATH``.
        db_path: Directory the FAISS index is written to. Defaults to
            ``DB_FAISS_PATH``.
        device: Device handed to the embedding model (e.g. ``"cuda"`` or
            ``"cpu"``). When omitted, ``"cuda"`` is used if PyTorch reports an
            available GPU and ``"cpu"`` otherwise.

    Raises:
        ValueError: If the JSONL file yields no documents to index.
    """
    jsonl_path = JSONL_PATH if jsonl_path is None else jsonl_path
    db_path = DB_FAISS_PATH if db_path is None else db_path

    if device is None:
        # Prefer the GPU when there is one, but keep the notebook runnable on
        # CPU-only machines instead of failing when the model is loaded.
        try:
            import torch

            device = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            device = "cpu"

    print("Loading JSONL chunks...")
    documents = load_jsonl_documents(jsonl_path)
    print("Total documents:", len(documents))

    # Fail early with a clear message rather than handing FAISS an empty list.
    if not documents:
        raise ValueError(f"No documents with text found in {jsonl_path!r}; nothing to index.")

    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},
    )

    print("Creating FAISS index...")
    db = FAISS.from_documents(documents, embeddings)

    db.save_local(db_path)
    print("FAISS vectorstore saved at:", db_path)
In [None]:
# Build the vector store only when this code is executed directly (as a script
# or in a notebook kernel, where __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    create_vector_db()