In [1]:
!pip install langchain langchain-community faiss-cpu pypdf requests fitz ollama sentence-transformers pymupdf

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,

In [11]:
import os
import subprocess
import fitz  # PyMuPDF
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import shutil

# Install required packages if needed
try:
    import sentence_transformers
except ImportError:
    print("Installing sentence_transformers...")
    subprocess.run(["pip", "install", "sentence-transformers"], check=True)

def extract_pdf_with_pymupdf(pdf_path):
    """
    Extract text and metadata from a PDF file using PyMuPDF (fitz)

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        list: List of Document objects
    """
    documents = []

    # Open the PDF
    pdf_document = fitz.open(pdf_path)

    print(f"PDF has {len(pdf_document)} pages")

    # Extract text from each page
    for page_num, page in enumerate(pdf_document):
        # Extract text
        text = page.get_text()

        # Skip empty pages
        if not text.strip():
            continue

        # Create metadata
        metadata = {
            "source": pdf_path,
            "page": page_num + 1,
            "total_pages": len(pdf_document)
        }

        # Try to extract any images and their captions (simplified)
        image_list = page.get_images(full=True)
        if image_list:
            metadata["has_images"] = True
            metadata["image_count"] = len(image_list)

        # Create a Document object
        doc = Document(page_content=text, metadata=metadata)
        documents.append(doc)

    # Close the PDF
    pdf_document.close()

    return documents

def embed_pdf_to_vector_db(pdf_path, embedding_model="nomic-ai/nomic-embed-text-v1", output_folder="pdf_vector_db"):
    """
    Convert a PDF file to vector embeddings and store in a FAISS database

    Args:
        pdf_path (str): Path to the PDF file
        embedding_model (str): HuggingFace model to use for embeddings
        output_folder (str): Folder to save the vector database

    Returns:
        str: Path to the created vector database folder
    """
    print(f"Loading PDF from {pdf_path}...")

    # Extract text using PyMuPDF
    documents = extract_pdf_with_pymupdf(pdf_path)

    print(f"Extracted text from {len(documents)} pages")

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100
    )
    chunks = text_splitter.split_documents(documents)

    print(f"Split into {len(chunks)} chunks for processing")

    # Create embeddings
    print(f"Creating embeddings using {embedding_model}...")
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    # Create vector database
    print("Building vector database...")
    vector_db = FAISS.from_documents(chunks, embeddings)

    # Save the vector database locally
    print(f"Saving vector database to {output_folder}...")
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    vector_db.save_local(output_folder)

    # Save embedding model information
    with open(os.path.join(output_folder, "embedding_info.txt"), "w") as f:
        f.write(f"embedding_model: {embedding_model}\n")

    # Save PDF metadata
    pdf_document = fitz.open(pdf_path)
    metadata = pdf_document.metadata
    if metadata:
        with open(os.path.join(output_folder, "pdf_metadata.txt"), "w") as f:
            for key, value in metadata.items():
                f.write(f"{key}: {value}\n")
    pdf_document.close()

    print(f"Vector database created successfully at {output_folder}")
    return output_folder

def share_vector_db(db_path, output_zip="vector_database.zip"):
    """
    Create a ZIP file of the vector database for sharing

    Args:
        db_path (str): Path to the vector database folder
        output_zip (str): Output ZIP file name

    Returns:
        str: Path to the created ZIP file
    """
    print(f"Creating ZIP archive of vector database...")
    shutil.make_archive(
        os.path.splitext(output_zip)[0],  # Remove .zip extension for make_archive
        'zip',
        db_path
    )
    print(f"ZIP archive created at {output_zip}")
    return output_zip

def download_from_colab(file_path):
    """
    Generate code to download a file from Google Colab

    Args:
        file_path (str): Path to the file to download
    """
    try:
        from google.colab import files
        files.download(file_path)
        print(f"Downloading {file_path} to your local machine...")
    except ImportError:
        print(f"File is ready at: {file_path}")
        print("You're not running in Google Colab, so automatic download isn't available.")
        print("Please use the file browser to download the ZIP file.")

def main():
    pdf_path = input("Enter the path to your PDF file: ")

    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found.")
        return

    # Install required packages
    try:
        import pymupdf
    except ImportError:
        print("Installing pymupdf...")
        subprocess.run(["pip", "install", "pymupdf"], check=True)

    try:
        import faiss
    except ImportError:
        print("Installing faiss-cpu...")
        subprocess.run(["pip", "install", "faiss-cpu"], check=True)

    # Ensure langchain packages are installed
    for package in ["langchain", "langchain-community"]:
        try:
            __import__(package.replace("-", "_"))
        except ImportError:
            print(f"Installing {package}...")
            subprocess.run(["pip", "install", package], check=True)

    # Available embedding models
    models = [
        "nomic-ai/nomic-embed-text-v1",  # Recommended default
        "sentence-transformers/all-mpnet-base-v2",  # Solid alternative
        "sentence-transformers/all-MiniLM-L6-v2"  # Faster, lighter option
    ]

    print("\nAvailable embedding models:")
    for i, model in enumerate(models, 1):
        if i == 1:
            print(f"{i}. {model} (recommended)")
        else:
            print(f"{i}. {model}")

    idx = input(f"Select model (1-{len(models)}) [default=1]: ").strip() or "1"

    if idx.isdigit() and 1 <= int(idx) <= len(models):
        embedding_model = models[int(idx) - 1]
    else:
        embedding_model = models[0]
        print(f"Invalid selection. Using default model: {embedding_model}")

    db_path = embed_pdf_to_vector_db(pdf_path, embedding_model)
    if not db_path:
        return

    zip_path = share_vector_db(db_path)

    # In Colab, offer automatic download
    download_from_colab(zip_path)

    print("\nInstructions for your friend:")
    print("1. Download and install Python if not already installed")
    print("2. Run: pip install langchain langchain-community faiss-cpu pymupdf sentence-transformers")
    print("3. Extract the ZIP file")
    print("4. Use the following code to load and query the vector database:")

    query_example = f"""
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load the vector database
embeddings = HuggingFaceEmbeddings(model_name="{embedding_model}")
vector_db = FAISS.load_local("extracted_folder_path", embeddings)

# Query the database
query = "What is this document about?"
docs = vector_db.similarity_search(query, k=3)

# Print results
for i, doc in enumerate(docs):
    print(f"Result {{i+1}}:\\n{{doc.page_content}}\\n")
    print(f"Source: {{doc.metadata.get('source')}}, Page: {{doc.metadata.get('page')}}")
"""
    print(query_example)

if __name__ == "__main__":
    main()

Enter the path to your PDF file: /content/Maternal and Child Health Nursing.pdf

Available embedding models:
1. nomic-ai/nomic-embed-text-v1 (recommended)
2. sentence-transformers/all-mpnet-base-v2
3. sentence-transformers/all-MiniLM-L6-v2
Select model (1-3) [default=1]: 3
Loading PDF from /content/Maternal and Child Health Nursing.pdf...
PDF has 1811 pages
Extracted text from 1808 pages
Split into 10307 chunks for processing
Creating embeddings using sentence-transformers/all-MiniLM-L6-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Building vector database...
Saving vector database to pdf_vector_db...
Vector database created successfully at pdf_vector_db
Creating ZIP archive of vector database...
ZIP archive created at vector_database.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading vector_database.zip to your local machine...

Instructions for your friend:
1. Download and install Python if not already installed
2. Run: pip install langchain langchain-community faiss-cpu pymupdf sentence-transformers
3. Extract the ZIP file
4. Use the following code to load and query the vector database:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load the vector database
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.load_local("extracted_folder_path", embeddings)

# Query the database
query = "What is this document about?"
docs = vector_db.similarity_search(query, k=3)

# Print results
for i, doc in enumerate(docs):
    print(f"Result {i+1}:\n{doc.page_content}\n")
    print(f"Source: {doc.metadata.get('source')}, Page: {doc.metadata.get('page')}")

