# Convert pdf to markdown

In [1]:
import os
from pymupdf4llm import to_markdown
import pymupdf
import json
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Load variables from a local .env file *before* reading XAI_API_KEY below;
# otherwise a key that is only defined in .env is looked up too early and
# os.getenv returns None.
load_dotenv()

# Chat model client pointed at the xAI endpoint; shared by every chain in this notebook.
llm_model = ChatOpenAI(model="grok-beta", base_url="https://api.x.ai/v1", api_key=os.getenv("XAI_API_KEY"))

True

In [2]:
from pathlib import Path


def get_first_page_image(pdf_path: str) -> bytes:
    """Render the first page of a PDF and return it as encoded image bytes.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The bytes produced by ``Pixmap.tobytes()`` for page index 0
        (PyMuPDF's default output format — presumably PNG; confirm against
        the installed PyMuPDF version).

    Raises:
        Whatever ``pymupdf.open`` raises for an unreadable file, and an
        indexing error if the document has no pages.
    """
    # The context manager closes the document even if rendering fails,
    # so the underlying file handle is not leaked.
    with pymupdf.open(pdf_path) as pdf_document:
        pixmap = pdf_document[0].get_pixmap()
        return pixmap.tobytes()


def convert_to_markdown(pdf_path: str, write_to_file: bool = False) -> str:
    """Convert a PDF file to Markdown text.

    Args:
        pdf_path: Path of the PDF to convert.
        write_to_file: When true, also write the Markdown next to the PDF,
            using the same file name with an ``.md`` extension.

    Returns:
        The Markdown text returned by ``to_markdown``.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing file or does not
            have a ``.pdf`` extension (compared case-insensitively).
    """
    source = Path(pdf_path)
    # is_file() rejects directories; lower() accepts ".PDF" as well as ".pdf".
    if not (source.is_file() and source.suffix.lower() == ".pdf"):
        raise FileNotFoundError(f"File {pdf_path} does not exist or is not a PDF file")

    md_text = to_markdown(pdf_path)
    if write_to_file:
        # Same directory and stem as the source, with the suffix swapped to .md.
        source.with_suffix(".md").write_text(md_text, encoding="utf-8")
    return md_text


In [3]:
# Optional: convert a whole PDF to Markdown text (left disabled here).
# md_text = convert_to_markdown("docs/restructured.pdf")
# Render page one of the sample PDF; later cells pass these bytes to extract_metadata.
first_page_image = get_first_page_image("docs/restraint.pdf")

In [None]:
from IPython.display import Image

# Preview the rendered first page inline, displayed with width=800.
Image(first_page_image, width=800)


In [23]:
import toml

# Read the prompt templates from the project's TOML file. The encoding is
# explicit so that non-ASCII characters in the prompts load the same way
# regardless of the platform's default locale encoding.
with open("prompts/prompts.toml", "r", encoding="utf-8") as f:
    prompts = toml.load(f)

# One system-prompt template per extraction task; each lives under
# [<task>] -> prompt in the TOML file.
essentials_prompt_template = prompts["extract_essentials"]["prompt"]
sections_prompt_template = prompts["extract_sections"]["prompt"]
section_details_prompt_template = prompts["extract_section_details"]["prompt"]
metadata_prompt_template = prompts["extract_metadata"]["prompt"]


In [24]:
def _system_user_prompt(system_template: str, user_template: str = "{text}") -> ChatPromptTemplate:
    """Build a two-message chat prompt: a system instruction followed by one user message."""
    return ChatPromptTemplate.from_messages([
        ("system", system_template),
        ("user", user_template),
    ])


# NOTE(review): the metadata prompt substitutes {first_page_image} into a plain
# text user message, while callers in this notebook pass raw image bytes —
# confirm the model actually receives the image in a form it can read.
metadata_prompt = _system_user_prompt(metadata_prompt_template, "{first_page_image}")
essentials_prompt = _system_user_prompt(essentials_prompt_template)
sections_prompt = _system_user_prompt(sections_prompt_template)
# NOTE(review): callers also supply "section_title" for this prompt; presumably
# the placeholder lives in the system template loaded from TOML — verify.
section_details_prompt = _system_user_prompt(section_details_prompt_template)


# Each chain: fill the prompt, call the shared chat model, return the reply as a plain string.
metadata_chain = metadata_prompt | llm_model | StrOutputParser()
essentials_chain = essentials_prompt | llm_model | StrOutputParser()
sections_chain = sections_prompt | llm_model | StrOutputParser()
section_details_chain = section_details_prompt | llm_model | StrOutputParser()

## Extract title, authors, and affiliation from the first page

In [None]:
def extract_metadata(first_page_image: bytes) -> dict:
    """Ask the metadata chain for paper metadata and parse its JSON reply.

    Args:
        first_page_image: Value substituted for ``first_page_image`` in the
            metadata prompt (callers in this notebook pass rendered page bytes).

    Returns:
        The parsed JSON reply. Other code in this notebook reads the keys
        ``title``, ``authors``, ``affiliation`` and ``abstract`` from it.

    Raises:
        json.JSONDecodeError: If no parsable JSON can be recovered from the reply.
    """
    raw_reply = metadata_chain.invoke({"first_page_image": first_page_image})
    # Models often wrap JSON in Markdown code fences; remove the fence markers.
    cleaned = raw_reply.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback for replies that surround the object with prose: retry on
        # the outermost {...} span. If there is none, re-raise the original error.
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(cleaned[start : end + 1])


# Run metadata extraction on the previously rendered first page and display the parsed result.
metadata = extract_metadata(first_page_image)
metadata
# Get summary

# Get summary

In [54]:
def quote_essentials(essentials: str) -> str:
    """Turn each blank-line-separated paragraph into a Markdown blockquote.

    Every chunk obtained by splitting on a double newline is prefixed with
    ``"> "`` and followed by a blank line; the chunks are then concatenated.
    """
    paragraphs = essentials.split("\n\n")
    return "".join(f"> {paragraph}\n\n" for paragraph in paragraphs)


def extract_essentials(text: str) -> str:
    """Run the essentials chain on ``text`` and return its reply as Markdown blockquotes."""
    return quote_essentials(essentials_chain.invoke({"text": text}))


def extract_section_titles(text: str) -> list[str]:
    """Ask the sections chain for section titles, one per line of its reply.

    Args:
        text: Document text passed to the chain as ``text``.

    Returns:
        The reply's non-empty lines with surrounding whitespace removed.
        Blank lines are skipped so they do not become empty titles, each of
        which would otherwise trigger its own request in
        ``extract_section_details``.
    """
    reply = sections_chain.invoke({"text": text})
    stripped_lines = (line.strip() for line in reply.split("\n"))
    return [title for title in stripped_lines if title]


def extract_section_details(text, section_titles):
    """Produce one Markdown section per title by querying the section-details chain.

    For each title, in order, the chain is invoked with the full ``text`` and
    that ``section_title``; its reply is placed under a ``## <title>`` heading.
    Returns the list of rendered sections.
    """

    def _render_section(title):
        # One chain call per title; the reply becomes the body under the heading.
        body = section_details_chain.invoke({"text": text, "section_title": title})
        return f"## {title}\n\n{body}"

    return [_render_section(title) for title in section_titles]


In [55]:
def get_overview(pdf_path: str, write_to_file: bool = True) -> str:
    """Build a Markdown overview of a paper from its PDF.

    The overview contains, in order: title, authors, affiliation, the quoted
    essentials, the abstract, and one section per extracted section title.

    Args:
        pdf_path: Path of the PDF to summarise.
        write_to_file: When true, also write the overview next to the PDF,
            using the same file name with an ``.md`` extension.

    Returns:
        The assembled Markdown text.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing file or does not
            have a ``.pdf`` extension (compared case-insensitively).
        KeyError: If the extracted metadata lacks ``title``, ``authors``,
            ``affiliation`` or ``abstract``.
    """
    source = Path(pdf_path)
    if not (source.is_file() and source.suffix.lower() == ".pdf"):
        raise FileNotFoundError(f"File {pdf_path} does not exist or is not a PDF file")

    md_text = convert_to_markdown(pdf_path)
    # Render the first page of *this* PDF. Reading a notebook-level
    # `first_page_image` variable here would describe whichever file was
    # rendered last, not the one being summarised.
    metadata = extract_metadata(get_first_page_image(pdf_path))
    essentials = extract_essentials(md_text)
    section_titles = extract_section_titles(md_text)
    section_details = extract_section_details(md_text, section_titles)

    all_output = "".join([
        f"# {metadata['title']}\n\n",
        f"### {', '.join(metadata['authors'])}\n\n",
        f"#### {metadata['affiliation']}\n\n",
        f"{essentials}\n\n",
        "## Abstract\n\n",
        f"{metadata['abstract']}\n\n",
        "\n\n".join(section_details),
    ])

    if write_to_file:
        # Same directory and stem as the source, with the suffix swapped to .md.
        source.with_suffix(".md").write_text(all_output, encoding="utf-8")

    return all_output


# Build the overview for a sample paper; write_to_file=True also saves it as an .md file beside the PDF.
get_overview("docs/coercive.pdf", write_to_file=True)

In [None]:
# Debug aid: inspect the runtime type of the composed metadata chain.
type(metadata_chain)

In [4]:
# NOTE(review): this import rebinds `extract_metadata`, shadowing the function of the
# same name defined earlier in this notebook; any cell run afterwards uses this version.
from utils.process_image import extract_metadata

answer = extract_metadata(first_page_image)

Done!      |

In [5]:
answer


{'title': 'Exploring Psychiatric Patient Restraints: Balancing Safety, Ethics, and Patient Rights in Mental Healthcare',
 'authors': ['Fayaz Ahmed Paul',
  'Asim Ur Rehman Ganie',
  'Danishwar Rasool Dar',
  'Priyanka Saikia',
  'Indrajit Banerjee'],
 'affiliation': 'Department of Psychiatry: Social Work, LGB Regional Institute of Mental Health, Tezpur, Assam, India',
 'abstract': 'This paper examines the use of restraints in psychiatric care, balancing safety and ethical considerations with patient rights.'}

In [7]:
# Synthetic sample: filler body text, then a references-style heading and numbered items.
# NOTE(review): the heading is spelled "REFERECes", which lowercases to "refereces" and
# therefore does not contain the substring "references".
md_text = "abkjljlkjlkjlkjlkklklklkjlkjjk \n\n REFERECes \n\n 1. 2. 3. 4. 5. 6. 7. 8. 9. 10."

In [10]:
# Drop everything from the first case-insensitive "references" match onward.
# str.find returns -1 when there is no match, and slicing with [:-1] would
# silently chop the final character, so only truncate when a match exists.
references_start = md_text.lower().find("references")
if references_start != -1:
    md_text = md_text[:references_start]


In [11]:
md_text

'abkjljlkjlkjlkjlkklklklkjlkjjk \n\n'