# Manual generator with LLM

This is a proof of concept (POC) for the manual and documentation division. Please be aware of the following strategy:

1. Get the document using the Confluence connector
2. Process the document with PyMuPDF4LLM
3. Tag and divide the document, then export a PDF file with the processed data for reference.
4. Create relevant rules to follow (due to Bedrock limitations, a full LLM cannot be used; only a prompt-based approach is possible)

In [70]:
from requests.auth import HTTPBasicAuth
from requests import get
from os import environ
from typing import Optional
from dotenv import load_dotenv

# Outside of production, settings are read from a local .env file; in
# production ("environ" set to "prod", case-insensitive) the file is skipped.
env = environ.get("environ")
if not (env is not None and env.lower() == "prod"):
    load_dotenv()

class ConfluenceClient():
    '''Serves as a client for some key confluence functions.

    Requests are sent with HTTP basic auth. API calls go to
    ``<wiki_url>/api/v2<endpoint>``; attachment downloads go to
    ``<wiki_url><download_url>``.

    The public methods are coroutines, but they perform blocking
    ``requests`` calls, so they do not yield to the event loop while
    waiting on the network.

    :param wiki_url: str, the wiki url
    :param username: str, optional; when omitted the ``CONFLUENCE_USER``
        environment variable is used
    :param token: str, optional; when omitted the ``CONFLUENCE_TOKEN``
        environment variable is used
    :raises ValueError: when a credential is neither passed nor present in
        the environment
    '''
    _headers = {
      "Accept": "application/json"
    }
    # Seconds to wait on the server before giving up; without a timeout
    # ``requests`` can block indefinitely.
    _timeout = 30

    def __init__(self,
                 wiki_url: str,
                 username: Optional[str] = None,
                 token: Optional[str] = None):
        # Credentials are looked up when the client is created (not when the
        # class is defined), so explicit arguments work even if the
        # environment variables are missing.
        if username is None:
            username = environ.get("CONFLUENCE_USER")
        if token is None:
            token = environ.get("CONFLUENCE_TOKEN")
        if username is None or token is None:
            raise ValueError(
                "Confluence credentials missing: pass username/token or set "
                "CONFLUENCE_USER and CONFLUENCE_TOKEN"
            )

        # Drop trailing slashes so joined URLs never contain "//".
        self._url = wiki_url.rstrip("/")
        self._auth = HTTPBasicAuth(username, token)

    def _build_endpoint(self, endpoint: str) -> str:
        '''Return the absolute v2 API URL for ``endpoint`` (e.g. ``/spaces``).'''
        return f"{self._url}/api/v2{endpoint}"

    def _request(self, url: str, params: Optional[dict] = None):
        '''Send an authenticated GET and return the response.

        :param url: str, absolute URL to fetch
        :param params: dict, optional query parameters
        :raises requests.HTTPError: when the server answers with a 4xx/5xx
            status, so error bodies are never mistaken for real data
        '''
        response = get(
            url,
            params=params,
            headers=self._headers,
            auth=self._auth,
            timeout=self._timeout
        )
        response.raise_for_status()
        return response

    async def get_spaces(self):
        '''
        Get all spaces from the confluence wiki

        :return: the decoded JSON body of the ``/spaces`` response
        '''
        return self._request(self._build_endpoint("/spaces")).json()

    async def get_page_by_id(self,
                            id: str):
        '''
        Get a single page.

        :param id: str, the page id
        :return: the decoded JSON body of the ``/pages/{id}`` response
        '''
        return self._request(self._build_endpoint(f"/pages/{id}")).json()

    async def get_attachments_for_page(self, page_id: str, media_type: Optional[str] = None):
        '''
        Get the attachments of a page.

        :param page_id: str, the page id
        :param media_type: str, optional; sent as the ``mediaType`` query
            parameter when given
        :return: the decoded JSON body of the
            ``/pages/{page_id}/attachments`` response
        '''
        # Only send the filter when the caller asked for one.
        params = None if media_type is None else {"mediaType": media_type}
        endpoint = self._build_endpoint(f"/pages/{page_id}/attachments")
        return self._request(endpoint, params=params).json()

    async def get_attachment_content(self, download_url: str):
        '''
        Download the raw content of an attachment.

        :param download_url: str, path appended to the wiki url
        :return: bytes, the response body
        '''
        return self._request(self._url + download_url).content

In [75]:
from pymupdf4llm import to_markdown
from pymupdf import Document

async def convert(doc, pages):
    '''Convert a document to Markdown with ``pymupdf4llm.to_markdown``.

    :param doc: the document, forwarded unchanged as the first argument
    :param pages: forwarded unchanged as the ``pages`` keyword argument
    :return: whatever ``to_markdown`` returns for these arguments
    '''
    markdown = to_markdown(doc, pages=pages)
    return markdown

In [79]:
from pathlib import Path

# Tagging/Indexing phase
WIKI_URL = r"https://gpd-dom.atlassian.net/wiki"
NAME_SPECS = ["language", "words"]

client = ConfluenceClient(WIKI_URL)

attachments = (await client.get_attachments_for_page("114906206", "application/pdf")).get("results")
urls = ((att["title"], att["downloadLink"]) for att in attachments if any(keyword in att["title"].lower() for keyword in NAME_SPECS))

async def download_files(urls):
    '''Download every attachment in ``urls`` to ``pdf/<title>``.

    :param urls: iterable of ``(title, download_url)`` pairs
    '''
    for title, url in urls:
        content = await client.get_attachment_content(url)
        target = Path(f"pdf/{title}")
        # write_bytes does not create directories, so make sure the
        # destination folder exists first.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(content)

async def save_md(path, title, pages):
    '''Convert a document to Markdown and store it as ``md/<title>.md``.

    :param path: the document to convert, forwarded to ``convert``
    :param title: str, used to build the output file name
    :param pages: forwarded to ``convert``
    '''
    md = await convert(path, pages)
    target = Path(f"md/{title}.md")
    # write_bytes does not create directories, so make sure the
    # destination folder exists first.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(md.encode("utf-8"))

for title, url in urls:
    await save_md(f"pdf/{title}", title, None)