# Documentation helper

There are three main packages or domains in this solution: 

* Rules
* Dita/XML
* LLM

Linguistic rule tagging is proposed as the way to verify the validity of the generated manual.

In [3]:
# Install the notebook's third-party dependencies into the kernel's environment
# (-q: quiet, -U: upgrade to the latest available versions).
%pip install -qU "langchain[aws]" python-dotenv lxml atlassian-python-api

from dotenv import load_dotenv
# Load settings from a local .env file into the process environment; the
# BitBucket connector cell reads its credentials from there via os.getenv().
load_dotenv()

Note: you may need to restart the kernel to use updated packages.


True

In [6]:
import boto3
from botocore.config import Config

# Configure the Bedrock client with a read timeout of 3600 seconds so that long
# generations on large documents are not cut off by the client.
config = Config(read_timeout=3600, region_name="us-east-1")

bedrock_client = boto3.client("bedrock-runtime", config=config)

# The rule set and the forbidden-word list are kept as raw JSON text because
# they are interpolated as-is into the prompt. JSON is UTF-8, so decode it
# explicitly instead of relying on the platform's default encoding.
with open("json/rules.json", encoding="utf-8") as file:
    rules = file.read()

with open("json/bad_words.json", encoding="utf-8") as file:
    bad_words = file.read()

def base_prompt(job, rules, bad_words, instructions):
    """
    Build the shared prompt body used by the DITA helpers.

    Args:
        job: Short description of the task, inserted after "Your job is to".
        rules: The rules text to embed under "Rules:".
        bad_words: The forbidden-words text to embed under "Forbidden words:".
        instructions: Task-specific instructions embedded under "Instructions:".

    Returns:
        str: The assembled prompt, terminated by a newline.
    """
    prompt_lines = (
        f"Your job is to {job} based on the following",
        "Rules:",
        f"{rules}",
        "",
        "Forbidden words:",
        f"{bad_words}",
        "",
        "Instructions:",
        f"{instructions}",
        "",  # yields the trailing newline
    )
    return "\n".join(prompt_lines)

In [None]:
from botocore.exceptions import ClientError
import json
from enum import Enum


class DocumentType(str, Enum):
    """
    Document formats that can be attached to a model request.

    Each member is also a plain ``str``; ``generate_dita`` places the member in
    the ``format`` field of the document block it sends to Bedrock.
    """

    # Markup
    HTML = "html"
    MARKDOWN = "md"
    # Page-oriented and word-processor documents
    PDF = "pdf"
    DOCX = "docx"
    DOC = "doc"
    # Spreadsheets and tabular data
    XLSX = "xlsx"
    XLS = "xls"
    CSV = "csv"
    # Plain text
    TXT = "txt"


async def generate_dita(document: bytes, file_name: str, rules: str, bad_words: str, model_id: str, document_type: DocumentType = DocumentType.TXT):
    """
    Ask a Bedrock model to convert a source document into rule-compliant DITA.

    The Bedrock call itself is synchronous; the function is declared ``async``
    so that notebook cells can ``await`` it.

    Args:
        document (bytes): Raw content of the source document.
        file_name (str): Name given to the attached document in the request.
        rules (str): The rules text embedded in the system prompt.
        bad_words (str): The forbidden-words text embedded in the system prompt.
        model_id (str): Model ID or inference-profile ARN to invoke.
        document_type (DocumentType): Format of ``document``; defaults to TXT.

    Returns:
        dict: The raw response returned by ``bedrock_client.converse``.
    """
    instructions = """
    - Check the document for compliance with the provided rules.
    - Identify any violations of the rules.
    - Suggest corrections for the identified violations.
    - Ensure the output is in DITA format.

    Answer only in DITA format, without any additional text or explanations. If needed, do multiple files to cover all content.
    """

    system_role = f"You are a technical editor that generates dita files based on given rules.\n\n{base_prompt(job='generate DITA file', rules=rules, bad_words=bad_words, instructions=instructions)}"

    # The source document travels as an attachment next to the text prompt.
    document_reference = {
        "format": document_type,
        "name": file_name,
        "source": {
            "bytes": document,
        }
    }

    user_prompt = "Read the document and generate the DITA file with the necessary corrections based on the provided rules and forbidden words."

    # Sampling settings. These were previously built but never sent, and used
    # keys the Converse API does not accept ("system", "max_tokens"); the system
    # prompt goes in the dedicated `system` argument and the token limit is
    # spelled `maxTokens`.
    inference_config = {
        "temperature": 0.2,
        "maxTokens": 5000,
    }

    return bedrock_client.converse(
        modelId=model_id,
        system=[{"text": system_role}],
        messages=[
            {
                "role": "user",
                "content": [
                    {"text": user_prompt},
                    {"document": document_reference},
                ],
            }
        ],
        inferenceConfig=inference_config,
    )

    # except (ClientError, Exception) as e:
    # print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")

with open("txt/PMD_PRISM.IO_Digital_IO_Management.docx", "rb") as file:
    manual = file.read()

print("Generating response")
resp = await generate_corrected_dita(manual, "IO Digital IO Management", rules, bad_words, "arn:aws:bedrock:us-east-1:473326111529:inference-profile/us.anthropic.claude-sonnet-4-20250514-v1:0", document_type=DocumentType.DOCX)

usage = resp["usage"]

Generating response


In [7]:
async def correct_dita(document: str, file_name: str, rules: str, bad_words: str, model_id: str):
    """
    Ask a Bedrock model to correct an existing DITA document against the rules.

    The Bedrock call itself is synchronous; the function is declared ``async``
    so that notebook cells can ``await`` it.

    Args:
        document (str): The DITA content to be corrected, as text.
        file_name (str): The name of the file being processed. Not used by the
            current implementation.
        rules (str): The rules to check against.
        bad_words (str): The forbidden words to avoid.
        model_id (str): The model ID or inference-profile ARN to invoke.

    Returns:
        dict: The response from the Bedrock client.
    """
    instructions = """
    - Modify the text only when necessary.
    - Mark all corrections using the following syntax:
        - ~~Original~~ → **Corrected** (Reason)
    - Do not add new sections unless explicitly required."""

    # The job phrase completes "Your job is to ..." in base_prompt, so it must
    # be an infinitive ("correct"); it previously read "corrects".
    system_role = f"You are a technical editor that verifies dita files based on given rules.\n\n{base_prompt(job='correct DITA file', rules=rules, bad_words=bad_words, instructions=instructions)}"
    user_prompt = "Read the document and correct it based on the provided rules and forbidden words."

    # Sampling settings. These were previously built but never sent, and used
    # keys the Converse API does not accept ("system", "max_tokens"); the system
    # prompt goes in the dedicated `system` argument and the token limit is
    # spelled `maxTokens`.
    inference_config = {
        "temperature": 0.2,
        "maxTokens": 5000,
    }
    return bedrock_client.converse(
        modelId=model_id,
        system=[{"text": system_role}],
        messages=[
            {
                "role": "user",
                "content": [
                    {"text": user_prompt},
                    {"text": document},
                ],
            }
        ],
        inferenceConfig=inference_config,
    )

    
with open("dita/parameters.dita") as file:
    dita = file.read()

print("Generating response")
resp = await correct_dita(dita, "parameters.dita", rules, bad_words, "arn:aws:bedrock:us-east-1:473326111529:inference-profile/us.anthropic.claude-sonnet-4-20250514-v1:0")

usage = resp["usage"]

Generating response


In [8]:
from pathlib import Path

print("Response generated")
output = resp["output"]
# Only the first content block of the model's message is saved.
response_text = output["message"]["content"][0]["text"]
# The correction markup requested in the prompt contains non-ASCII characters
# (e.g. "→"), which some platform-default encodings cannot represent; always
# write the file as UTF-8.
Path("txt/corrected_dita.dita").write_text(response_text, encoding="utf-8")

usage

Response generated


{'inputTokens': 6815,
 'outputTokens': 998,
 'totalTokens': 7813,
 'cacheReadInputTokens': 0,
 'cacheWriteInputTokens': 0}

## BitBucket connector

This connector is needed to fetch the DITA files and to publish changes through pull requests.

In [4]:
from atlassian import Bitbucket
import os

# Fail fast with an actionable message when a credential is missing;
# os.getenv() would otherwise pass None to the client silently.
_BITBUCKET_ENV_VARS = ("BITBUCKET_URL", "BITBUCKET_USERNAME", "BITBUCKET_PASSWORD")
_missing_env_vars = [name for name in _BITBUCKET_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise EnvironmentError(
        f"Missing required environment variables: {', '.join(_missing_env_vars)}"
    )

# Bitbucket client built from the credentials found in the environment.
connector = Bitbucket(
    url=os.environ["BITBUCKET_URL"],
    username=os.environ["BITBUCKET_USERNAME"],
    password=os.environ["BITBUCKET_PASSWORD"],
)

## XML/DITA Parser

This parser is responsible for extracting and validating the DITA syntax, although the validation step may already have been performed by the technical writer if they are using Oxygen.

In [None]:
from lxml import etree

def prettyprint(element, **kwargs):
    """
    Print an lxml element as XML, pretty-printed by default.

    Args:
        element: The lxml element (or tree) to serialize.
        **kwargs: Extra keyword arguments forwarded to ``etree.tostring``,
            for example ``encoding`` or ``xml_declaration``. Passing
            ``pretty_print=False`` disables the indentation.
    """
    # Keep pretty-printing as the default while letting callers override it
    # without triggering a duplicate-keyword TypeError.
    kwargs.setdefault("pretty_print", True)
    xml = etree.tostring(element, **kwargs)
    if isinstance(xml, bytes):
        # Decode with the encoding used for serialization. When none was
        # requested the output is ASCII, which UTF-8 decodes unchanged.
        xml = xml.decode(kwargs.get("encoding") or "utf-8")
    # tostring() already returns str when a unicode encoding is requested, so
    # nothing needs decoding in that case.
    print(xml, end='')



In [None]:
# Load and parse the DITA file
tree = etree.parse("./dita/example.dita")
root = tree.getroot()

# Walk the direct children of the root element, printing each child's tag
# followed by its pretty-printed XML. xml_declaration=True is passed for every
# child, so each printed fragment starts with its own XML declaration.
for topic in root:
    print(topic.tag)
    prettyprint(topic, encoding='UTF-8', xml_declaration=True)


title []
<?xml version='1.0' encoding='UTF-8'?>
<title>Specifying the IP parameters using the serial connection</title>
    
abstract []
<?xml version='1.0' encoding='UTF-8'?>
<abstract>
        <shortdesc>The procedure  describes how to set up the IP parameters for the device using a serial connection to establish network connectivity for the device.</shortdesc>
        <p>The procedure involves setting up the IP address and subnet mask for the device using a serial connection. This procedure is generally useful when the device has no IP address assigned or is in factory default settings.</p>
    </abstract>
    
taskbody []
<?xml version='1.0' encoding='UTF-8'?>
<taskbody>
        <prereq id="prereq_smf_b4f_sfc">
            <ul id="ul_om5_nkj_wfc">
                <li>A Linux computer with the following packages installed:<ul id="ul_gjn_qkj_wfc">
                        <li>Superuser access (sudo)</li>
                        <li>Minicom</li>
                        <li>yangcli-pro<