In [1]:
import os, textwrap
import io

from typing import List, Dict, Optional
from xml.sax.saxutils import escape

from gpts.gpt_assistants import general_assistant
from dotenv import load_dotenv, find_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery
from azure.core.exceptions import HttpResponseError
from azure.search.documents.models import HybridSearch


from openai import AzureOpenAI, APIConnectionError, OpenAI
from prompts import new_system_finance_prompt

from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

from prompts4 import finance_calculations, finance_pairs, capital_pairs, stakeholders_pairs, biz_overview_pairs, revenue_pairs, default_gpt_prompt, section4a, section4b, section5, section3, biz_overview_web, stakeholders_web
from pages.design.func_tools import *
from pages.design.formatting import *
from pages.design.func_tools import docx_bytes_to_pdf_bytes
import re, time
 
load_dotenv(find_dotenv(), override=True)

# ---- Config (expects the same envs you already used) ----
SEARCH_ENDPOINT = os.environ["AZURE_SEARCH_ENDPOINT"]
SEARCH_INDEX    = os.environ["AZURE_SEARCH_INDEX"]
SEARCH_KEY      = os.getenv("AZURE_SEARCH_API_KEY")  # omit if using AAD/RBAC
VECTOR_FIELD    = os.getenv("VECTOR_FIELD")
TEXT_FIELD      = os.getenv("TEXT_FIELD")

AOAI_ENDPOINT   = os.environ["AZURE_OPENAI_ENDPOINT"]            # https://<resource>.openai.azure.com
AOAI_API_VER    = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-10-21")
AOAI_DEPLOYMENT = os.environ["AZURE_OPENAI_DEPLOYMENT"]          # e.g., gpt-4o-mini / o3-mini / gpt-5 preview
AOAI_KEY        = os.getenv("AZURE_OPENAI_API_KEY")              # omit if using AAD
OPENAI_API_KEY  = os.getenv("OPENAI_API_KEY")        # required

# ------------------ CODE

class profileAgent():

    """Hybrid (dense+sparse) RAG over Vector Store

    This Agent is responsible for creating Company Profiles. 
    It operates with gpt5.
    It is activated by a call on main rag when it is typed 'Create company profile'
    """

    def __init__(self, company_name, k, max_text_recall_size, max_chars, model, profile_prompt = new_system_finance_prompt, finance_calculations = finance_calculations):
        
        self.company_name = company_name

        self.k = k
        self.max_text_recall_size = max_text_recall_size
        self.model = model
        self.max_chars = max_chars

        self.azure_credentials = AzureKeyCredential(SEARCH_KEY) if SEARCH_KEY else DefaultAzureCredential()
        self.search_client = SearchClient(SEARCH_ENDPOINT, SEARCH_INDEX, credential=self.azure_credentials)

        self.az_openai = AzureOpenAI(azure_endpoint=AOAI_ENDPOINT, api_key=AOAI_KEY, api_version=AOAI_API_VER)
        self.profile_prompt = profile_prompt
        self.web_openai = OpenAI(api_key=OPENAI_API_KEY)

        self.reasoning_effort = "medium"
        self.verbosity = "medium"

        self.finance_calculations = finance_calculations

    def _company_filter(self) -> str:
        v = (self.company_name or "").replace("'", "''").strip()
        return f"company_name eq '{v}'" if v else None
    
    def assemble_bm25_from_llm(self, slots: dict) -> str:
        def q(s: str) -> str:
            # sanitize: remove internal quotes and trim
            s = (s or "").strip().replace('"', ' ')
            return f"\"{s}\"" if s else ""
        groups = []

        # must-have phrases (ANDed)
        for p in slots.get("must_have_phrases", []):
            qp = q(p)
            if qp:
                groups.append(qp)

        # metric / statement synonym groups (ORed within each group)
        for key in ["metric", "statement"]:
            syns = slots.get("synonyms", {}).get(key, []) or slots.get(key, [])
            syns = [q(s) for s in syns if s]
            if syns:
                groups.append("(" + " OR ".join(syns) + ")")

        return " AND ".join(groups) if groups else "\"financial statements\""


    def bm25_creator(self, prompt):

        instruction = (
            "Extract finance search slots for Azure AI Search. "
            "Return strict JSON: {\"metric\":[], \"statement\":[], \"synonyms\":{}, \"must_have_phrases\":[]} "
            "(include IFRS/US GAAP variants)."
        )
        resp = general_assistant(instruction, prompt, OPENAI_API_KEY, 'gpt-4o')

        try:
            slots = getattr(resp, "output_json", None)
            if slots is None:
                import json
                slots = json.loads(resp.output_text)
        except Exception:
            # fallback: minimal anchors from prompt
            slots = {"must_have_phrases": [prompt], "metric": [], "statement": [], "synonyms": {}}
        return self.assemble_bm25_from_llm(slots)

    def _retrieve_hybrid_enhanced(self, query_nl, k: int = 50, top_n = 30, fields=VECTOR_FIELD, max_text_recall_size:int = 800):
        sc = self.search_client
        flt = self._company_filter()
        
        try:
            vq = VectorizableTextQuery(text=query_nl, k=k, fields=VECTOR_FIELD)
            # Prefer vector-only search (integrated vectorization). If your index isn't set up for it, this raises.
            results = sc.search(
                search_text=self.bm25_creator(query_nl), 
                vector_queries=[vq], 
                top=top_n, 
                query_type="semantic",
                query_caption="extractive", 
                hybrid_search=HybridSearch(max_text_recall_size=self.max_text_recall_size),
                query_caption_highlight_enabled=True,
                filter=flt
                )
            mode = "hybrid + semantic"
        except HttpResponseError as e:
            # Fall back to lexical so you still get results while fixing vector config
            results = sc.search(search_text=self.bm25_creator(query_nl), top=k)
            mode = f"lexical (fallback due to: {e.__class__.__name__})"

        hits: List[Dict] = []
        for r in results:
            d = r.copy() if hasattr(r, "copy") else {k2: r[k2] for k2 in r}
            d["score"] = d.get("@search.reranker_score") or d.get("@search.score") or 0.0
            caps = d.get("@search.captions")
            if isinstance(caps, list) and caps:
                d["caption"] = getattr(caps[0], "text", None)
            hits.append(d)

        return mode, hits


    def _build_context(self, hits: List[Dict], text_field: str = TEXT_FIELD, max_chars: int = 20000):
        """Build a compact, numbered context block and also return the selected chunk metadata."""
        lines = []
        total = 0
        selected = []  # <- we'll return this

        for i, h in enumerate(hits, 1):
            title     = h.get("title")
            chunk_id  = h.get("chunk_id")
            full_text = (h.get(text_field) or "")
            if not full_text:
                continue

            preview = textwrap.shorten(full_text, width=700, placeholder=" ...")
            block = f"[{i}] title={title!r} | chunk_id={chunk_id} | score={h.get('score'):.4f}\n{full_text}"

            if total + len(block) > self.max_chars:
                break

            total += len(block)
            lines.append(block)

            # keep rich metadata so you can show or log it later
            selected.append({
                "i": i,
                "title": title,
                "chunk_id": chunk_id,
                "score": h.get("score"),
                "caption": h.get("caption"),
                "preview": preview,
                "text": full_text,  # full chunk text (not shortened)
                # include any other fields you index, if available:
                "metadata_storage_path": h.get("metadata_storage_path"),
                "page_number": h.get("page_number"),
                "doc_type": h.get("doc_type"),
            })

        return "\n\n---\n\n".join(lines), selected

        
    def _generate_pdf(self, text: str) -> bytes:

        buf = io.BytesIO()
        doc = SimpleDocTemplate(buf, pagesize=letter)
        styles = getSampleStyleSheet()
        body = styles["BodyText"]

        story = []
        # Treat double newlines as paragraph breaks; keep single newlines as <br/>
        for para in (text or "").split("\n\n"):
            safe = escape(para).replace("\n", "<br/>")
            story.append(Paragraph(safe if safe.strip() else "&nbsp;", body))
            story.append(Spacer(1, 8))

        doc.build(story)
        buf.seek(0)
        return buf.getvalue()
    
    def _extract_cited_idxs(self, answer: str) -> list[int]:
        # Matches [#1], [#12], etc. (also tolerates stray [1])
        nums = set(int(n) for n in re.findall(r"\[#?(\d+)\]", answer))
        return sorted(nums)

    def _rag_answer(self, rag_nl, question, k: int = 5, temperature: float = 0.2):

        # question = f'CREATE A SECTION OF COMPANY PROFILE USING LAST YEARS OF ANNUAL REPORT PRESENT IN THE CONTEXT FOR {self.company_name}. IF ANY INFORMATION IS NOT FOUND STATE AS n.a. .\n\n THIS IS THE SECTION TO BE BUILT: \n {section7}  \n USE THIS TO GUIDE YOURSELF ON SEMANTIC TERMS AND HOW TO CALCULATE: \n {finance_calculations}'
        
        mode, hits = self._retrieve_hybrid_enhanced(
            # query=rag_q, 
            query_nl=rag_nl,
            k=25
            )
        ctx_text, ctx_items = self._build_context(hits)

        system_msg = self.profile_prompt + (
            "\nWhen you use a fact from the context, add citations like [#1], [#2]."
            "\nOnly rely on the numbered context; if a value is missing, say 'n.a.'."
            f"\nIF ANY INFORMATION IS NOT FOUND STATE AS n.a. .\n\n USE THIS TO GUIDE YOURSELF ON SEMANTIC TERMS AND HOW TO CALCULATE: \n {finance_calculations}"
        )
        user_msg = f"Question:\n{question}\n\nContext snippets (numbered):\n{ctx_text}"

        client = self.az_openai
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user",   "content": user_msg},
        ]

        # Try streaming first (SSE). Some networks/proxies block streaming; if so, fall back.
        
        resp = client.chat.completions.create(
            model=AOAI_DEPLOYMENT,
            messages=messages,
            reasoning_effort="high"
        )
        answer = resp.choices[0].message.content
        mode_model = "non-streaming (fallback)"

        cited = self._extract_cited_idxs(answer)
        used_chunks = [c for c in ctx_items if c["i"] in cited]

        # return self._generate_pdf(answer)
        return {
            "answer": answer,
            "citations": cited,          # [1, 3, 7]
            "used_chunks": used_chunks,  # detailed dicts for each cited snippet
            "all_chunks": ctx_items,     # everything you sent (optional)
            "mode": mode                 # retrieval mode info (optional)
        }

    def _web_search(self, messages):
        resp = self.web_openai.responses.create(
            model='gpt-5',
            input=messages,
            tools=[{"type": "web_search"}],
            tool_choice="auto",
            # max_output_tokens=self.max_output_tokens,
            reasoning={"effort": self.reasoning_effort},
            text={"verbosity": self.verbosity},
        )
        
        return resp.output_text
    
    def _answer(self, question, ctx_text, k: int = 5, temperature: float = 0.2):

        system_msg = self.profile_prompt + (
            "\nWhen you use a fact from the context, preserve any existing citations like [#1], [#2], [#5, p.41] that are already in the context text."
            "\nOnly rely on the provided context; if a value is missing, say 'n.a.'."
            "\nIMPORTANT: If the formatting instructions request a Sources section, you MUST include it at the end."
            "\nFor the Sources section, list all citation numbers/references that appear in your answer, and describe what document/source each refers to based on information in the context."
        )
        user_msg = f"Question:\n{question}\n\nContext snippets:\n{ctx_text}"

        client = self.az_openai
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user",   "content": user_msg},
        ]

        # Try streaming first (SSE). Some networks/proxies block streaming; if so, fall back.

        resp = client.chat.completions.create(
            model=AOAI_DEPLOYMENT,
            messages=messages,
            reasoning_effort="high"
        )
        answer = resp.choices[0].message.content

        cited = self._extract_cited_idxs(answer)

        # return self._generate_pdf(answer)
        return {
            "answer": answer,
            "citations": cited,          # [1, 3, 7]
        }   
    
    @staticmethod
    def has_na(text: str) -> bool:
        # match "n.a." or "n/a" (case-insensitive)
        return bool(re.search(r"\b(n\.a\.|n/a)\b", text, flags=re.I))

    def _sections(self, pairs):

        answers = []

        max_extra_na_retries = 1        # try again at most 2 times (total <= 3 calls per item)
        base_delay_seconds = 3.0        # polite delay between attempts


        for q, r in pairs:
            tries = 0
            while True:
                if tries > 0:
                    # small incremental delay before re-trying
                    time.sleep(base_delay_seconds + 0.5 * tries)

                resp = self._rag_answer(rag_nl=r[0], question=q[0])
                answer_text = resp["answer"]

                # stop if good answer OR we've exhausted retries
                if not profileAgent.has_na(answer_text) or tries >= max_extra_na_retries:
                    answers.append(answer_text)
                    break

                # otherwise, try again
                tries += 1

            # optional small gap between different (r,q) items
            time.sleep(5.0)
        
        return answers
    
    def _generate_section(self, section):

        if section == 'GENERATE BUSINESS OVERVIEW':
            # =========== GENERATE BUSINESS OVERVIEW
            biz_overview_pairs_flat = list(zip(biz_overview_pairs[1], biz_overview_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs = biz_overview_pairs_flat)

            #getting web search sections
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{biz_overview_web} \n\n Mention in the Beggining of the answer that this is WEBSEARCH SOURCE'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp_web = self._web_search(messages)

            section_built.append(resp_web)

            # Join all context sections - they already contain their own citations
            # Just concatenate them so the model can synthesize
            ctx_text_formatted = "\n\n".join(section_built)

            resp = self._answer(question=biz_overview_mix_formatting, ctx_text=ctx_text_formatted)
            return resp['answer']
        elif section == 'GENERATE KEY STAKEHOLDERS':
        # =========== GENERATE KEY STAKEHOLDERS
            stakeholders_pairs_flat = list(zip(stakeholders_pairs[1], stakeholders_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs= stakeholders_pairs_flat)

            #getting web search sections
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{stakeholders_web} \n\n Mention in the Beggining of the answer that this is WEBSEARCH SOURCE'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp_web = self._web_search(messages)

            section_built.append(resp_web)

            # Join all context sections - they already contain their own citations
            # Just concatenate them so the model can synthesize
            ctx_text_formatted = "\n\n".join(section_built)

            resp = self._answer(question=stakeholders_web_mix, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE FINANCIAL HIGHLIGHTS':
            # =========== GENERATE FINANCIAL HIGHLIGHTS
            finance_pairs_flat = list(zip(finance_pairs[1], finance_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs=finance_pairs_flat)
            resp = self._answer(question=finance_formatting_2, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE CAPITAL STRUCTURE':
            # =========== GENERATE CAPITAL STRUCTURE
            capital_pairs_flat = list(zip(capital_pairs[1], capital_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs= capital_pairs_flat)
            resp = self._answer(question=capital_structure_formatting_2, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE REVENUE SPLIT':
            # =========== GENERATE CAPITAL STRUCTURE
            revenue_pairs_flat = list(zip(revenue_pairs[1], revenue_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs= revenue_pairs_flat)
            resp = self._answer(question=revenue_split_formatting, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE PRODUCTS SERVICES OVERVIEW':
            # =========== GENERATE CAPITAL STRUCTURE
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{products_overview_formatting}'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp = self._web_search(messages)
            return resp 
        elif section == 'GENERATE GEO FOOTPRINT':
            # =========== GENERATE CAPITAL STRUCTURE
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{geo_footprint_formatting}'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp = self._web_search(messages)
            return resp
        elif section == 'GENERATE DEVELOPMENTS HIGHLIGHTS':
            # =========== GENERATE CAPITAL STRUCTURE
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{key_devs_formatting}'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp = self._web_search(messages)
            return resp


    def generate_company_profile(self):

        # =========== GENERATE BUSINESS OVERVIEW
        resp = self._generate_section('GENERATE BUSINESS OVERVIEW')
        doc = insert_biz_overview(resp['answer'])

        time.sleep(60)
        # =========== GENERATE KEY STAKEHOLDERS
        resp = self._generate_section('GENERATE KEY STAKEHOLDERS')
        doc = insert_stakeholders(resp['answer'], doc=doc)
        
        time.sleep(60)
        # =========== GENERATE FINANCIAL HIGHLIGHTS
        resp = self._generate_section('GENERATE FINANCIAL HIGHLIGHTS')
        doc = insert_finance(resp['answer'], doc=doc)

        time.sleep(60)
        # =========== GENERATE CAPITAL STRUCTURE
        resp = self._generate_section('GENERATE CAPITAL STRUCTURE')
        doc = insert_capital_structure(resp['answer'], doc=doc)

        pdf_bytes = docx_bytes_to_pdf_bytes(doc)

        return pdf_bytes
        # =========== UNION


In [2]:
from prompts4 import section7, finance_calculations, system_mod
import time
import re, time

company = 'SEAPORT_TOPCO_LIMITED'
sys = system_mod
calc = finance_calculations

agent = profileAgent(
    company_name = company,
    k=50, 
    max_text_recall_size=35, 
    max_chars=10000,
    model='gpt-5', 
    profile_prompt= sys,
    finance_calculations= calc
)




In [None]:
biz_ov = agent._generate_section('GENERATE BUSINESS OVERVIEW')

print(biz_ov)

k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored


 
1. Business Overview

- Seaport Topco Limited is the UK holding company for a life sciences tools group trading as SPT Labtech that designs and manufactures automated instruments, consumables and related support services to help laboratories increase productivity and reproducibility in research [#1].
- The company focuses on automation and miniaturisation across high‑impact segments of life science research and has a long operating history through SPT Labtech, which was founded in 1997, spun out from TTP Group in 2018 and acquired by EQT Private Equity in 2022 to support global expansion and innovation [#1].
- It offers a portfolio spanning liquid handling, sample preparation for cryo‑EM and structural biology and sample management, with recognised product families including mosquito, dragonfly, firefly, apricot, BioMicroLab, chameleon and Quantifoil [#2].
- The company operates with global R&D and manufacturing in Melbourn UK and has additional production, commercial hubs and office

In [4]:
key_stake = agent._generate_section('GENERATE KEY STAKEHOLDERS')

print(key_stake)

k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a kn

2. Key Stakeholders

| Metric | Details |
| --- | --- |
| Shareholders | - Immediate parent company: EQT Jupiter Luxco S.à r.l. (Luxembourg) [1] [4]  <br> - Ultimate parent company: n/a (not disclosed; notes state the smallest and largest group in which results are consolidated is headed by Seaport Topco Limited) [1] [4] |
| Management | - Chairman: Kieran Murphy (Non‑Executive Chair; per SPT Labtech website)  <br> - Chief Executive Officer: Rob Walton (appointed CEO May-24 per website; appointed director of Seaport Topco Limited on 22-Jul-24 and signed FY24 Directors’ Report on 29-Apr-25) [#3 p.12] [#6 p.16]  <br> - Chief Financial Officer: Andrew Holford (Group CFO; joined Dec-24 per website)  <br> - Directors per FY24 Directors’ Report: M Bauer; R Diggelmann — resigned 25-Dec-24; P Dowdy — resigned 24-Feb-25; J Feldman; R Friel — resigned 3-Mar-25; K Murphy; D Newble — resigned 8-Jul-24; A Thorburn; R Walton — appointed 22-Jul-24 [#3 p.12] |
| Lenders | - External lender names: n.a.

In [5]:
fin_high = agent._generate_section('GENERATE FINANCIAL HIGHLIGHTS')
print(fin_high)

k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a kn

 
6. Financial Highlights

| Metric | FY24 | FY23 | FY22 |
| --- | --- | --- | --- |
| Revenue (Turnover) | £76.8m [1] | £81.4m [2] | £32.8m [2] |
| Revenue growth % (yoy) | -5.7% [1][2] | +148.1% [2] | n.a. |
| Gross profit | £43.9m [1] | £48.4m [2] | £14.3m [2] |
| Gross margin % | 57.2% [1][2] | 59.5% [1][2] | 43.7% [2] |
| EBITDA | £10.7m [1] | £20.2m [2] | -£1.2m [2] |
| EBITDA margin % | 14.0% [1][2] | 24.8% [1][2] | n.m. |
| Net working capital (cash flow movement) | -£1.2m [1] | £3.9m [2] | -£3.0m [2] |
| Cash flow from operating activities excl. NWC | £10.8m [1] | £11.4m [2] | £1.2m [2] |
| Capex | -£2.9m [1] | -£7.2m [2] | -£0.9m [2] |
| Other cash flow from investing activities | £0.2m [1] | -£0.8m [2] | -£638.1m [2] |
| CFADS | £6.9m [1][2] | £7.3m [2] | -£640.7m [2] |
| Net cash from financing activities | -£6.7m [1] | -£4.2m [1] | £657.6m [2] |
| Bank loan drawdowns (cash) | £17.9m [1] | £12.1m [1] | £187.3m [2] |
| Other loans drawdowns (cash) | £7.7m [1] | £0.0m [1] | n

In [6]:
cap_stru = agent._generate_section('GENERATE CAPITAL STRUCTURE')

print(cap_stru)

k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a kn

7. Capital Structure

| Metric | FY24 |
| --- | --- |
| Facility Name | Senior Facilities Agreement — Facility B1 (EUR) and Facility B2 (USD); Revolving Credit Facility |
| Interest Rate | B1: EURIBOR + 6.25%; B2: Term SOFR + 6.25%; RCF: n.a. [#1] |
| Maturity | B1/B2: Aug-29; RCF: n.a. [#1] |
| Adjusted EBITDA | £12.9m [#3] |
| Cash (Closing Cash) | £11.8m [#4] |
| Net Debt | £186.7m [#5] |
| Liquidity | £16.6m [#1][#2][#4] |
| Leverage (Net Debt/EBITDA) | 14.5x [#5][#3] |
| Facility B1 outstanding (GBP) | £34.2m [#1] |
| Facility B2 outstanding (GBP) | £137.2m [#1] |
| RCF drawn | £25.2m [#2] |
| RCF facility size | £30.0m [#1] |
| Delayed Drawdown Facility size | n.a. |
| Bank loans due after >5 years | n.a. |
| Bank loans due within 1 year | n.a. |
| Bank loans + RCF outstanding (excl. leases) | £196.6m [#1][#2] |

Summary / Interpretation
- Net debt is £186.7m against Adjusted EBITDA of £12.9m, resulting in very high leverage of 14.5x, indicating that debt is large relative to ear

In [7]:
rev_split = agent._generate_section('GENERATE REVENUE SPLIT')

print(rev_split)

k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored


3. Revenue Split

Revenue by geography (country of destination) [#5 p.53]
| Segment | Revenue (£m) | Share (%) |
| --- | --- | --- |
| UK | 9.2 | 12.0 |
| Europe | 15.9 | 20.7 |
| North America | 35.1 | 45.7 |
| Rest of world | 16.7 | 21.7 |
| Total | 76.8 | 100.0 |

Revenue by activity (class of business) [#5 p.53]
| Segment | Revenue (£m) | Share (%) |
| --- | --- | --- |
| Sale of goods | 60.5 | 78.8 |
| Services | 16.3 | 21.2 |
| Total | 76.8 | 100.0 |


Sources
- [#5] Seaport Topco Limited FY24 Annual Report, Note 5 Turnover & Other operating income (p.53): revenue by destination and by class of business for FY24. Link: https://aiprojectteneo.blob.core.windows.net/companieshouselinglefile/14171962/SEAPORT_TOPCO_LIMITED_AA_annualReport_2025-09-30_0.pdf


In [8]:
prod_serv_ov = agent._generate_section('GENERATE PRODUCTS SERVICES OVERVIEW')
print(prod_serv_ov)

4a. Products/Services Overview

1 - SUMMARY
Based on the latest filed Group accounts for FY24 (approved Apr-25) and the official product sites of the Group’s operating subsidiaries, Seaport Topco Limited provides instruments, consumables and automation solutions focused on automated liquid handling, sample preparation (including cryo‑EM) and sample management for life science research.

- firefly (incl. firefly+): an all‑in‑one compact liquid handling platform that combines pipetting, dispensing, incubation and shaking to automate NGS library and sample preparation workflows. ([sptlabtech.com](https://www.sptlabtech.com/products/firefly?utm_source=openai))
- mosquito family (Gen3, LV genomics, HV genomics, Xtal3, LCP): positive‑displacement nanoliter pipetting instruments for genomics miniaturization and protein crystallography, delivering accurate 25 nL–5 µL transfers with disposable micropipettes. ([sptlabtech.com](https://www.sptlabtech.com/products/mosquito?utm_source=openai))
- dr

In [9]:
geo_foot = agent._generate_section('GENERATE GEO FOOTPRINT')

print(geo_foot)

4b. Geographical Footprint

| Country | Office (HQ/other) | Manufacturing facility | Sales/commercial office | Notes (evidence) |
| --- | --- | --- | --- | --- |
| United Kingdom | Yes (HQ) | Yes | Not disclosed | Melbourn (Cambridge) is the global R&D and production centre for the core SPT Labtech range; Seaport Topco’s and key UK subsidiaries’ registered office: Building F, Melbourn Science Park, Cambridge Road, Melbourn SG8 6HB. ([sptlabtech.com](https://www.sptlabtech.com/company)) |
| United States | Yes | Yes | Not disclosed | Covina, CA is the production centre for Apricot Designs and BioMicroLab ranges; group subsidiaries include Apricot Designs, Inc. (677 Arrow Grand Circle, Covina, CA 91722) and SPT Labtech, Inc. (One Boston Place, Suite 26000, Boston, MA 02108). ([sptlabtech.com](https://www.sptlabtech.com/company)) |
| Germany | Yes | Yes | Not disclosed | Quantifoil MicroTools GmbH site in Jena is the global R&D and production centre for Quantifoil; address: In den Brücken

In [10]:
dev_high = agent._generate_section('GENERATE DEVELOPMENTS HIGHLIGHTS')
print(dev_high)

5. Key Developments

- Oct-25: Seaport Topco Limited, through operating company SPT Labtech, announced a strategic partnership with 10x Genomics to provide automated single‑cell workflows on the firefly platform, with future expansion planned for Visium Spatial. ([sptlabtech.com](https://www.sptlabtech.com/news/spt-labtech-partners-with-10x?utm_source=openai))
- Jul-25: Seaport Topco Limited, via SPT Labtech, entered a collaboration with Semarion to integrate the firefly liquid handling platform with SemaCyte microcarriers to advance automated, miniaturized cell‑based assay workflows. ([sptlabtech.com](https://www.sptlabtech.com/news/spt-labtech-and-semarion-collaborate-to-advance-automated-cell-based-assay-workflows?utm_source=openai))
- Jun-25: Seaport Topco Limited reported that SPT Labtech was named an Illumina Qualified Methods Provider after qualifying an automated Illumina DNA Prep workflow on the firefly platform. ([sptlabtech.com](https://www.sptlabtech.com/news/spt-labtech-na

In [11]:
import io
import re
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH

def markdown_table_to_docx(markdown_text: str, output_path: str):
    """
    Convert markdown text with MULTIPLE tables to a Word document.
    Handles multiple tables with different column counts.
    """
    
    lines = markdown_text.strip().split('\n')
    
    # Create document
    doc = Document()
    
    # Process line by line
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        
        # Detect table start
        if line.startswith('|'):
            # Collect all consecutive table lines
            table_lines = []
            while i < len(lines) and lines[i].strip().startswith('|'):
                line = lines[i].strip()
                if '---' not in line:  # Skip separator lines
                    table_lines.append(line)
                i += 1
            
            # Create table in Word
            if table_lines:
                rows = []
                for line in table_lines:
                    cells = [c.strip() for c in line.strip('|').split('|')]
                    rows.append(cells)
                
                # Ensure all rows have same number of columns
                if rows:
                    max_cols = max(len(row) for row in rows)
                    
                    # Pad rows with fewer columns
                    for row in rows:
                        while len(row) < max_cols:
                            row.append('')
                    
                    # Create Word table
                    table = doc.add_table(rows=len(rows), cols=max_cols)
                    table.style = 'Light Grid Accent 1'
                    
                    # Populate table
                    for row_idx, row_data in enumerate(rows):
                        for col_idx, cell_data in enumerate(row_data):
                            if col_idx < len(table.rows[row_idx].cells):
                                cell = table.rows[row_idx].cells[col_idx]
                                cell.text = cell_data
                                
                                # Style header row
                                if row_idx == 0:
                                    for paragraph in cell.paragraphs:
                                        for run in paragraph.runs:
                                            run.bold = True
                    
                    # Add spacing after table
                    doc.add_paragraph()
        
        # Handle headings (Summary / Interpretation, Sources, etc.)
        elif line in [
            '1. Business Overview', '2. Key Stakeholders', '3. Revenue Split', '4a. Products/Services Overview', '4b. Geographical Footprint',
            '5. Key Developments', '6. Financial Highlights', '7. Capital Structure']:
            para = doc.add_paragraph(line)
            para.runs[0].bold = True
            para.runs[0].font.size = Pt(16)
            i += 1

        elif line in [
            'Summary / Interpretation', 'Sources:', 'Sources']:
            para = doc.add_paragraph(line)
            para.runs[0].bold = True
            para.runs[0].font.size = Pt(12)
            i += 1
        
        # Handle bullet points
        elif line.startswith('-') or line.startswith('•'):
            para = doc.add_paragraph(line[1:].strip(), style='List Bullet')
            i += 1
        
        # Handle regular paragraphs
        elif line:
            doc.add_paragraph(line)
            i += 1
        
        # Skip empty lines
        else:
            i += 1
    
    # Save
    doc.save(output_path)
    print(f"✓ Saved to: {output_path}")
    return doc

In [12]:
markdown_table_to_docx(biz_ov, "output_from_markdown.docx")

✓ Saved to: output_from_markdown.docx


<docx.document.Document at 0x12989a2b0>

In [13]:
# all = "\n\n".join([biz_ov])
# biz_ov + key_stake + fin_high + cap_stru + rev_split + prod_serv_ov + geo_foot + dev_high

all = "\n\n".join([biz_ov, key_stake, rev_split, prod_serv_ov,geo_foot, dev_high, fin_high, cap_stru])


markdown_table_to_docx(all, "output_from_markdown.docx")

✓ Saved to: output_from_markdown.docx


<docx.document.Document at 0x12989ab50>

In [None]:
import io
import re
from docx import Document
from docx.shared import Pt, RGBColor, Inches, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH

def markdown_table_to_docx(markdown_text: str, output_path: str, logo_path: str = None):
    """
    Convert markdown to Docx with a logo positioned at top-left.
    - Left: -3cm indent
    - Height: Moved up by reducing header distance to 0.5cm
    """
    
    lines = markdown_text.strip().split('\n')
    doc = Document()
    
    # --- LOGO POSITIONING START ---
    if logo_path:
        section = doc.sections[0]
        header = section.header
        header_para = header.paragraphs[0]
        
        # 1. VERTICAL POSITION (Height)
        # "Header distance" is the gap from the top edge of the paper to the start of the header.
        # Default is usually ~1.27cm. Setting it to 0.5cm moves the logo UP.
        section.header_distance = Cm(0.5)
        
        # 2. HORIZONTAL POSITION (Left)
        # Align left and use negative indent to pull it into the margin.
        header_para.alignment = WD_ALIGN_PARAGRAPH.LEFT
        header_para.paragraph_format.left_indent = Cm(-3)
        
        # Remove any extra spacing that might push it down
        header_para.paragraph_format.space_before = Pt(0)
        header_para.paragraph_format.space_after = Pt(0)
        
        # 3. INSERT IMAGE
        run = header_para.add_run()
        try:
            # Adjust width as needed
            run.add_picture(logo_path, width=Inches(1))
        except FileNotFoundError:
            print(f"Warning: Logo file not found at {logo_path}")
    # --- LOGO POSITIONING END ---
    
    # Process text content
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        
        # Detect table start
        if line.startswith('|'):
            table_lines = []
            while i < len(lines) and lines[i].strip().startswith('|'):
                line = lines[i].strip()
                if '---' not in line:
                    table_lines.append(line)
                i += 1
            
            if table_lines:
                rows = []
                for line in table_lines:
                    cells = [c.strip() for c in line.strip('|').split('|')]
                    rows.append(cells)
                
                if rows:
                    max_cols = max(len(row) for row in rows)
                    
                    # Pad rows
                    for row in rows:
                        while len(row) < max_cols:
                            row.append('')
                    
                    table = doc.add_table(rows=len(rows), cols=max_cols)
                    table.style = 'Light Grid Accent 1'
                    
                    for row_idx, row_data in enumerate(rows):
                        for col_idx, cell_data in enumerate(row_data):
                            if col_idx < len(table.rows[row_idx].cells):
                                cell = table.rows[row_idx].cells[col_idx]
                                cell.text = cell_data
                                if row_idx == 0:
                                    for paragraph in cell.paragraphs:
                                        for run in paragraph.runs:
                                            run.bold = True
                    doc.add_paragraph()
        
        # Headings
        elif line in [
            '1. Business Overview', '2. Key Stakeholders', '3. Revenue Split', 
            '4a. Products/Services Overview', '4b. Geographical Footprint',
            '5. Key Developments', '6. Financial Highlights', '7. Capital Structure']:
            para = doc.add_paragraph(line)
            para.runs[0].bold = True
            para.runs[0].font.size = Pt(16)
            i += 1

        elif line in ['Summary / Interpretation', 'Sources:', 'Sources']:
            para = doc.add_paragraph(line)
            para.runs[0].bold = True
            para.runs[0].font.size = Pt(12)
            i += 1
        
        # Bullet points
        elif line.startswith('-') or line.startswith('•'):
            doc.add_paragraph(line[1:].strip(), style='List Bullet')
            i += 1
        
        # Regular paragraphs
        elif line:
            doc.add_paragraph(line)
            i += 1
        else:
            i += 1
    
    doc.save(output_path)
    print(f"✓ Saved to: {output_path}")
    return doc
markdown_table_to_docx(
    all, 
    "output.docx", 
    logo_path="logo_teneo.png"
)

✓ Saved to: output.docx


<docx.document.Document at 0x12a7b9ca0>

In [11]:
import io, re
import pandas as pd
from docx import Document
from docx.enum.text import WD_BREAK
from typing import Dict, List, Optional, Union

def insert_table_data_generic(
    gpt_output: str,
    doc_path: str,
    table_type: str,
    metric_mapping: Optional[Dict[str, str]] = None,
    doc: Optional[Document] = None
) -> Document:
    """
    Generic function to insert GPT-generated data into docx tables.
    
    Args:
        gpt_output: The GPT response containing CSV/Table data and summary
        doc_path: Path to the docx file
        table_type: Type of table - 'capital_structure', 'financial_performance', or 'key_stakeholders'
        metric_mapping: Optional dict to map CSV metric names to template row labels
                       Keys should be normalized (lowercase, no special chars)
        doc: Optional existing Document object. If None, will load from doc_path
    
    Returns:
        Updated Document object
    """
    
    # =========================
    # Helper functions
    # =========================
    def norm(s: str) -> str:
        """Normalize string for comparison (lowercase, alphanumeric only)"""
        return re.sub(r"[^a-z0-9]+", "", (s or "").lower())
    
    def keynorm(s: str) -> str:
        """Normalize string for dictionary keys"""
        return re.sub(r"[^a-z0-9]+", "", (s or "").lower())
    
    def tokens(s: str) -> set:
        return set(re.findall(r"[a-z0-9]+", (s or "").lower()))
    
    def jaccard(a: str, b: str) -> float:
        """Calculate Jaccard similarity between two strings"""
        ta, tb = tokens(a), tokens(b)
        if not ta or not tb:
            return 0.0
        inter = len(ta & tb)
        union = len(ta | tb)
        return inter / union if union else 0.0
    
    def set_cell_text(cell, text: str):
        """Set cell text preserving formatting"""
        if not cell.paragraphs:
            cell.add_paragraph(text)
            return
        p = cell.paragraphs[0]
        for run in p.runs:
            run.text = ""
        p.add_run(text)
    
    def set_paragraph_multiline(paragraph, text: str):
        """Replace a paragraph's text with multi-line content, preserving line breaks."""
        for run in paragraph.runs:
            run.text = ""
        lines = (text or "").splitlines()
        if not lines:
            return
        paragraph.add_run(lines[0])
        for ln in lines[1:]:
            r = paragraph.add_run()
            r.add_break(WD_BREAK.LINE)
            paragraph.add_run(ln)
    
    # =========================
    # 1) Extract CSV/Table + Summary from GPT output
    # =========================
    parts = gpt_output.split("\n\nSummary / Interpretation", 1)
    csv_block = parts[0].strip()
    
    # Find where the table/CSV starts
    start = csv_block.find("Metric,")
    if start == -1:
        # Try to find markdown table format
        start = csv_block.find("| Metric |")
        if start != -1:
            # Convert markdown table to CSV
            lines = csv_block[start:].split("\n")
            csv_lines = []
            for line in lines:
                line = line.strip()
                if line.startswith("|") and "---" not in line:
                    # Remove leading/trailing pipes and split
                    cells = [c.strip() for c in line.strip("|").split("|")]
                    csv_lines.append(",".join(f'"{c}"' if "," in c else c for c in cells))
                elif not line.startswith("|"):
                    break
            csv_block = "\n".join(csv_lines)
        else:
            raise ValueError("CSV/Table header 'Metric,' not found in model output.")
    else:
        csv_block = csv_block[start:]
    
    summary_text = ""
    if len(parts) > 1:
        summary_text = "Summary / Interpretation" + parts[1].rstrip()
    
    # =========================
    # 2) Parse CSV to DataFrame
    # =========================
    df = pd.read_csv(io.StringIO(csv_block))
    
    # Determine expected columns based on table type
    if table_type == "financial_performance":
        expected_cols = {"Metric", "FY24", "FY23", "FY22"}
        year_cols = ["FY24", "FY23", "FY22"]
    elif table_type == "capital_structure":
        expected_cols = {"Metric", "FY24"}
        year_cols = ["FY24"]
    elif table_type == "key_stakeholders":
        expected_cols = {"Metric", "Shareholders"}
        year_cols = ["Shareholders"]
    else:
        raise ValueError(f"Unknown table_type: {table_type}")
    
    if not expected_cols.issubset(df.columns):
        raise ValueError(f"{table_type} CSV columns missing. Expected {expected_cols}, Got: {list(df.columns)}")
    
    # Create dict: normalized metric name -> values
    if table_type in ["financial_performance", "capital_structure"]:
        csv_rows = {
            keynorm(str(df.at[i, "Metric"]).strip()): {
                col: str(df.at[i, col]) for col in year_cols
            }
            for i in range(len(df))
        }
    else:  # key_stakeholders
        csv_rows = {
            keynorm(str(df.at[i, "Metric"]).strip()): str(df.at[i, "Shareholders"]).strip()
            for i in range(len(df))
        }
    
    # =========================
    # 3) Open DOCX and find the target table
    # =========================
    if doc is None:
        doc = Document(doc_path)
    
    def find_table_by_type(document: Document, ttype: str):
        """Find table based on type"""
        if ttype == "financial_performance":
            # Look for table with FY24/FY23/FY22 headers
            for tbl in document.tables:
                if len(tbl.rows):
                    header = " ".join(c.text for c in tbl.rows[0].cells)
                    if all(x in norm(header) for x in ["fy24", "fy23", "fy22"]):
                        return tbl
            # Fallback: after "Financial Performance" heading
            return find_table_after_heading(document, "financialperformance")
        
        elif ttype == "capital_structure":
            # Look for table after "Capital Structure" heading
            return find_table_after_heading(document, "capitalstructure")
        
        elif ttype == "key_stakeholders":
            # Look for table with "Title" and "Occupants" columns
            for tbl in document.tables:
                if not tbl.rows:
                    continue
                header = [norm(c.text) for c in tbl.rows[0].cells]
                if "title" in header and "occupants" in header:
                    return tbl
            # Fallback: after "Key Stakeholders" heading
            return find_table_after_heading(document, "keystakeholders")
        
        return None
    
    def find_table_after_heading(document: Document, heading_normalized: str):
        """Find first table after a specific heading"""
        found_heading = False
        body = document._element.body
        for child in body.iterchildren():
            tag = child.tag.rsplit("}", 1)[-1]
            if tag == "p":
                p_text = "".join(
                    t.text for t in child.iter() if t.tag.rsplit("}", 1)[-1] == "t"
                ).strip()
                if norm(p_text) == heading_normalized:
                    found_heading = True
            elif tag == "tbl" and found_heading:
                from docx.table import Table
                return Table(child, document)
        return None
    
    table = find_table_by_type(doc, table_type)
    if table is None:
        raise RuntimeError(f"Could not locate the '{table_type}' table.")
    
    # =========================
    # 4) Populate the table
    # =========================
    
    # Detect column indices from header row
    header_row = table.rows[0]
    col_map = {}  # normalized header -> column index
    for idx, cell in enumerate(header_row.cells):
        col_map[norm(cell.text)] = idx
    
    # Build mapping for year columns
    year_col_indices = {}
    if table_type in ["financial_performance", "capital_structure"]:
        for year in year_cols:
            year_norm = norm(year)
            if year_norm in col_map:
                year_col_indices[year] = col_map[year_norm]
    else:  # key_stakeholders
        # Value column could be "Shareholders" or "Occupants"
        value_col_idx = col_map.get("shareholders") or col_map.get("occupants")
        if value_col_idx is None:
            value_col_idx = 1  # Default to second column
    
    # Populate data rows
    for row_idx in range(1, len(table.rows)):
        row = table.rows[row_idx]
        label_cell = row.cells[0]
        label_text = label_cell.text.strip()
        label_norm = keynorm(label_text)
        
        # Try direct match first
        matched_key = None
        if label_norm in csv_rows:
            matched_key = label_norm
        elif metric_mapping and label_norm in metric_mapping:
            # Use provided mapping
            mapped_key = keynorm(metric_mapping[label_norm])
            if mapped_key in csv_rows:
                matched_key = mapped_key
        else:
            # Try fuzzy matching with Jaccard similarity
            best_score = 0.0
            for csv_key in csv_rows.keys():
                score = jaccard(label_norm, csv_key)
                if score > best_score and score >= 0.6:  # Threshold
                    best_score = score
                    matched_key = csv_key
        
        # Populate cells if we found a match
        if matched_key:
            if table_type in ["financial_performance", "capital_structure"]:
                for year, col_idx in year_col_indices.items():
                    if col_idx < len(row.cells):
                        value = csv_rows[matched_key].get(year, "")
                        set_cell_text(row.cells[col_idx], str(value))
            else:  # key_stakeholders
                if value_col_idx < len(row.cells):
                    value = csv_rows[matched_key]
                    set_cell_text(row.cells[value_col_idx], str(value))
    
    # =========================
    # 5) Insert Summary below the table (if present)
    # =========================
    if summary_text:
        # Find the table in the document body and add summary after it
        table_elem = table._element
        parent = table_elem.getparent()
        table_idx = list(parent).index(table_elem)
        
        # Look for existing summary paragraph after the table
        summary_inserted = False
        for i in range(table_idx + 1, len(parent)):
            child = parent[i]
            tag = child.tag.rsplit("}", 1)[-1]
            if tag == "p":
                p_text = "".join(
                    t.text for t in child.iter() if t.tag.rsplit("}", 1)[-1] == "t"
                ).strip()
                if "summary" in norm(p_text) or "interpretation" in norm(p_text):
                    # Found summary section - update it
                    from docx.text.paragraph import Paragraph
                    para = Paragraph(child, doc)
                    set_paragraph_multiline(para, summary_text)
                    summary_inserted = True
                    break
            elif tag == "tbl":
                # Hit another table, stop looking
                break
        
        # If no existing summary found, add new paragraph
        if not summary_inserted:
            # Add paragraph after table
            new_para = doc.add_paragraph()
            set_paragraph_multiline(new_para, summary_text)
    
    return doc


# =========================
# Convenience wrapper functions
# =========================

def insert_capital_structure(gpt_output: str, doc_path: str = None, doc: Document = None) -> Document:
    """Insert capital structure data into docx table."""
    return insert_table_data_generic(
        gpt_output=gpt_output,
        doc_path=doc_path,
        table_type="capital_structure",
        doc=doc
    )


def insert_finance(gpt_output: str, doc_path: str = None, doc: Document = None,
                   metric_mapping: Optional[Dict[str, str]] = None) -> Document:
    """Insert financial performance data into docx table."""
    # Default metric mapping for financial performance
    if metric_mapping is None:
        metric_mapping = {
            keynorm("Revenue (Turnover)"): "Revenue",
            keynorm("Revenue growth % (yoy)"): "Revenue Growth",
            keynorm("Gross profit"): "Gross Profit",
            keynorm("Gross margin %"): "Gross Margin",
            keynorm("EBITDA"): "EBITDA",
            keynorm("EBITDA margin %"): "EBITDA Margin",
            keynorm("Adjusted EBITDA"): "Adjusted EBITDA",
            keynorm("Capex (tangible+intangible)"): "CAPEX",
            keynorm("CFADS"): "CFADS",
            keynorm("Net working capital change"): "Net Working Capital Change",
            keynorm("Total debt"): "Total Debt",
            keynorm("Net debt"): "Net Debt",
            keynorm("Leverage (Net Debt/EBITDA)"): "Leverage",
        }
    
    return insert_table_data_generic(
        gpt_output=gpt_output,
        doc_path=doc_path,
        table_type="financial_performance",
        metric_mapping=metric_mapping,
        doc=doc
    )


def insert_stakeholders(gpt_output: str, doc_path: str = None, doc: Document = None) -> Document:
    """Insert key stakeholders data into docx table."""
    return insert_table_data_generic(
        gpt_output=gpt_output,
        doc_path=doc_path,
        table_type="key_stakeholders",
        doc=doc
    )


# Helper to ensure keynorm is available
def keynorm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

In [20]:
def insert_capital_structure_with_mapping(gpt_output: str, doc_path: str = None, doc: Document = None) -> Document:
    """
    Insert capital structure with custom metric mapping for your template.
    """
    
    # Define the metric mapping (template label -> CSV metric name)
    metric_mapping = {
        # Normalize both sides
        keynorm("Facility Name"): keynorm("Facility Name"),
        keynorm("Interest Rate"): keynorm("Interest Rate"),
        keynorm("Maturity"): keynorm("Maturity"),
        keynorm("Amount Outstanding"): keynorm("Bank loans + RCF outstanding (excl. leases)"),  # THIS IS KEY!
        keynorm("Gross External Debt"): keynorm("Bank loans + RCF outstanding (excl. leases)"),
        keynorm("Cash (Closing Cash)"): keynorm("Cash (Closing Cash)"),
        keynorm("Net External Debt"): keynorm("Net Debt"),
        keynorm("Liquidity"): keynorm("Liquidity"),
        keynorm("EBITDA"): keynorm("Adjusted EBITDA"),
        keynorm("Leverage"): keynorm("Leverage (Net Debt/EBITDA)"),
    }
    
    # Parse the markdown table from gpt_output
    import io, re
    import pandas as pd
    from docx import Document
    
    def norm(s: str) -> str:
        return re.sub(r"[^a-z0-9]+", "", (s or "").lower())
    
    def keynorm(s: str) -> str:
        return re.sub(r"[^a-z0-9]+", "", (s or "").lower())
    
    def set_cell_text(cell, text: str):
        if not cell.paragraphs:
            cell.add_paragraph(text)
            return
        p = cell.paragraphs[0]
        for run in p.runs:
            run.text = ""
        p.add_run(text)
    
    # Extract markdown table
    gpt_output = gpt_output.strip()
    lines = gpt_output.split('\n')
    
    # Find table boundaries
    csv_lines = []
    in_table = False
    for line in lines:
        line = line.strip()
        if line.startswith('| Metric |'):
            in_table = True
        if in_table and line.startswith('|') and '---' not in line:
            cells = [c.strip() for c in line.strip('|').split('|')]
            csv_lines.append(cells)
        elif in_table and not line.startswith('|'):
            break
    
    if len(csv_lines) < 2:
        raise ValueError("Could not parse capital structure table")
    
    # Convert to dict: normalized metric -> FY24 value
    header = csv_lines[0]
    data_rows = csv_lines[1:]
    
    csv_data = {}
    for row in data_rows:
        if len(row) >= 2:
            metric = keynorm(row[0])
            value_fy24 = row[1] if len(row) > 1 else "n.a."
            csv_data[metric] = value_fy24
    
    print(f"DEBUG: Parsed {len(csv_data)} metrics from CSV")
    print(f"CSV metrics: {list(csv_data.keys())[:5]}...")
    
    # Load document
    if doc is None:
        doc = Document(doc_path)
    
    # Find capital structure table (Table 2 based on inspection)
    table = None
    for tbl in doc.tables:
        if len(tbl.rows) > 5:
            # Check if it has capital structure metrics
            row_texts = [tbl.rows[i].cells[0].text.strip().lower() for i in range(min(5, len(tbl.rows)))]
            if any('facility' in t or 'leverage' in t or 'ebitda' in t for t in row_texts):
                table = tbl
                break
    
    if table is None:
        raise RuntimeError("Could not locate capital structure table")
    
    print(f"DEBUG: Found table with {len(table.rows)} rows")
    
    # Populate table
    for row_idx in range(1, len(table.rows)):  # Skip header
        row = table.rows[row_idx]
        label_text = row.cells[0].text.strip()
        label_norm = keynorm(label_text)
        
        print(f"DEBUG: Processing row '{label_text}' (normalized: '{label_norm}')")
        
        # Try metric mapping first
        matched_value = None
        if label_norm in metric_mapping:
            csv_metric = metric_mapping[label_norm]
            print(f"  Mapped to CSV metric: '{csv_metric}'")
            if csv_metric in csv_data:
                matched_value = csv_data[csv_metric]
                print(f"  Found value: '{matched_value}'")
        
        # Try direct match
        if matched_value is None and label_norm in csv_data:
            matched_value = csv_data[label_norm]
            print(f"  Direct match found: '{matched_value}'")
        
        # Populate cell
        if matched_value and len(row.cells) > 1:
            set_cell_text(row.cells[1], matched_value)
            print(f"  ✓ Set cell value")
        else:
            print(f"  ✗ No match found")
    
    return doc

In [21]:
# =========================
# TEST: insert_stakeholders
# =========================

# Sample GPT output (this is what your agent would return)
stakeholders_test_output = cap_stru

# Test the function
print("Testing insert_stakeholders...")
try:
    # Load your template document
    doc_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile (1).docx"
    
    # Call the function
    doc = insert_capital_structure_with_mapping(stakeholders_test_output, doc_path=doc_path)
    
    # Save to a test output file
    output_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/test_stakeholders.docx"
    doc.save(output_path)
    
    print(f"✓ Success! Open the file to check: {output_path}")
except Exception as e:
    print(f"✗ Error: {e}")
    import traceback
    traceback.print_exc()

Testing insert_stakeholders...
✗ Error: local variable 'keynorm' referenced before assignment


Traceback (most recent call last):
  File "/var/folders/6k/jcthlrlx33v3xr_0cy_fm57c0000gn/T/ipykernel_79627/612376763.py", line 15, in <module>
    doc = insert_capital_structure_with_mapping(stakeholders_test_output, doc_path=doc_path)
  File "/var/folders/6k/jcthlrlx33v3xr_0cy_fm57c0000gn/T/ipykernel_79627/3181027580.py", line 9, in insert_capital_structure_with_mapping
    keynorm("Facility Name"): keynorm("Facility Name"),
UnboundLocalError: local variable 'keynorm' referenced before assignment


In [None]:
def break_content(gpt_output):

    parts = gpt_output.split("\n\nSummary / Interpretation", 1)
    csv_block = parts[0].strip()

    start = csv_block.find("Metric,")
    if start == -1:
        raise ValueError("CSV header 'Metric,' not found in model output.")
    csv_block = csv_block[start:]
    section_df = pd.read_csv(io.StringIO(csv_block))

    summary_text = ""
    if len(parts) > 1:
        summary_text = "Summary / Interpretation" + parts[1].rstrip()

    return summary_text, section_df

def insert_paragraph(document: Document, placeholder: str, new_text: str):

    def set_paragraph_multiline(paragraph, text: str):
        """Replace a paragraph's text with multi-line content, preserving line breaks."""
        # clear existing runs
        for run in paragraph.runs:
            run.text = ""
        # write lines with explicit line breaks
        lines = (text or "").splitlines()
        if not lines:
            return
        paragraph.add_run(lines[0])
        for ln in lines[1:]:
            r = paragraph.add_run()
            r.add_break(WD_BREAK.LINE)
            paragraph.add_run(ln)

    """Find placeholder in paragraphs/cells and replace it with new_text (multiline)."""
    # plain paragraphs
    for p in document.paragraphs:
        if placeholder in p.text:
            set_paragraph_multiline(p, new_text)
            return True
    # inside tables
    for tbl in document.tables:
        for row in tbl.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    if placeholder in p.text:
                        set_paragraph_multiline(p, new_text)
                        return True
    return False