In [27]:
import os, textwrap
import io

from typing import List, Dict, Optional
from xml.sax.saxutils import escape

from gpts.gpt_assistants import general_assistant
from dotenv import load_dotenv, find_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery
from azure.core.exceptions import HttpResponseError
from azure.search.documents.models import HybridSearch


from openai import AzureOpenAI, APIConnectionError, OpenAI
from prompts import new_system_finance_prompt

from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

from prompts4 import finance_calculations, finance_pairs, capital_pairs, stakeholders_pairs, biz_overview_pairs, revenue_pairs, default_gpt_prompt, section4a, section4b, section5, section3
from pages.design.func_tools import *
from pages.design.formatting import *
from pages.design.func_tools import docx_bytes_to_pdf_bytes
import re, time
 
load_dotenv(find_dotenv(), override=True)

# ---- Config (expects the same envs you already used) ----
SEARCH_ENDPOINT = os.environ["AZURE_SEARCH_ENDPOINT"]
SEARCH_INDEX    = os.environ["AZURE_SEARCH_INDEX"]
SEARCH_KEY      = os.getenv("AZURE_SEARCH_API_KEY")  # omit if using AAD/RBAC
VECTOR_FIELD    = os.getenv("VECTOR_FIELD")
TEXT_FIELD      = os.getenv("TEXT_FIELD")

AOAI_ENDPOINT   = os.environ["AZURE_OPENAI_ENDPOINT"]            # https://<resource>.openai.azure.com
AOAI_API_VER    = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-10-21")
AOAI_DEPLOYMENT = os.environ["AZURE_OPENAI_DEPLOYMENT"]          # e.g., gpt-4o-mini / o3-mini / gpt-5 preview
AOAI_KEY        = os.getenv("AZURE_OPENAI_API_KEY")              # omit if using AAD
OPENAI_API_KEY  = os.getenv("OPENAI_API_KEY")        # required

# ------------------ CODE

class profileAgent():

    """Hybrid (dense+sparse) RAG over Vector Store

    This Agent is responsible for creating Company Profiles. 
    It operates with gpt5.
    It is activated by a call on main rag when it is typed 'Create company profile'
    """

    def __init__(self, company_name, k, max_text_recall_size, max_chars, model, profile_prompt = new_system_finance_prompt, finance_calculations = finance_calculations):
        
        self.company_name = company_name

        self.k = k
        self.max_text_recall_size = max_text_recall_size
        self.model = model
        self.max_chars = max_chars

        self.azure_credentials = AzureKeyCredential(SEARCH_KEY) if SEARCH_KEY else DefaultAzureCredential()
        self.search_client = SearchClient(SEARCH_ENDPOINT, SEARCH_INDEX, credential=self.azure_credentials)

        self.az_openai = AzureOpenAI(azure_endpoint=AOAI_ENDPOINT, api_key=AOAI_KEY, api_version=AOAI_API_VER)
        self.profile_prompt = profile_prompt
        self.web_openai = OpenAI(api_key=OPENAI_API_KEY)

        self.reasoning_effort = "medium"
        self.verbosity = "medium"

        self.finance_calculations = finance_calculations

    def _company_filter(self) -> str:
        v = (self.company_name or "").replace("'", "''").strip()
        return f"company_name eq '{v}'" if v else None
    
    def assemble_bm25_from_llm(self, slots: dict) -> str:
        def q(s: str) -> str:
            # sanitize: remove internal quotes and trim
            s = (s or "").strip().replace('"', ' ')
            return f"\"{s}\"" if s else ""
        groups = []

        # must-have phrases (ANDed)
        for p in slots.get("must_have_phrases", []):
            qp = q(p)
            if qp:
                groups.append(qp)

        # metric / statement synonym groups (ORed within each group)
        for key in ["metric", "statement"]:
            syns = slots.get("synonyms", {}).get(key, []) or slots.get(key, [])
            syns = [q(s) for s in syns if s]
            if syns:
                groups.append("(" + " OR ".join(syns) + ")")

        return " AND ".join(groups) if groups else "\"financial statements\""


    def bm25_creator(self, prompt):

        instruction = (
            "Extract finance search slots for Azure AI Search. "
            "Return strict JSON: {\"metric\":[], \"statement\":[], \"synonyms\":{}, \"must_have_phrases\":[]} "
            "(include IFRS/US GAAP variants)."
        )
        resp = general_assistant(instruction, prompt, OPENAI_API_KEY, 'gpt-4o')

        try:
            slots = getattr(resp, "output_json", None)
            if slots is None:
                import json
                slots = json.loads(resp.output_text)
        except Exception:
            # fallback: minimal anchors from prompt
            slots = {"must_have_phrases": [prompt], "metric": [], "statement": [], "synonyms": {}}
        return self.assemble_bm25_from_llm(slots)

    def _retrieve_hybrid_enhanced(self, query_nl, k: int = 50, top_n = 30, fields=VECTOR_FIELD, max_text_recall_size:int = 800):
        sc = self.search_client
        flt = self._company_filter()
        
        try:
            vq = VectorizableTextQuery(text=query_nl, k=k, fields=VECTOR_FIELD)
            # Prefer vector-only search (integrated vectorization). If your index isn't set up for it, this raises.
            results = sc.search(
                search_text=self.bm25_creator(query_nl), 
                vector_queries=[vq], 
                top=top_n, 
                query_type="semantic",
                query_caption="extractive", 
                hybrid_search=HybridSearch(max_text_recall_size=self.max_text_recall_size),
                query_caption_highlight_enabled=True,
                filter=flt
                )
            mode = "hybrid + semantic"
        except HttpResponseError as e:
            # Fall back to lexical so you still get results while fixing vector config
            results = sc.search(search_text=self.bm25_creator(query_nl), top=k)
            mode = f"lexical (fallback due to: {e.__class__.__name__})"

        hits: List[Dict] = []
        for r in results:
            d = r.copy() if hasattr(r, "copy") else {k2: r[k2] for k2 in r}
            d["score"] = d.get("@search.reranker_score") or d.get("@search.score") or 0.0
            caps = d.get("@search.captions")
            if isinstance(caps, list) and caps:
                d["caption"] = getattr(caps[0], "text", None)
            hits.append(d)

        return mode, hits


    def _build_context(self, hits: List[Dict], text_field: str = TEXT_FIELD, max_chars: int = 20000):
        """Build a compact, numbered context block and also return the selected chunk metadata."""
        lines = []
        total = 0
        selected = []  # <- we'll return this

        for i, h in enumerate(hits, 1):
            title     = h.get("title")
            chunk_id  = h.get("chunk_id")
            full_text = (h.get(text_field) or "")
            if not full_text:
                continue

            preview = textwrap.shorten(full_text, width=700, placeholder=" ...")
            block = f"[{i}] title={title!r} | chunk_id={chunk_id} | score={h.get('score'):.4f}\n{full_text}"

            if total + len(block) > self.max_chars:
                break

            total += len(block)
            lines.append(block)

            # keep rich metadata so you can show or log it later
            selected.append({
                "i": i,
                "title": title,
                "chunk_id": chunk_id,
                "score": h.get("score"),
                "caption": h.get("caption"),
                "preview": preview,
                "text": full_text,  # full chunk text (not shortened)
                # include any other fields you index, if available:
                "metadata_storage_path": h.get("metadata_storage_path"),
                "page_number": h.get("page_number"),
                "doc_type": h.get("doc_type"),
            })

        return "\n\n---\n\n".join(lines), selected

        
    def _generate_pdf(self, text: str) -> bytes:

        buf = io.BytesIO()
        doc = SimpleDocTemplate(buf, pagesize=letter)
        styles = getSampleStyleSheet()
        body = styles["BodyText"]

        story = []
        # Treat double newlines as paragraph breaks; keep single newlines as <br/>
        for para in (text or "").split("\n\n"):
            safe = escape(para).replace("\n", "<br/>")
            story.append(Paragraph(safe if safe.strip() else "&nbsp;", body))
            story.append(Spacer(1, 8))

        doc.build(story)
        buf.seek(0)
        return buf.getvalue()
    
    def _extract_cited_idxs(self, answer: str) -> list[int]:
        # Matches [#1], [#12], etc. (also tolerates stray [1])
        nums = set(int(n) for n in re.findall(r"\[#?(\d+)\]", answer))
        return sorted(nums)

    def _rag_answer(self, rag_nl, question, k: int = 5, temperature: float = 0.2):

        # question = f'CREATE A SECTION OF COMPANY PROFILE USING LAST YEARS OF ANNUAL REPORT PRESENT IN THE CONTEXT FOR {self.company_name}. IF ANY INFORMATION IS NOT FOUND STATE AS n.a. .\n\n THIS IS THE SECTION TO BE BUILT: \n {section7}  \n USE THIS TO GUIDE YOURSELF ON SEMANTIC TERMS AND HOW TO CALCULATE: \n {finance_calculations}'
        
        mode, hits = self._retrieve_hybrid_enhanced(
            # query=rag_q, 
            query_nl=rag_nl,
            k=25
            )
        ctx_text, ctx_items = self._build_context(hits)

        system_msg = self.profile_prompt + (
            "\nWhen you use a fact from the context, add citations like [#1], [#2]."
            "\nOnly rely on the numbered context; if a value is missing, say 'n.a.'."
            f"\nIF ANY INFORMATION IS NOT FOUND STATE AS n.a. .\n\n USE THIS TO GUIDE YOURSELF ON SEMANTIC TERMS AND HOW TO CALCULATE: \n {finance_calculations}"
        )
        user_msg = f"Question:\n{question}\n\nContext snippets (numbered):\n{ctx_text}"

        client = self.az_openai
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user",   "content": user_msg},
        ]

        # Try streaming first (SSE). Some networks/proxies block streaming; if so, fall back.
        
        resp = client.chat.completions.create(
            model=AOAI_DEPLOYMENT,
            messages=messages,
            reasoning_effort="high"
        )
        answer = resp.choices[0].message.content
        mode_model = "non-streaming (fallback)"

        cited = self._extract_cited_idxs(answer)
        used_chunks = [c for c in ctx_items if c["i"] in cited]

        # return self._generate_pdf(answer)
        return {
            "answer": answer,
            "citations": cited,          # [1, 3, 7]
            "used_chunks": used_chunks,  # detailed dicts for each cited snippet
            "all_chunks": ctx_items,     # everything you sent (optional)
            "mode": mode                 # retrieval mode info (optional)
        }

    def _web_search(self, messages):
        resp = self.web_openai.responses.create(
            model='gpt-5',
            input=messages,
            tools=[{"type": "web_search"}],
            tool_choice="auto",
            # max_output_tokens=self.max_output_tokens,
            reasoning={"effort": self.reasoning_effort},
            text={"verbosity": self.verbosity},
        )
        
        return resp.output_text
    
    def _answer(self, question, ctx_text, k: int = 5, temperature: float = 0.2):

        system_msg = self.profile_prompt + (
            "\nWhen you use a fact from the context, add citations like [#1], [#2]."
            "\nOnly rely on the numbered context; if a value is missing, say 'n.a.'."
        )
        user_msg = f"Question:\n{question}\n\nContext snippets (numbered):\n{ctx_text}"

        client = self.az_openai
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user",   "content": user_msg},
        ]

        # Try streaming first (SSE). Some networks/proxies block streaming; if so, fall back.
        
        resp = client.chat.completions.create(
            model=AOAI_DEPLOYMENT,
            messages=messages,
            reasoning_effort="high"
        )
        answer = resp.choices[0].message.content

        cited = self._extract_cited_idxs(answer)

        # return self._generate_pdf(answer)
        return {
            "answer": answer,
            "citations": cited,          # [1, 3, 7]
        }   
    
    @staticmethod
    def has_na(text: str) -> bool:
        # match "n.a." or "n/a" (case-insensitive)
        return bool(re.search(r"\b(n\.a\.|n/a)\b", text, flags=re.I))

    def _sections(self, pairs):

        answers = []

        max_extra_na_retries = 1        # try again at most 2 times (total <= 3 calls per item)
        base_delay_seconds = 3.0        # polite delay between attempts


        for q, r in pairs:
            tries = 0
            while True:
                if tries > 0:
                    # small incremental delay before re-trying
                    time.sleep(base_delay_seconds + 0.5 * tries)

                resp = self._rag_answer(rag_nl=r[0], question=q[0])
                answer_text = resp["answer"]

                # stop if good answer OR we've exhausted retries
                if not profileAgent.has_na(answer_text) or tries >= max_extra_na_retries:
                    answers.append(answer_text)
                    break

                # otherwise, try again
                tries += 1

            # optional small gap between different (r,q) items
            time.sleep(5.0)
        
        return answers
    
    def _generate_section(self, section):

        if section == 'GENERATE BUSINESS OVERVIEW':
            # =========== GENERATE BUSINESS OVERVIEW
            biz_overview_pairs_flat = list(zip(biz_overview_pairs[1], biz_overview_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs = biz_overview_pairs_flat)
            resp = self._answer(question=business_overview_formatting, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE KEY STAKEHOLDERS':
        # =========== GENERATE KEY STAKEHOLDERS
            stakeholders_pairs_flat = list(zip(stakeholders_pairs[1], stakeholders_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs= stakeholders_pairs_flat)
            resp = self._answer(question=stakeholders_formatting_2, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE FINANCIAL HIGHLIGHTS':
            # =========== GENERATE FINANCIAL HIGHLIGHTS
            finance_pairs_flat = list(zip(finance_pairs[1], finance_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs=finance_pairs_flat)
            resp = self._answer(question=finance_formatting_2, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE CAPITAL STRUCTURE':
            # =========== GENERATE CAPITAL STRUCTURE
            capital_pairs_flat = list(zip(capital_pairs[1], capital_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs= capital_pairs_flat)
            resp = self._answer(question=capital_structure_formatting_2, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE REVENUE SPLIT':
            # =========== GENERATE CAPITAL STRUCTURE
            revenue_pairs_flat = list(zip(revenue_pairs[1], revenue_pairs[0]))  # [(r, q), (r, q), ...]
            section_built = self._sections(pairs= revenue_pairs_flat)
            resp = self._answer(question=section3, ctx_text=section_built)
            return resp['answer']
        elif section == 'GENERATE PRODUCTS SERVICES OVERVIEW':
            # =========== GENERATE CAPITAL STRUCTURE
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{section4a}'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp = self._web_search(messages)
            return resp 
        elif section == 'GENERATE GEO FOOTPRINT':
            # =========== GENERATE CAPITAL STRUCTURE
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{section4b}'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp = self._web_search(messages)
            return resp
        elif section == 'GENERATE DEVELOPMENTS HIGHLIGHTS':
            # =========== GENERATE CAPITAL STRUCTURE
            new_section = f'All instructions applies to the company: {self.company_name}\n\n{section5}'
            messages = [
                {"role": "system", "content": default_gpt_prompt},
                {"role": "user",   "content": new_section},
            ]
            resp = self._web_search(messages)
            return resp


    def generate_company_profile(self):

        # =========== GENERATE BUSINESS OVERVIEW
        biz_overview_pairs_flat = list(zip(biz_overview_pairs[1], biz_overview_pairs[0]))  # [(r, q), (r, q), ...]
        section1 = self._sections(pairs = biz_overview_pairs_flat)
        resp = self._answer(question=business_overview_formatting, ctx_text=section1)
        doc = insert_biz_overview(resp['answer'])

        time.sleep(60)
        # # =========== GENERATE KEY STAKEHOLDERS
        # stakeholders_pairs_flat = list(zip(stakeholders_pairs[1], stakeholders_pairs[0]))  # [(r, q), (r, q), ...]
        # section2 = self._sections(pairs= stakeholders_pairs_flat)
        # resp = self._answer(question=stakeholders_formatting, ctx_text=section2)
        # doc = insert_stakeholders(resp['answer'], doc=doc)
        
        # # time.sleep(60)
        # # =========== GENERATE FINANCIAL HIGHLIGHTS
        # finance_pairs_flat = list(zip(finance_pairs[1], finance_pairs[0]))  # [(r, q), (r, q), ...]
        # section3 = self._sections(pairs=finance_pairs_flat)
        # resp = self._answer(question=finance_formatting, ctx_text=section3)
        # doc = insert_finance(resp['answer'], doc=doc)

        # time.sleep(60)
        # # =========== GENERATE CAPITAL STRUCTURE
        # capital_pairs_flat = list(zip(capital_pairs[1], capital_pairs[0]))  # [(r, q), (r, q), ...]
        # section4 = self._sections(pairs= capital_pairs_flat)
        # resp = self._answer(question=capital_structure_formatting, ctx_text=section4)
        # doc = insert_capital_structure(resp['answer'], doc=doc)
        # time.sleep(60)
        # ============ 'GENERATE REVENUE SPLIT'
        revenue_pairs_flat = list(zip(revenue_pairs[1], revenue_pairs[0]))  # [(r, q), (r, q), ...]
        section_built = self._sections(pairs= revenue_pairs_flat)
        resp = self._answer(question=section3, ctx_text=section_built)
        doc = insert_revenue_split(resp['answer'], doc=doc) 
        time.sleep(60)
        # =========== 'GENERATE PRODUCTS SERVICES OVERVIEW':
        # new_section = f'All instructions applies to the company: {self.company_name}\n\n{section4a}'
        # messages = [
        #     {"role": "system", "content": default_gpt_prompt},
        #     {"role": "user",   "content": new_section},
        # ]
        # resp = self._web_search(messages)
        # doc = insert_services_overview(resp, doc=doc)
        # time.sleep(60)
        # # ========= 'GENERATE GEO FOOTPRINT':
        # new_section = f'All instructions applies to the company: {self.company_name}\n\n{section4b}'
        # messages = [
        #     {"role": "system", "content": default_gpt_prompt},
        #     {"role": "user",   "content": new_section},
        # ]
        # resp = self._web_search(messages)
        # doc = insert_geo_footprint(resp, doc=doc)
        # time.sleep(60)
        # # ======== 'GENERATE DEVELOPMENTS HIGHLIGHTS':
        # new_section = f'All instructions applies to the company: {self.company_name}\n\n{section5}'
        # messages = [
        #     {"role": "system", "content": default_gpt_prompt},
        #     {"role": "user",   "content": new_section},
        # ]
        # resp = self._web_search(messages)
        # doc = insert_development_highlights(resp, doc=doc)
        # time.sleep(60)
    
        pdf_bytes = docx_bytes_to_pdf_bytes(doc)

        return pdf_bytes, doc
        # =========== UNION


In [28]:
# from pages.design.dialogues import *
from prompts4 import section7, finance_calculations, system_mod
from azure.blob_functions import get_companies

name_map, names = get_companies()

name_map

{'JAMES_DONALDSON_GROUP_LTD': 'JAMES DONALDSON GROUP LTD',
 'RADLEY_+_CO._LIMITED': 'RADLEY + CO. LIMITED',
 'SEAPORT_TOPCO_LIMITED': 'SEAPORT TOPCO LIMITED',
 'ASCOT_LLOYD_LIMITED': 'ASCOT LLOYD LIMITED',
 'VITA_(HOLDINGS)_LIMITED': 'VITA (HOLDINGS) LIMITED'}

In [29]:
agent = profileAgent(
            company_name = 'JAMES_DONALDSON_GROUP_LTD',
            k=50, 
            max_text_recall_size=35, 
            max_chars=10000,
            model='gpt-5', 
            profile_prompt= system_mod, 
            finance_calculations= finance_calculations
        )

In [30]:
pdf, doc = agent.generate_company_profile()

k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored
k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored


Updated document written


k is not a known attribute of class <class 'azure.search.documents._generated.models._models_py3.VectorizableTextQuery'> and will be ignored


Updated document written
[docx2pdf] conversion failed: TypeError("memoryview: a bytes-like object is required, not 'Document'")


In [31]:
doc.save("report.docx")

# Business Overview

In [None]:
import io, re
from docx import Document
from docx.enum.text import WD_BREAK

# =========================
# 0) Your full GPT response
# =========================
gpt_output = r"""
- Seaport Topco Limited’s principal activity continues to include the research and development of pharmaceutical instrumentation. [1][2]
- The company is engaged in group-level operations focused on developing pharmaceutical instrumentation through ongoing research and development initiatives. [1][2]
- It offers research and development of pharmaceutical instrumentation as its primary product and service offering. [1][2]
- The company reported a headcount of 418 at Dec-24, up from 405, indicating ongoing operations across the group. [9]
- The company faces near-term liquidity and working capital pressure, with creditors due within one year rising to £64.7m at Dec-24 driven by £7.7m of new loans and a £9.2m net revolving credit facility draw, while cash generated from operating activities declined to £10.4m from £15.3m in FY23. [9]
- It carries substantial longer-term obligations, with creditors due after more than one year at £168.7m at Dec-23, alongside rising trade debtors to £17.0m and reduced stock to £13.2m at Dec-24, indicating cash conversion and balance sheet pressures. [7][8][9]

Sources:
- [1][2] Annual Report (FY23), Strategic/Directors’ Report, p.5 — Principal activity.
- [9] Annual Report (FY24), Group Strategic Report (pp.1–5; exact page n.a.) — Review of the business and future developments, working capital and liquidity movements.
- [7][8] Annual Report (FY23), Notes 21–22, Consolidated Statement of Financial Position, p.14 — Creditors due within one year and after more than one year.
""".strip("\n")

# =========================
# 1) Open DOCX
# =========================
doc_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile2.docx"
doc = Document(doc_path)

PLACEHOLDER = "[INSERT BUSINESS OVERVIEW]"

def set_paragraph_multiline(paragraph, text: str):
    """Replace a paragraph's text with multi-line content, preserving line breaks."""
    # clear existing runs
    for run in paragraph.runs:
        run.text = ""
    # write lines with explicit line breaks
    lines = (text or "").splitlines()
    if not lines:
        return
    paragraph.add_run(lines[0])
    for ln in lines[1:]:
        r = paragraph.add_run()
        r.add_break(WD_BREAK.LINE)
        paragraph.add_run(ln)

def replace_placeholder(document: Document, placeholder: str, new_text: str) -> bool:
    """Find placeholder in paragraphs/cells and replace it with new_text (multiline)."""
    # plain paragraphs
    for p in document.paragraphs:
        if placeholder in p.text:
            set_paragraph_multiline(p, new_text)
            return True
    # inside tables
    for tbl in document.tables:
        for row in tbl.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    if placeholder in p.text:
                        set_paragraph_multiline(p, new_text)
                        return True
    return False

# =========================
# 2) Replace the placeholder
# =========================
ok = replace_placeholder(doc, PLACEHOLDER, gpt_output)
if not ok:
    print(f"WARNING: placeholder not found: {PLACEHOLDER}")

# =========================
# 3) Save
# =========================
out_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile2.docx"
doc.save(out_path)
print(f"Updated document written to: {out_path}")


# Capital Structure

In [None]:
import io, re
import pandas as pd
from docx import Document
from docx.enum.text import WD_BREAK

# =========================
# 0) Your full GPT response
# =========================
gpt_output = r"""
Metric,FY24,FY23,FY22
"Facility Name","n.a.","Facility B1 (EUR term loan); Facility B2 (USD term loan); Revolving Credit Facility; Delayed Drawdown Facility [#2][#3]","Revolving Credit Facility; Delayed Drawdown Facility [#3]"
"Interest Rate","n.a.","Euribor + 6.25%; Term SOFR + 6.25% [#2][#3]","n.a."
Maturity,"n.a.","Aug-29 [#2][#3]","n.a."
"Adjusted EBITDA","n.a.","£17.9m [#2][#3]","£10.6m [#2][#3]"
"Cash (Closing Cash)","n.a.","£11.8m [#2][#3]","n.a."
"Net Debt","n.a.","£171.9m [#2][#3]","n.a."
Liquidity,"n.a.","£25.8m [#2][#3]","n.a."
"Leverage (Net Debt/EBITDA)","n.a.","9.6x [#2][#3]","n.a."
"Facility B1 outstanding (GBP)","n.a.","£36.0m [#2][#3]","n.a."
"Facility B2 outstanding (GBP)","n.a.","£135.0m [#2][#3]","n.a."
"RCF drawn","n.a.","£16.0m [#2][#3]","£16.0m [#3]"
"RCF facility size","n.a.","£30.0m [#2][#3]","£30.0m [#3]"
"Delayed Drawdown Facility size","n.a.","£75.0m [#2][#3]","£75.0m [#3]"
"Bank loans due after >5 years","n.a.","£168.7m [#2][#3]","n.a."
"Bank loans due within 1 year","n.a.","£14.7m [#2][#3]","n.a."
"Bank loans + RCF outstanding (excl. leases)","n.a.","£187.0m [#2][#3]","n.a."

Summary / Interpretation
- FY23 leverage is high at 9.6x, based on £171.9m net debt and £17.9m Adjusted EBITDA.
- FY23 facility mix is dominated by term loans (Facility B1 ~£36.0m and B2 ~£135.0m) maturing in Aug-29, plus a £30.0m RCF (of which £16.0m was drawn) and a £75.0m delayed draw facility.
- FY23 liquidity appears modest at £25.8m, combining £11.8m closing cash with remaining headroom on the £30.0m RCF (drawn £16.0m).
- Maturity profile in FY23 is back-ended: £168.7m due after >5 years versus £14.7m due within 1 year; both term facilities mature in Aug-29.
- Total FY23 bank loans + RCF outstanding (excl. leases) sums to £187.0m, indicating a sizeable secured debt stack.
- FY24 disclosures are not available in the provided excerpts, and FY22 data is limited beyond RCF details.

Sources
- [#2] Seaport Topco Limited Annual Report (file date 25-Sep-24), pp. 8, 45, 52 — p.8 Adjusted EBITDA (£17.9m FY23; £10.6m FY22); p.45 Loans note (Facility B1/B2 amounts, Aug-29 maturities, interest margins; RCF £30.0m and ~£16.0m drawn; £75.0m delayed draw facility; bank loans due >5 years £168.7m and within 1 year £14.7m); p.52 Net debt analysis (Net Debt £171.9m; closing cash £11.8m). Link: https://aiprojectteneo.blob.core.windows.net/companieshouselinglefile/14171962/SEAPORT_TOPCO_LIMITED_AA_annualReport_2024-09-25_0.pdf
- [#3] Seaport Topco Limited Annual Report (file date 25-Sep-24), pp. 8, 36, 45, 52 — corroborates Adjusted EBITDA figures; Going concern (p.36) notes RCF drawn £16.0m at Dec-22; Loans note (p.45) for facility sizes/draws and maturities; Net debt analysis (p.52) for Net Debt and closing cash. Link: https://aiprojectteneo.blob.core.windows.net/companieshousinglefile/14171962/SEAPORT_TOPCO_LIMITED_AA_annualReport_2024-09-25_1.pdf
"""

# =========================
# 1) Extract CSV + SUMMARY  (KEEP sources)
# =========================
parts = gpt_output.split("\n\nSummary / Interpretation", 1)
csv_block = parts[0].strip()

start = csv_block.find("Metric,")
if start == -1:
    raise ValueError("CSV header 'Metric,' not found in model output.")
csv_block = csv_block[start:]

summary_text = ""
if len(parts) > 1:
    summary_text = "Summary / Interpretation" + parts[1].rstrip()

# =========================
# 2) Parse CSV to DataFrame
# =========================
df = pd.read_csv(io.StringIO(csv_block))
expected_cols = {"Metric","FY24","FY23","FY22"}
if not expected_cols.issubset(df.columns):
    raise ValueError(f"Capital Structure CSV columns missing. Got: {list(df.columns)}")

csv_rows = {
    str(df.at[i, "Metric"]).strip(): {
        "FY24": str(df.at[i, "FY24"]),
        "FY23": str(df.at[i, "FY23"]),
        "FY22": str(df.at[i, "FY22"]),
    }
    for i in range(len(df))
}

# =========================
# 3) Open DOCX, locate the Capital Structure table
# =========================
doc_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile2.docx"
doc = Document(doc_path)

# ----------------- helpers -----------------
def norm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

def tokens(s: str) -> set:
    return set(re.findall(r"[a-z0-9]+", (s or "").lower()))

def jaccard(a: str, b: str) -> float:
    ta, tb = tokens(a), tokens(b)
    if not ta or not tb:
        return 0.0
    inter = len(ta & tb)
    union = len(ta | tb)
    return inter / union if union else 0.0

def find_cap_struct_table(document: Document):
    # 1) after 'Capital Structure' heading
    found_heading = False
    body = document._element.body
    for child in body.iterchildren():
        tag = child.tag.rsplit("}", 1)[-1]
        if tag == "p":
            p_text = "".join(t.text for t in child.iter()
                             if t.tag.rsplit("}",1)[-1] == "t").strip()
            if norm(p_text) == "capitalstructure":
                found_heading = True
        elif tag == "tbl" and found_heading:
            from docx.table import Table
            return Table(child, document)

    # 2) heuristic by content
    for tbl in document.tables:
        row_texts = [" ".join(c.text for c in r.cells) for r in tbl.rows]
        joined = " ".join(row_texts)
        if all(x in norm(joined) for x in ["ebitda","leverage"]):
            return tbl
    return None

table = find_cap_struct_table(doc)
if table is None:
    raise RuntimeError("Could not locate the 'Capital Structure' table.")

# Identify FY columns
def find_fy_cols(tbl):
    for r_i in range(min(2, len(tbl.rows))):
        labels = [norm(c.text) for c in tbl.rows[r_i].cells]
        loc = {}
        for idx, txt in enumerate(labels):
            if txt == "fy24": loc["FY24"] = idx
            if txt == "fy23": loc["FY23"] = idx
            if txt == "fy22": loc["FY22"] = idx
        if {"FY24","FY23","FY22"}.issubset(loc.keys()):
            return loc["FY24"], loc["FY23"], loc["FY22"]
    if len(tbl.rows[0].cells) >= 4:
        return 1, 2, 3
    raise RuntimeError("Could not determine FY columns in Capital Structure table.")

col_FY24, col_FY23, col_FY22 = find_fy_cols(table)

# Build row index from first column (labels)
doc_row_index = {}
doc_row_labels = {}  # norm_label -> raw label (for debug)
for r_idx, row in enumerate(table.rows):
    if not row.cells:
        continue
    label_raw = row.cells[0].text.strip()
    if label_raw:
        key = norm(label_raw)
        doc_row_index[key] = r_idx
        doc_row_labels[key] = label_raw

# =========================
# 4) Mapping (CSV -> DOC label), with synonyms & typo tolerance
# =========================
def keynorm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

# Add broad synonyms, incl. likely template wordings
metric_to_doc_syns = {
    keynorm("Facility Name"): [
        "Facility Name", "Name of the Facility", "Facility", "Facilities", "Facility Names",
        "Name of Facility"
    ],
    keynorm("Interest Rate"): ["Interest Rate", "Interst Rate", "Rate", "Interest"],
    keynorm("Interst Rate"):  ["Interest Rate", "Interst Rate", "Rate", "Interest"],
    keynorm("Maturity"): ["Maturity", "Final Maturity", "Maturities", "Maturity Date"],
    keynorm("Adjusted EBITDA"): ["EBITDA", "Adjusted EBITDA"],
    keynorm("Cash (Closing Cash)"): [
        "Cash (Closing Cash)", "Cash (Closing cash)", "Closing Cash",
        "Cash", "Cash and cash equivalents", "Cash & cash equivalents"
    ],
    keynorm("Net Debt"): ["Net External Debt", "Net Debt"],
    keynorm("Liquidity"): ["Liquidity"],
    keynorm("Leverage (Net Debt/EBITDA)"): ["Leverage"],
    keynorm("Leverage (Net Debt / EBITDA)"): ["Leverage"],
    keynorm("Bank loans + RCF outstanding (excl. leases)"): ["Gross External Debt", "Total External Debt"],

    keynorm("Facility B1 outstanding (GBP)"): ["Amount Outstanding"],
    keynorm("Facility B2 outstanding (GBP)"): ["Amount Outstanding"],
    keynorm("RCF drawn"): ["Amount Outstanding"],

    keynorm("RCF facility size"): [None],
    keynorm("Delayed Drawdown Facility size"): [None],
    keynorm("Bank loans due after >5 years"): [None],
    keynorm("Bank loans due within 1 year"): [None],
}

def smart_lookup_row_index(label_candidates):
    """
    Resolve to the best row index by:
      1) Exact normalized match
      2) Contains match (both directions)
      3) Token Jaccard similarity >= 0.5
    Returns row_index or None.
    """
    cand_norms = [norm(c) for c in label_candidates if c]

    # 1) exact
    for cn in cand_norms:
        if cn in doc_row_index:
            return doc_row_index[cn]

    # 2) contains (prefer the longest doc label match)
    best_idx = None
    best_len = -1
    for cn in cand_norms:
        for dl_norm, idx in doc_row_index.items():
            if cn and (cn in dl_norm or dl_norm in cn):
                if len(dl_norm) > best_len:
                    best_idx, best_len = idx, len(dl_norm)
    if best_idx is not None:
        return best_idx

    # 3) token overlap
    best_idx = None
    best_score = 0.0
    for cn in cand_norms:
        for dl_norm, idx in doc_row_index.items():
            score = jaccard(cn, dl_norm)
            if score >= 0.5 and score > best_score:
                best_idx, best_score = idx, score
    return best_idx

# =========================
# 5) Populate the table (incl. Facility Name / Interest Rate / Maturity)
#    and aggregate facility amounts into 'Amount Outstanding'
# =========================
agg_amount = {"FY24": [], "FY23": [], "FY22": []}

def maybe_append(prefix, v):
    v = (v or "").strip()
    if not v or v.lower() == "n.a.":
        return None
    return f"{prefix}: {v}"

unmapped_metrics = []

for csv_metric, years in csv_rows.items():
    mkey = keynorm(csv_metric)
    syns = metric_to_doc_syns.get(mkey, [csv_metric])

    # aggregated targets
    if any((s and norm(s) == norm("Amount Outstanding")) for s in syns if s):
        if mkey == keynorm("Facility B1 outstanding (GBP)"):
            for fy in ("FY24", "FY23", "FY22"):
                s = maybe_append("B1", years[fy]);  agg_amount[fy].append(s) if s else None
        elif mkey == keynorm("Facility B2 outstanding (GBP)"):
            for fy in ("FY24", "FY23", "FY22"):
                s = maybe_append("B2", years[fy]);  agg_amount[fy].append(s) if s else None
        elif mkey == keynorm("RCF drawn"):
            for fy in ("FY24", "FY23", "FY22"):
                s = maybe_append("RCF drawn", years[fy]);  agg_amount[fy].append(s) if s else None
        continue

    r_idx = smart_lookup_row_index(syns)
    if r_idx is None:
        unmapped_metrics.append(csv_metric)
        continue

    row = table.rows[r_idx]
    row.cells[col_FY24].text = years["FY24"]
    row.cells[col_FY23].text = years["FY23"]
    row.cells[col_FY22].text = years["FY22"]

# Aggregated 'Amount Outstanding'
amount_row_idx = smart_lookup_row_index(["Amount Outstanding"])
if amount_row_idx is not None:
    row = table.rows[amount_row_idx]
    row.cells[col_FY24].text = "; ".join(agg_amount["FY24"]) if agg_amount["FY24"] else "n.a."
    row.cells[col_FY23].text = "; ".join(agg_amount["FY23"]) if agg_amount["FY23"] else "n.a."
    row.cells[col_FY22].text = "; ".join(agg_amount["FY22"]) if agg_amount["FY22"] else "n.a."

# =========================
# 6) Insert the full SUMMARY (including Sources) — no duplicates
# =========================
PLACEHOLDER = "[INSERT CAPITAL STRUCTURE SUMMARY]"
HEADING_TEXT = "Summary / Interpretation"

def set_paragraph_multiline(paragraph, text: str):
    for run in paragraph.runs:  # clear
        run.text = ""
    lines = (text or "").splitlines()
    if not lines:
        return
    paragraph.add_run(lines[0])
    for ln in lines[1:]:
        paragraph.add_run().add_break(WD_BREAK.LINE)
        paragraph.add_run(ln)

def replace_placeholder(document: Document, placeholder: str, new_text: str) -> bool:
    # paragraphs
    for p in document.paragraphs:
        if placeholder in p.text:
            set_paragraph_multiline(p, new_text)
            return True
    # tables
    for tbl in document.tables:
        for row in tbl.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    if placeholder in p.text:
                        set_paragraph_multiline(p, new_text)
                        return True
    return False

def find_all_summary_paragraphs(document: Document, heading_text: str):
    anchors = []
    for p in document.paragraphs:
        if heading_text in p.text:
            anchors.append(p)
    for tbl in document.tables:
        for row in tbl.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    if heading_text in p.text:
                        anchors.append(p)
    return anchors

insert_done = False
if summary_text:
    insert_done = replace_placeholder(doc, PLACEHOLDER, summary_text)

if summary_text and not insert_done:
    anchors = find_all_summary_paragraphs(doc, HEADING_TEXT)
    if anchors:
        # overwrite first and remove extras
        set_paragraph_multiline(anchors[0], summary_text)
        for dup in anchors[1:]:
            dup._element.getparent().remove(dup._element)
    else:
        p = doc.add_paragraph()
        set_paragraph_multiline(p, summary_text)
        # ensure no accidental multiples
        anchors = find_all_summary_paragraphs(doc, HEADING_TEXT)
        for dup in anchors[1:]:
            dup._element.getparent().remove(dup._element)

# =========================
# 7) Save + (optional) debug
# =========================
out_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile2.docx"
doc.save(out_path)
print(f"Updated document written to: {out_path}")

if unmapped_metrics:
    print("NOTE — CSV metrics that couldn't be matched to any row (check your template labels):")
    for m in unmapped_metrics:
        print(" -", m)


# Key Stakeholders

In [None]:
import io, re
import pandas as pd
from docx import Document
from docx.enum.text import WD_BREAK

# =========================
# 0) Your full GPT response
# =========================
gpt_output = r"""
Metric,Shareholders
"Shareholders",n.a.
"Management","Directors (FY24): M Bauer; R Diggelmann — resigned Dec-24; P Dowdy — resigned Feb-25; J Feldman; R Friel — resigned Mar-25; K Murphy; D Newble — resigned Jul-24; A Thorburn; R Walton — appointed Jul-24"
"Lenders",n.a.
"Auditors","Grant Thornton UK LLP (Statutory Auditor); Stephen Wyborn (Senior Statutory Auditor)"
"Advisors","Facility agent: Kroll Agency Services Limited; Bankers: n.a.; Solicitors: n.a.; Financial advisor: n.a."

Summary / Interpretation
- Shareholder information is n.a., indicating no disclosed immediate or ultimate parent in the provided filings.
- Management is represented by the FY24 directors list; specific Chairman/CEO/CFO titles are not provided.
- Lenders are n.a., suggesting no disclosed bank facilities/borrowings in the available excerpts.
- Auditors are identified (Grant Thornton UK LLP; Senior Statutory Auditor Stephen Wyborn), while most other advisors are n.a. except the facility agent (Kroll Agency Services Limited).
- Advisor disclosure is limited (bankers/solicitors/financial advisor n.a.), constraining visibility into counterparties.

SECTION 3 - SOURCES
- [#1] Seaport Topco Limited AA Annual Report (published Sep-25), Directors’ Report for the year ended 31 Dec-24, p.12. Link: https://aiprojectteneo.blob.core.windows.net/companieshousesinglefile/14171962/SEAPORT_TOPCO_LIMITED_AA_annualReport_2025-09-30_0.pdf — used for the “Management” (directors) row.
- [#2] Seaport Topco Limited AA Annual Report (2024-09-25) – provided excerpt indicating Company Information page not available — supports n.a. for certain advisor details.
- [#3] Seaport Topco Limited annual report (FY24), Notes to the Financial Statements – Accounting policies excerpt (page n.a.) — used to check terminology (bank loans/borrowings); lenders n.a. and facility agent reference noted elsewhere.
- [#4] Seaport Topco Limited Annual Report 2024, Independent Auditors’ Report signature block, file: SEAPORT_TOPCO_LIMITED_AA_annualReport_2024-09-25_1.pdf (26 pages), signed Apr-24 — used for “Auditors” identification and to support shareholder n.a.
- [#5] Seaport Topco Limited Annual Report (published Sep-24), Notes to the Financial Statements, Note 16: Fixed asset investments – Indirect subsidiary undertakings (page n.a.) — used to support “Shareholders” n.a. (no parent/ultimate controlling party disclosed).
- [#7] Seaport Topco Limited Annual Report 2024, Independent Auditors’ Report signature block, file: SEAPORT_TOPCO_LIMITED_AA_annualReport_2024-09-25_0.pdf (26 pages), signed Apr-24 — corroborates “Auditors” identification.
"""

# =========================
# 1) Extract CSV + SUMMARY
# =========================
parts = gpt_output.split("\n\nSummary / Interpretation", 1)
csv_block = parts[0].strip()

start = csv_block.find("Metric,")
if start == -1:
    raise ValueError("CSV header 'Metric,' not found in model output.")
csv_block = csv_block[start:]

summary_text = ""
if len(parts) > 1:
    summary_text = "Summary / Interpretation" + parts[1].rstrip()

# =========================
# 2) Parse CSV to DataFrame
# =========================
df = pd.read_csv(io.StringIO(csv_block))
expected_cols = {"Metric", "Shareholders"}
if not expected_cols.issubset(df.columns):
    raise ValueError(f"Key Stakeholders CSV columns missing. Got: {list(df.columns)}")

# Dict: metric -> value (right column)
ks_rows = {
    str(df.at[i, "Metric"]).strip(): str(df.at[i, "Shareholders"]).strip()
    for i in range(len(df))
}

# ==============================================
# 3) Open DOCX, find the "Key Stakeholders" table
# ==============================================
doc_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile (1).docx"
doc = Document(doc_path)

def norm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

def find_ks_table(document: Document):
    # Prefer a table whose header has both "Title" and "Occupants"
    for tbl in document.tables:
        if not tbl.rows:
            continue
        header = [norm(c.text) for c in tbl.rows[0].cells]
        if "title" in header and "occupants" in header:
            return tbl

    # Fallback: first table after the "Key Stakeholders" heading
    found_heading = False
    body = document._element.body
    for child in body.iterchildren():
        tag = child.tag.rsplit("}", 1)[-1]
        if tag == "p":
            p_text = "".join(t.text for t in child.iter()
                             if t.tag.rsplit("}",1)[-1] == "t").strip()
            if norm(p_text) == "keystakeholders":
                found_heading = True
        elif tag == "tbl" and found_heading:
            from docx.table import Table
            return Table(child, document)
    return None

ks_table = find_ks_table(doc)
if ks_table is None:
    raise RuntimeError("Could not locate the 'Key Stakeholders' table (Title | Occupants).")

# ==============================================
# 4) Detect columns: label = "Title", value = "Occupants"
# ==============================================
def get_title_and_occupants_cols(tbl):
    # Defaults for a 2-col layout
    label_col, value_col = 0, 1

    if tbl.rows:
        header_norm = [norm(c.text) for c in tbl.rows[0].cells]
        if "title" in header_norm:
            label_col = header_norm.index("title")
        if "occupants" in header_norm:
            value_col = header_norm.index("occupants")

    # Ensure they are different; if not, force value_col to the other col
    if value_col == label_col and len(tbl.rows[0].cells) >= 2:
        value_col = 1 if label_col == 0 else 0
    return label_col, value_col

label_col, value_col = get_title_and_occupants_cols(ks_table)

# ==============================================
# 5) Build row index using the Title (label) column
# ==============================================
row_index = {}
for r_idx, row in enumerate(ks_table.rows):
    if not row.cells:
        continue
    # Skip header
    if r_idx == 0:
        continue
    label_text = row.cells[label_col].text.strip()
    if label_text:
        row_index[norm(label_text)] = r_idx

# ==============================================
# 6) Map CSV metric names → Title labels
# ==============================================
def keynorm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

metric_to_title = {
    keynorm("Shareholders"): "Shareholders",
    keynorm("Management"):  "Management",
    keynorm("Lenders"):     "Lenders",
    keynorm("Auditors"):    "Auditors",
    keynorm("Advisors"):    "Advisors",
}

# ==============================================
# 7) Populate the Occupants column ONLY
# ==============================================
not_found = []
for metric, value in ks_rows.items():
    title_label = metric_to_title.get(keynorm(metric), metric)
    r_idx = row_index.get(norm(title_label))
    if r_idx is None:
        not_found.append(metric)
        continue
    # write into Occupants cell
    ks_table.rows[r_idx].cells[value_col].text = value

if not_found:
    print("WARNING — missing rows for:", ", ".join(not_found))

# ==============================================
# 8) Replace the placeholder with the KS SUMMARY text (optional)
#     (placeholder: [INSERT KEY STAKEHOLDERS SUMMARY])
# ==============================================
KS_PLACEHOLDER = "[INSERT KEY STAKEHOLDERS SUMMARY]"

def set_paragraph_multiline(paragraph, text: str):
    for run in paragraph.runs:
        run.text = ""
    lines = (text or "").splitlines()
    if not lines:
        return
    paragraph.add_run(lines[0])
    for ln in lines[1:]:
        paragraph.add_run().add_break(WD_BREAK.LINE)
        paragraph.add_run(ln)

def replace_placeholder(document: Document, placeholder: str, new_text: str) -> bool:
    # paragraphs
    for p in document.paragraphs:
        if placeholder in p.text:
            set_paragraph_multiline(p, new_text)
            return True
    # cells
    for tbl in document.tables:
        for row in tbl.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    if placeholder in p.text:
                        set_paragraph_multiline(p, new_text)
                        return True
    return False

if summary_text:
    ok = replace_placeholder(doc, KS_PLACEHOLDER, summary_text)
    if not ok:
        print("NOTE: placeholder not found:", KS_PLACEHOLDER)

# ==============================================
# 9) Save
# ==============================================
out_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile2.docx"
doc.save(out_path)
print(f"Updated document written to: {out_path}")


# Financial Performance

In [None]:
import io, re
import pandas as pd
from docx import Document
from docx.enum.text import WD_BREAK

# =========================
# 0) Your full GPT response
# =========================
gpt_output = finance_output

# =========================
# 1) Extract CSV + SUMMARY
# =========================
# Split off the summary block (everything after the blank line + heading)
parts = gpt_output.split("\n\nSummary / Interpretation", 1)
csv_block = parts[0].strip()

# Ensure we start at the CSV header
start = csv_block.find("Metric,")
if start == -1:
    raise ValueError("CSV header 'Metric,' not found in model output.")
csv_block = csv_block[start:]

# Summary text (keep heading + bullets if present)
summary_text = ""
if len(parts) > 1:
    summary_text = "Summary / Interpretation" + parts[1].rstrip()

# =========================
# 2) Parse CSV to DataFrame
# =========================
df = pd.read_csv(io.StringIO(csv_block))
expected_cols = {"Metric","FY24","FY23","FY22"}
if not expected_cols.issubset(df.columns):
    raise ValueError(f"CSV columns missing. Got: {list(df.columns)}")

# Dict: metric -> {FY24, FY23, FY22}
csv_rows = {
    str(df.at[i, "Metric"]).strip(): {
        "FY24": df.at[i, "FY24"],
        "FY23": df.at[i, "FY23"],
        "FY22": df.at[i, "FY22"],
    }
    for i in range(len(df))
}

# ==============================================
# 3) Open DOCX, find the Financial Performance table
# ==============================================
doc_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile (1).docx"
doc = Document(doc_path)

def norm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

def find_fin_perf_table(document: Document):
    # Heuristic 1: find a table whose header row contains FY24/FY23/FY22
    for tbl in document.tables:
        if len(tbl.rows):
            header = " ".join(c.text for c in tbl.rows[0].cells)
            if all(x in norm(header) for x in ["fy24","fy23","fy22"]):
                return tbl
    # Heuristic 2: first table after a paragraph exactly "Financial Performance"
    found_heading = False
    body = document._element.body
    for child in body.iterchildren():
        tag = child.tag.rsplit("}", 1)[-1]
        if tag == "p":
            p_text = "".join(t.text for t in child.iter() if t.tag.rsplit("}",1)[-1] == "t").strip()
            if norm(p_text) == "financialperformance":
                found_heading = True
        elif tag == "tbl" and found_heading:
            from docx.table import Table
            return Table(child, document)
    return None

table = find_fin_perf_table(doc)
if table is None:
    raise RuntimeError("Could not locate the 'Financial Performance' table.")

# ==============================================
# 4) Map CSV metric names → rows in the template
# ==============================================
# --- Normalizers (use same rule for CSV and map keys) ---
def keynorm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())

# Map CSV metric -> DOC row label (left column text in your template)
# Use normalized keys on the left so variations in spaces/slashes/plus signs won't break it.
metric_map_norm = {
    keynorm("Revenue (Turnover)"): "Revenue",
    keynorm("Revenue growth % (yoy)"): "Revenue Growth",
    keynorm("Gross profit"): "Gross Profit",
    keynorm("Gross margin %"): "Gross Margin",
    keynorm("EBITDA"): "EBITDA",
    keynorm("EBITDA margin %"): "EBITDA Margin",
    keynorm("Adjusted EBITDA"): "Adjusted EBITDA",

    # >>> The ones you said aren't populating <<<
    keynorm("Capex (tangible+intangible)"): "CAPEX",  # DOC label
    keynorm("Capex (tangible + intangible)"): "CAPEX",  # alt form (spaces)
    keynorm("Net Working Capital (change)"): "NET_WORK",
    keynorm("Cash Flow from Financing Activities (net)"): "Cash Flow from Financing Activities",
    keynorm("Net cash from financing activities"): "CASH_FINAN",  # alt wording
    keynorm("Total Debt (external)"): "TOTAL_DEBT",
    keynorm("Total debt (bank + lease liabilities)"): "TOTAL_DEBT",  # alt wording
    keynorm("Leverage (Net Debt/EBITDA)"): "LEVERAGE",
    keynorm("Leverage (Net Debt / EBITDA)"): "LEVERAGE",

    # Other cash flow items you have
    keynorm("Net cash from operating activities"): "Cash Flow from Operating Activities",
    keynorm("Net Working Capital (change)"): "Net Working Capital",  # duplicate on purpose (case variant)
    keynorm("Operating cash flow excl. NWC"): "Cash Flow from Operating Activities excl. Net Working Capital",
    keynorm("Other Cash Flow from Investing Activities"): "Other Cash Flow from Investing Activities",
    keynorm("Net cash from investing activities"): "Net Cash Flow from Investing Activities",
    keynorm("CFADS"): "CFADS",
    keynorm("Opening Cash"): "Opening Cash",
    keynorm("Change in Cash"): "Change in Cash",
    keynorm("Closing Cash"): "Closing Cash",
    keynorm("Bank loans outstanding"): "Total Debt",  # if your template has separate row, adjust
    keynorm("Net Debt"): "Net Debt",
}

# Build lookup of row labels in the DOCX (first column). You already did:
doc_row_index = {}
for r_idx, row in enumerate(table.rows):
    label = row.cells[0].text.strip()
    if label:
        doc_row_index[norm(label)] = r_idx  # norm = your doc normalizer (same idea as keynorm)

# Identify FY columns in header row (handles "FY 24" vs "FY24")
header_norm = [norm(c.text) for c in table.rows[0].cells]
try:
    col_FY24 = header_norm.index("fy24")
    col_FY23 = header_norm.index("fy23")
    col_FY22 = header_norm.index("fy22")
except ValueError:
    # If header is the second row in your template, try that
    header_norm = [norm(c.text) for c in table.rows[1].cells]
    col_FY24 = header_norm.index("fy24")
    col_FY23 = header_norm.index("fy23")
    col_FY22 = header_norm.index("fy22")


# Populate table
not_found = []

for csv_metric, years in csv_rows.items():
    # Find the DOC row label using normalized CSV metric text
    target_label = metric_map_norm.get(keynorm(csv_metric), csv_metric)  # fall back to same text
    r_idx = doc_row_index.get(norm(target_label))  # norm() is your existing doc normalizer
    if r_idx is None:
        not_found.append(csv_metric)
        continue

    row = table.rows[r_idx]
    row.cells[col_FY24].text = str(years["FY24"])
    row.cells[col_FY23].text = str(years["FY23"])
    row.cells[col_FY22].text = str(years["FY22"])

# ==============================================
# 5) Populate the table from CSV
# ==============================================
not_found = []

for csv_metric, years in csv_rows.items():
    target_label = metric_map_norm.get(csv_metric, csv_metric)
    r_idx = doc_row_index.get(norm(target_label))
    if r_idx is None:
        not_found.append(csv_metric)
        continue

    row = table.rows[r_idx]
    row.cells[col_FY24].text = str(years["FY24"])
    row.cells[col_FY23].text = str(years["FY23"])
    row.cells[col_FY22].text = str(years["FY22"])

# ==============================================
# 6) Replace the placeholder with the SUMMARY text
#     (placeholder: [INSERT FINANCIAL PERFORMANCE SUMMARY])
# ==============================================
PLACEHOLDER = "[INSERT FINANCIAL PERFORMANCE SUMMARY]"

def set_paragraph_multiline(paragraph, text: str):
    # clear runs
    for run in paragraph.runs:
        run.text = ""
    # add lines with explicit line breaks
    lines = text.splitlines()
    if not lines:
        return
    paragraph.add_run(lines[0])
    for ln in lines[1:]:
        paragraph.add_run().add_break(WD_BREAK.LINE)
        paragraph.add_run(ln)

def replace_placeholder(document: Document, placeholder: str, new_text: str) -> bool:
    # search in paragraphs
    for p in document.paragraphs:
        if placeholder in p.text:
            set_paragraph_multiline(p, new_text)
            return True
    # search inside tables (cells contain their own paragraphs)
    for tbl in document.tables:
        for row in tbl.rows:
            for cell in row.cells:
                for p in cell.paragraphs:
                    if placeholder in p.text:
                        set_paragraph_multiline(p, new_text)
                        return True
    return False

if summary_text:
    ok = replace_placeholder(doc, PLACEHOLDER, summary_text)
    if not ok:
        print("WARNING: placeholder not found:", PLACEHOLDER)
else:
    print("NOTE: No summary text found in GPT output (no 'Summary / Interpretation' section).")

# ==============================================
# 7) Save
# ==============================================
out_path = "/Users/felipesilverio/Documents/GitHub/Azure-OnePager/CompanyProfile2.docx"
doc.save(out_path)
print(f"Updated document written to: {out_path}")

if not_found:
    print("WARNING — CSV metrics not matched to any row:")
    for m in not_found:
        print(" -", m)
