# Preprocessing

In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the API keys used by the OpenAI clients further down — confirm).
load_dotenv()

True

In [2]:
# Extracting the data 

from unstructured.partition.pdf import partition_pdf

# `output_path` is only referenced by the commented-out `image_output_dir_path`
# option below.
output_path = ''
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
file_path = '/Users/felipesilverio/Documents/GitHub/LangChainTest/test2.pdf'

# Partition the PDF into elements and group them into title-based chunks.
chunks = partition_pdf(
    filename= file_path,
    infer_table_structure=True, # extract table structure as well as table text
    strategy = 'hi_res', # original author's note: required for table inference

    extract_image_block_types=['Image'], # add 'Table' to the list to also extract tables as images
    # image_output_dir_path = output_path, # optional: directory to write extracted images to

    extract_image_block_to_payload=True, # embed extracted images as base64 in element metadata

    chunking_strategy='by_title', # alternative: 'basic'
    max_characters=10000, # hard cap per chunk (library default is 500)
    # NOTE(review): the two comments below used to say "default is 0"; the
    # unstructured chunking docs describe both defaults as following
    # `max_characters` — confirm against the installed version.
    combine_text_under_n_chars=2000, # merge small sections until they reach this size
    new_after_n_chars=6000, # soft cap: start a new chunk after this many characters
)

# Display the resulting list of chunks (one repr per chunk).
chunks

  from .autonotebook import tqdm as notebook_tqdm
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


[<unstructured.documents.elements.CompositeElement at 0x307a772e0>,
 <unstructured.documents.elements.CompositeElement at 0x307a77430>,
 <unstructured.documents.elements.CompositeElement at 0x307a77700>,
 <unstructured.documents.elements.CompositeElement at 0x307a77190>,
 <unstructured.documents.elements.CompositeElement at 0x307a77730>,
 <unstructured.documents.elements.CompositeElement at 0x307a770a0>,
 <unstructured.documents.elements.CompositeElement at 0x307a776a0>,
 <unstructured.documents.elements.CompositeElement at 0x307a77790>,
 <unstructured.documents.elements.CompositeElement at 0x307a77490>,
 <unstructured.documents.elements.CompositeElement at 0x307a77a30>,
 <unstructured.documents.elements.CompositeElement at 0x307a775e0>,
 <unstructured.documents.elements.CompositeElement at 0x307a77ac0>,
 <unstructured.documents.elements.CompositeElement at 0x307a779d0>,
 <unstructured.documents.elements.CompositeElement at 0x307a77d30>,
 <unstructured.documents.elements.CompositeEleme

In [6]:
# Unpack every composite chunk into its original elements and sort those
# elements into three buckets based on the element's type name.
tables, texts, images = [], [], []

for chunk in chunks:
    # Chunks whose type name does not contain "CompositeElement" are ignored.
    if "CompositeElement" not in str(type(chunk)):
        continue
    for element in chunk.metadata.orig_elements:
        type_repr = str(type(element))
        if "Table" in type_repr:
            bucket = tables
        elif "Image" in type_repr:
            bucket = images
        else:
            bucket = texts
        bucket.append(element)

In [7]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt used to summarise each extracted element. `{element}` is the only
# template variable.
# NOTE(review): the prompt contains typos ("especialist", "summart"); they are
# left untouched here because the prompt text is sent to the model at runtime.
prompt_text = """
You are an especialist in corporate finance tasked with summarizing the text, tables and images.
Give a concise summary of the table, text or image.
The tables will be received in format html. Transform this format in order to interpret the table.

The summary must take special attention to financial-related numbers and statistics, such as monthly or yearly comparisons, debt/loan information, and other subjects related.
The summary must contain the numerical information of debt, loan, revenue, deficit, and other related topics.
Always mention in the summary from which of the blocks the content being summarized is part of (Introduction Table, Business Overview, Revenue Split, Key Stakeholders Table, Financial Highlights, Capital Structure)
Response only with the summary, no additional comment. 
Do not start your message by saying "Here is a summary" or anything like that. 
Just give the summart as it is.

Table or text chunk of Tour Partner Groups: {element}
"""

prompt = ChatPromptTemplate.from_template(prompt_text)

# Summarisation model; the Groq model below is a commented-out alternative.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0.2,)
# model = ChatGroq(temperature=0.5, model='llama-3.1-8b-instant')
# The chain input is passed through unchanged as `element`, then rendered into
# the prompt, sent to the model, and reduced to a plain string.
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [8]:
# Summarise every text element, with at most 3 requests in flight at once.
text_summaries = summarize_chain.batch(texts, {'max_concurrency':3})

# Same for tables. The `images` bucket is not summarised in this cell.
# NOTE(review): the prompt announces HTML tables, but the raw element is what
# gets passed in — confirm whether the element's HTML form was intended.
table_summaries = summarize_chain.batch(tables, {'max_concurrency':3})


In [9]:
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

# Vector store that holds the embedded summaries.
vectorstore = Chroma(collection_name='multi_modal_rag', embedding_function=OpenAIEmbeddings())

# Storage layer for the original (un-summarised) elements, kept in memory only.
store = InMemoryStore()
# Metadata key that links a summary in the vector store to its original in `store`.
id_key = 'doc_id'

# Retriever: searches the summaries, returns the linked originals.

dense_retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
    search_kwargs={"k": 30},          # top-30 neighbours
)

In [35]:
#Loading values

def _index_summaries(originals, summaries):
    """Index `summaries` in the vector store and keep `originals` in the docstore.

    Every original gets a fresh UUID. Its summary is added to
    `dense_retriever.vectorstore` with that UUID stored under `id_key` in the
    metadata, and the original itself is written to `dense_retriever.docstore`
    under the same UUID.

    Parameters:
    - originals: the source elements that were summarised.
    - summaries: one summary string per original, in the same order.

    Returns:
    - (ids, summary_docs): the generated UUID strings and the summary Documents.

    Raises:
    - ValueError: if `originals` and `summaries` differ in length, because the
      pairing between a summary and its source would then be wrong.
    """
    originals = list(originals)
    summaries = list(summaries)
    if len(originals) != len(summaries):
        raise ValueError(
            f"Got {len(summaries)} summaries for {len(originals)} elements; "
            "they must be the same length to be paired."
        )

    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: element_id})
        for element_id, summary in zip(ids, summaries)
    ]

    # Skip the writes when there is nothing to add (e.g. a PDF without tables).
    # NOTE(review): Chroma appears to reject empty upserts — confirm for the
    # installed version; skipping is harmless either way.
    if summary_docs:
        dense_retriever.vectorstore.add_documents(summary_docs)
        dense_retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# NOTE: fresh UUIDs are generated on every run, so re-executing this cell adds
# another copy of every summary to the collection.

# Add texts
doc_ids, summary_texts = _index_summaries(texts, text_summaries)

# Add tables
tables_ids, summary_tables = _index_summaries(tables, table_summaries)


In [36]:
from langchain.retrievers import EnsembleRetriever
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from base64 import b64decode
from prompts import system_finance_prompt

# ------------------------------------------------------------------ #
# 1.  lexical (sparse) retriever: BM25 over the docstore contents    #
# ------------------------------------------------------------------ #
keys = list(store.yield_keys())      # ['id1', 'id2', ...]
raw_items = store.mget(keys)

# BM25 needs Documents; anything else is wrapped using its `.text` attribute
# when present, or its string form otherwise.
all_docs = []
for entry in raw_items:
    if not isinstance(entry, Document):
        entry = Document(page_content=getattr(entry, "text", str(entry)))
    all_docs.append(entry)

bm25 = BM25Retriever.from_documents(all_docs)
bm25.k = 20                                            # top-20 keyword hits

# ------------------------------------------------------------------ #
# 2.  combine sparse + dense                                         #
# ------------------------------------------------------------------ #
hybrid = EnsembleRetriever(
    retrievers=[bm25, dense_retriever],
    weights=[0.4, 0.6],               # order matches `retrievers`: sparse, dense
)


# RAG

In [54]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from base64 import b64decode
from prompts import system_finance_prompt
from base64 import b64decode, b64decode as _b64
import re

# ── helper that treats *any* object (str • Document • unstructured element) ──
def _to_str(obj) -> str:
    if isinstance(obj, str):
        return obj
    if hasattr(obj, "page_content"):
        return obj.page_content  # LangChain Document
    if hasattr(obj, "text"):
        return obj.text          # unstructured element
    return str(obj)

# Quick base‑64 heuristic (≥40 chars & charset check)
_B64_RE = re.compile(r"^[A-Za-z0-9+/]+={0,2}$")

def _looks_like_b64(s: str) -> bool:
    if len(s) < 40 or not _B64_RE.fullmatch(s.strip()):
        return False
    try:
        _b64(s, validate=True)
        return True
    except Exception:
        return False

# ---------------------------------------------------------------------------
# 1 · Split retrieved docs into base64 images and plain-text strings
# ---------------------------------------------------------------------------

def parse_docs(docs):
    """Partition retrieved items into images and texts.

    Each item is converted to a string with `_to_str`; strings that
    `_looks_like_b64` accepts go to "images", everything else to "texts".
    Relative order is preserved within each list.

    Returns:
    - dict with the keys "images" and "texts", each a list of strings.
    """
    parsed = {"images": [], "texts": []}
    for doc in docs:
        content = _to_str(doc)
        bucket = "images" if _looks_like_b64(content) else "texts"
        parsed[bucket].append(content)
    return parsed

# ---------------------------------------------------------------------------
# 2 · Assemble the chat prompt (OpenAI vision-style content list)
# ---------------------------------------------------------------------------

def build_prompt_two(kwargs) -> ChatPromptTemplate:
    """Build the chat prompt for one question.

    Parameters:
    - kwargs: mapping with "context" (a dict that may hold "texts" and
      "images" lists, as produced by `parse_docs`) and "question".

    Returns:
    - A ChatPromptTemplate made of the finance system message and one human
      message. The human message holds a single text part (context list plus
      question) followed by one `image_url` part per base64 image.
    """
    context = kwargs["context"]
    user_question = kwargs["question"]

    text_part = {
        "type": "text",
        "text": f"Context:{context.get('texts', [])}, Question: {user_question}",
    }
    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }
        for encoded in context.get("images", [])
    ]

    return ChatPromptTemplate.from_messages([
        SystemMessage(content=system_finance_prompt),
        HumanMessage(content=[text_part, *image_parts]),
    ])

def _retrieval_inputs():
    """Build the {'context', 'question'} input mapping used by both chains.

    'context' runs the hybrid retriever and parses its hits; 'question'
    forwards the chain input unchanged.
    """
    return {
        'context': hybrid | RunnableLambda(parse_docs),
        'question': RunnablePassthrough(),
    }


def _answer_steps():
    """Build the prompt → o3 model → string-parser tail used by both chains."""
    return RunnableLambda(build_prompt_two) | ChatOpenAI(model='o3') | StrOutputParser()


# Returns only the answer string.
chain = _retrieval_inputs() | _answer_steps()

# Returns the inputs ('context', 'question') plus the answer under 'response'.
chain_with_sources = _retrieval_inputs() | RunnablePassthrough().assign(
    response=_answer_steps()
)

In [40]:
# 1. snapshot the keys currently in the store and fetch all values in one call
keys = list(store.yield_keys())                 # ['uuid1', 'uuid2', …]
stored_values = store.mget(keys)

# 2. keep Documents as they are; wrap anything else (Title / NarrativeText /
#    Table …) in a Document built from its `.text`, or its string form
fixed_pairs = [
    (
        key,
        value
        if isinstance(value, Document)
        else Document(page_content=getattr(value, "text", str(value))),
    )
    for key, value in zip(keys, stored_values)
]

# 3. write the normalised entries back under the same keys
store.mset(fixed_pairs)


In [38]:
# Debug aid: count the stored vectors through Chroma's underscore-prefixed
# `_collection` attribute (not part of the public wrapper API).
print("Total embeddings in collection:", len(vectorstore._collection.get()["ids"]))


Total embeddings in collection: 1509


In [71]:
# Debug aid: show the first five components of the first stored embedding.
vectorstore._collection.get(include=["embeddings"])["embeddings"][0][:5]

array([-0.01645603, -0.0205357 ,  0.01218406, -0.01418955, -0.02765108])

In [41]:
# Smoke-test the hybrid retriever. `invoke` replaces the deprecated
# `get_relevant_documents`, and is the same entry point the chains use when
# the retriever is piped.
docs = hybrid.invoke("Tour Partner Group Limited")
print(len(docs), "docs retrieveda")
# Preview the first three hits (first 120 characters each).
for d in docs[:3]:
    print(d.page_content[:120], "…")

12 docs retrieveda
TOUR PARTNER GROUP LIMITED …
Tour Partner Group International Ltd * …
The principal activity of the Company remains that of an intermediate holding company for its investments. The principal …


In [46]:
# Earlier, longer prompt variant (disabled):
# my_prompt = 'Make the company profile of Tour Partner Group Limited. Even if it is not available: revenue split by geography or segment, full cash-flow statement or EBITDA reconciliation, or other problems.'
my_prompt = """ Make the company profile of Tour Partner Group Limited. """
# `chain_with_sources` returns a dict: the retrieved context, the question, and
# the model answer under 'response'.
profile = chain_with_sources.invoke(my_prompt)
# Prints the whole dict, including every retrieved context string.
print(profile)

{'context': {'images': [], 'texts': ['TOUR PARTNER GROUP LIMITED', 'Tour Partner Group UK Ltd', 'During the year the company had net cash movements amounting to €4,244,756 (2022: €394,897) with Tour Partner Group Holdco Limited, a parent company established in Guernsey. At the year end the company owed €4,830,612 (2022: €585,856) to Tour Partner Group Holdco Limited', 'Tour Partner Group Nordics ApS**', 'Tour Partner Group International Ltd *', '* owned 100% by Tour Partner Group UK Ltd', 'Tour Partner Group Limited is a private company limited by shares incorporated in England and Wales. The address of its registered office is given in the company information page of these financial statements.', 'The largest and smallest group producing publicly available consolidated financial statements is headed by Tour Partner Group Midco Limited. These financial statements are available upon request from Companies House, Crown Way, Cardiff, CF14 3VZ.', 'Tour Partner ApS*', 'The directors of Tour

In [47]:
# Show only the model's answer, without the retrieved context.
print(profile['response'])

Tour Partner Group Limited – Preliminary Restructuring Screen  
(All monetary figures in €, unless stated; Sources hyper-linked where possible)

INTRODUCTION (Company Snapshot)
• Primary Industry …………………  Travel & Tourism  
• Incorporation Year ………………  1999 (Companies House filing)  
• Headquarters ……………………  London, UK (Companies House filing)  
• Employees (FY23) ………………  101  (Tour Partner Group Midco Ltd FY23 AR, p. 24)  
Operational KPIs (FY23, Midco consolidated)  
• Group tours operated ……… 5,200  
• Travellers served ……………… 150,000  
• Source markets ………………… 40+  

BUSINESS OVERVIEW  (bullets)
- Tour Partner Group delivers inbound, B2B-only group travel programmes into the UK, Ireland, the Nordics and Continental Europe, acting as a destination management company (“DMC”).  
- The company assembles multi-day, custom itineraries for tour operators, wholesalers and online travel agents, providing contracting, operations and on-the-ground services.  
- It leverages long-standing supp

In [56]:
# Focused question about the two term-loan facilities.
# NOTE: this rebinds `profile`; any later cell that reads `profile["response"]`
# gets this answer rather than the company profile generated earlier.
my_prompt = """ Write a summary that includes all financial information in a detailed manner related to Facility A and Facility B """
profile = chain_with_sources.invoke(my_prompt)
print(profile['response'])

Detailed Summary of Term Loan Facilities A & B  
(Source: FY23 Annual Report – Note “Term loan facilities A & B”; Directors’ Report; Risk Management Note)

Facility A  
• Principal outstanding at 31-Dec-23: £2.375 m (€2.739 m) (31-Dec-22: £2.375 m / €2.681 m).  
• Interest: SONIA + 3.5 % per annum, cash-payable quarterly.  
• Original amortisation schedule (agreed during pandemic amendment): first repayment of €850 k (c.£0.7 m) due Dec-24, with the remaining balance due Jun-25.  
• Post-year-end rescheduling (agreed Feb-24): £0.425 m due Feb-25, £0.425 m due Mar-25, balance repayable 30-Jun-25.  
• Issue costs: total €741 k (£617.5 k) deducted from initial carrying value; un-amortised balance at 31-Dec-23 €129.6 k (31-Dec-22 €217.9 k). These costs are amortised through the Statement of Comprehensive Income using the effective-interest method.

Facility B  
• Principal outstanding at 31-Dec-23: £8.800 m (€10.149 m) (31-Dec-22: £8.800 m / €9.934 m).  
• Interest: SONIA + 4.0 % per annum,

# PDF and PPT

In [48]:
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from pptx import Presentation
from pptx.util import Inches
from PyPDF2 import PdfReader


In [49]:
def text_to_pdf_unicode(text: str, output_path: str, allow_markup: bool = False):
    """
    Creates a PDF file from plain text using ReportLab's Platypus framework,
    one paragraph per input line.

    Paragraph parses its input as XML-like markup, so by default the text is
    escaped first; otherwise characters such as '<', '>' and '&' in ordinary
    prose (e.g. "<5%") can be misread as tags or rejected.

    Which characters actually render depends on the font of the 'BodyText'
    sample style; glyphs that font lacks (emoji, for instance) will not be
    drawn correctly.

    Parameters:
    - text: the string content to write into the PDF.
    - output_path: full file path where the PDF will be saved.
    - allow_markup: when True, pass each line to Paragraph unescaped so that
      ReportLab inline tags (e.g. <b>, <i>) are interpreted. Defaults to False.
    """
    # Stdlib helper; imported locally so the function does not depend on
    # another cell's imports.
    from xml.sax.saxutils import escape

    # 1. Prepare the document
    doc = SimpleDocTemplate(output_path, pagesize=letter)
    body_style = getSampleStyleSheet()['BodyText']

    # 2. Build a "story" of flowable objects
    story = []
    for line in text.split('\n'):
        content = line if allow_markup else escape(line)
        # Empty lines become a single space so they still yield a paragraph
        story.append(Paragraph(content or ' ', body_style))
        # Small spacer between lines
        story.append(Spacer(1, 4))

    # 3. Generate the PDF
    doc.build(story)

# Answer text of whichever `chain_with_sources` call last assigned `profile`.
profile_text = profile["response"]  # fetched from your chain output
# NOTE(review): this writes company_profile_unicode5.pdf, whereas the PdfReader
# cell further down opens company_profile_unicode.pdf — confirm which file is
# meant to be read back. The absolute path is also machine-specific.
text_to_pdf_unicode(profile_text, "/Users/felipesilverio/Documents/GitHub/LangChainTest/output/company_profile_unicode5.pdf")


In [36]:
from pptx.util import Pt
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from typing import Any, Dict, List, Optional

def replace_placeholder_text(file_path: str,placeholder: str,replacement: str,output_path: Optional[str] = None,font_size_pt: Optional[float] = 5) -> None:
    """
    Replace occurrences of `placeholder` in text runs with `replacement`.

    Every slide is scanned; shapes without a text frame are skipped. Matching
    is done run by run, so a placeholder that is split across several runs of
    a paragraph is not found.

    Parameters:
    - file_path: presentation to read.
    - placeholder: non-empty text to look for.
    - replacement: text that replaces each occurrence.
    - output_path: where to save; when omitted, `file_path` is overwritten.
    - font_size_pt: font size (in points) applied to every run that was
      changed; pass None to leave the run's size untouched. Defaults to 5.

    Raises:
    - ValueError: if `placeholder` is empty.
    - IOError: if the presentation cannot be opened or saved.
    """
    # An empty placeholder is "contained" in every run, and str.replace("", x)
    # inserts x between all characters, which would corrupt the whole deck.
    if not placeholder:
        raise ValueError("placeholder must be a non-empty string")

    try:
        prs = Presentation(file_path)
    except Exception as e:
        raise IOError(f"Unable to open file {file_path}: {e}") from e

    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    if placeholder in run.text:
                        run.text = run.text.replace(placeholder, replacement)
                        if font_size_pt is not None:
                            run.font.size = Pt(font_size_pt)

    save_path = output_path or file_path
    try:
        prs.save(save_path)
    except Exception as e:
        raise IOError(f"Unable to save updated file to {save_path}: {e}") from e


In [39]:
from PyPDF2 import PdfReader

import warnings

# NOTE(review): the export cell writes company_profile_unicode5.pdf, but this
# opens company_profile_unicode.pdf — confirm this is the intended file.
reader = PdfReader('/Users/felipesilverio/Documents/GitHub/LangChainTest/output/company_profile_unicode.pdf')

# Extract text page by page. Extraction is best effort: a page that fails
# contributes an empty string, but the failure is reported instead of being
# silently dropped.
all_text = []
for page_num, page in enumerate(reader.pages, start=1):
    try:
        text = page.extract_text() or ''
    except Exception as exc:
        warnings.warn(f"Could not extract text from page {page_num}: {exc}")
        text = ''
    all_text.append(text)

# One string for the whole document, pages separated by a newline.
pdf_retrieved = "\n".join(all_text)
pdf_retrieved

'Company Profile – Tour Partner Group Limited\nSources utilised:\n(1) Companies House – “Tour Partner Group Limited, Annual Report & Financial Statements for the\nyear-ended 31-Dec-22”, filed 29-Sep-23 (pdf)\n(2) Tour Partner Group website – “About Us” page, accessed 07-Jun-24\n--------------------------------------------------\n1. Company Snapshot\nPrimary Industry: Travel & Tourism\nIncorporation Year: 2016\nHeadquarters: London, United Kingdom\nEmployees: 230 (FY22, Companies House filing, note 6)\nOperational KPIs (FY22)\n\x7f Passengers handled: 350,000\n\x7f Bed-nights booked: 710,000\n\x7f Source markets served: 70+\n(Sources: (1) Directors’ report & strategic review; (2) corporate website)\n--------------------------------------------------\n2. Business Overview (bullets only)\n\x7f Tour Partner Group Limited is an intermediate holding company that consolidates a portfolio of B2B\ndestination management and group travel brands serving the UK & Ireland, the Nordics and\nContinen

In [47]:
# Output deck, and the untouched template it is generated from.
ppt_path = '/Users/felipesilverio/Documents/GitHub/LangChainTest/try2.pptx'
template_path = '/Users/felipesilverio/Documents/GitHub/LangChainTest/backupppt.pptx'

# --- Company Snapshot ---------------------------------------------------------
my_prompt = f"""
The following text contains a series of text blocks that are separated by multiples - like:
--------------------------------------------------------------------------------------------------------------------

Bring me back only the code block relative to 1. Company Snapshot, and give me all the information in it in a single line but separated by |

The text is:

{pdf_retrieved}
"""
response = chain_with_sources.invoke(my_prompt)
# First replacement: read the template, write the output deck.
replace_placeholder_text(template_path, 'Company Snapshot', response['response'], ppt_path)

# --- Business Overview --------------------------------------------------------
my_prompt = f"""
The following text contains a series of text blocks that are separated by multiples - like:
--------------------------------------------------------------------------------------------------------------------

Bring me back only the code block relative to Business Overview, and give me back exactly as is in the text.

The text is:

{pdf_retrieved}
"""
response = chain_with_sources.invoke(my_prompt)
# Second replacement: continue from the deck written above. Reading the
# template again would overwrite `ppt_path` with a copy that still contains
# the 'Company Snapshot' placeholder, discarding the first replacement.
replace_placeholder_text(ppt_path, 'Business Overview Text', response['response'], ppt_path)
