# Preprocessing

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# Extracting the data 

from unstructured.partition.pdf import partition_pdf

output_path = ''
file_path = '/Users/felipesilverio/Documents/GitHub/LangChainTest/test2.pdf'

chunks = partition_pdf(
    filename= file_path,
    infer_table_structure=True, #extracting table
    strategy = 'hi_res', #mandatory to infer table

    extract_image_block_types=['Image'], #add 'Table' to list to extract image of tables
    # image_output_dir_path = output_path, #if None, images and tables will be saved as base64

    extract_image_block_to_payload=True, #if true, extract base64 for API usage

    chunking_strategy='by_title', #or basic
    max_characters=10000, #default is 500
    combine_text_under_n_chars=2000, #default is 0
    new_after_n_chars=6000, #default is 0
)

chunks

  from .autonotebook import tqdm as notebook_tqdm
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


[<unstructured.documents.elements.CompositeElement at 0x30d3ef250>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef550>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef760>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef280>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef700>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef880>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef850>,
 <unstructured.documents.elements.CompositeElement at 0x30d3efac0>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef820>,
 <unstructured.documents.elements.CompositeElement at 0x30d3efa90>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef130>,
 <unstructured.documents.elements.CompositeElement at 0x30d3ef9a0>,
 <unstructured.documents.elements.CompositeElement at 0x30d3efbb0>,
 <unstructured.documents.elements.CompositeElement at 0x30d3efd90>,
 <unstructured.documents.elements.CompositeEleme

In [3]:
tables, texts, images= [], [], []

for chunk in chunks:
    if "CompositeElement" in str(type(chunk)):
        chunk_els = chunk.metadata.orig_elements
        for el in chunk_els:
            if "Table" in str(type(el)):
                tables.append(el)
            elif "Image" in str(type(el)):
                images.append(el)
            else:
                texts.append(el)

In [37]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

prompt_text = """
You are an especialist in corporate finance tasked with summarizing the text, tables and images.
Give a concise summary of the table, text or image.
The tables will be received in format html. Transform this format in order to interpret the table.

The summary must take special attention to financial-related numbers and statistics, such as monthly or yearly comparisons, debt/loan information, and other subjects related.
The summary must contain the numerical information of debt, loan, revenue, deficit, and other related topics.
Always mention in the summary from which of the blocks the content being summarized is part of (Introduction Table, Business Overview, Revenue Split, Key Stakeholders Table, Financial Highlights, Capital Structure)
Response only with the summary, no additional comment. 
Do not start your message by saying "Here is a summary" or anything like that. 
Just give the summart as it is.

Table or text chunk of Tour Partner Groups: {element}
"""

prompt = ChatPromptTemplate.from_template(prompt_text)

model = ChatOpenAI(model="gpt-4o-mini", temperature=0.2,)
# model = ChatGroq(temperature=0.5, model='llama-3.1-8b-instant')
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [5]:
text_summaries = summarize_chain.batch(texts, {'max_concurrency':3})


In [6]:
table_summaries = summarize_chain.batch(tables, {'max_concurrency':3})


In [47]:
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

vectorstore = Chroma(collection_name='multi_modal_rag', embedding_function=OpenAIEmbeddings())

#Storage layor
store = InMemoryStore()
id_key = 'doc_id'

#retriever

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
    search_kwargs={"k":30}
)

In [48]:
#Loading values

doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=summary, metadata={id_key: doc_ids[i]}) for i, summary in enumerate(text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables
tables_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=summary, metadata={id_key: tables_ids[i]}) for i, summary in enumerate(table_summaries)
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(tables_ids, tables)))

# RAG

In [49]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from base64 import b64decode
from prompts import system_finance_prompt

def parse_docs(docs):
    # Split base64 images and texts
    b64 = []
    text = []
    for doc in docs:
        try:
            b64decode(doc)
            b64.append(doc)
        except Exception as e:
            text.append(doc)
    return{'images':b64, 'texts':text}


def build_prompt_two(kwargs) -> ChatPromptTemplate:
    """
    Construct a ChatPromptTemplate that always begins with the system prompt,
    then includes context (text + images) and the user question.
    """

    context = kwargs['context']
    question = kwargs['question']

    # Concatenate all text fragments
    context_text = "".join([t.text for t in context.get('texts', [])])

    # Build the messages list: SystemMessage -> HumanMessage
    messages = [
        SystemMessage(content=system_finance_prompt),
        HumanMessage(content=f"Context: {context_text}\nQuestion: {question}")
    ]

    # Include images if present
    for b64 in context.get('images', []):
        messages.append(
            HumanMessage(content={
                'type': 'image_url',
                'image_url': {'url': f'data:image/jpeg;base64,{b64}'},
            })
        )

    # Create and return a prompt template from these messages
    return ChatPromptTemplate.from_messages(messages)


chain = (
    {
        'context': retriever | RunnableLambda(parse_docs),
        'question': RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt_two)
    | ChatOpenAI(model='o3')
    | StrOutputParser()
)

chain_with_sources = {
    'context': retriever | RunnableLambda(parse_docs),
    'question': RunnablePassthrough(),
} | RunnablePassthrough().assign(
    response=(
        RunnableLambda(build_prompt_two)
        | ChatOpenAI(model='o3')
        | StrOutputParser()
    )
)

In [18]:
docs = retriever.get_relevant_documents("shareholders")
print(len(docs), "docs retrieved")
for d in docs[:3]:
    print(d.text[:120], "…")

4 docs retrieved
During the year and subsequently we have had regular engagement with the shareholders, including liquidity and financing …
As owners of our Group, we rely on the support of shareholders and their opinions are important to us. We have an open d …
Shareholders …


In [52]:
# my_prompt = 'Make the company profile of Tour Partner Group Limited. Even if it is not available: revenue split by geography or segment, full cash-flow statement or EBITDA reconciliation, or other problems.'
# my_prompt = 'Make the company profile out of the context given'
my_prompt = 'Make the company profile of Tour Partner Group Limited. Use the context given'
profile = chain_with_sources.invoke(my_prompt)
print(profile)

{'context': {'images': [], 'texts': [<unstructured.documents.elements.Header object at 0x319257d30>, <unstructured.documents.elements.NarrativeText object at 0x319281f40>, <unstructured.documents.elements.Header object at 0x319242910>, <unstructured.documents.elements.ListItem object at 0x319284520>, <unstructured.documents.elements.Header object at 0x319260340>, <unstructured.documents.elements.Header object at 0x319215ac0>, <unstructured.documents.elements.Header object at 0x319252880>, <unstructured.documents.elements.Title object at 0x34eb2a5e0>, <unstructured.documents.elements.NarrativeText object at 0x319281b80>, <unstructured.documents.elements.Title object at 0x34ea89ca0>]}, 'question': 'Make the company profile of Tour Partner Group Limited. Use the context given', 'response': 'Tour Partner Group Limited  \n(Company No. 05543287, U.K.)\n\nSources used throughout:  \n• Tour Partner Group Ltd – Annual Report & Financial Statements, year-ended 31-Dec-22 (filed at Companies House

In [53]:
print(profile['response'])

Tour Partner Group Limited  
(Company No. 05543287, U.K.)

Sources used throughout:  
• Tour Partner Group Ltd – Annual Report & Financial Statements, year-ended 31-Dec-22 (filed at Companies House 28-Sep-23)  
• Tour Partner Group Ltd – Annual Report & Financial Statements, year-ended 31-Dec-21 (filed 30-Sep-22)  
• Companies House – Charges Register, retrieved 09-Jun-24  
• Company website “About Us” page, accessed 09-Jun-24  

————————————————————————————————————————  
1. Introduction Table (Company Snapshot)

Primary industry        : Travel services  
Incorporation year      : 2005  
Headquarters            : London, UK  
Employees (FY22)        : 376  

Operational KPIs (FY22)  
• Passengers handled                 : >130,000  
• Source markets served              : 65 countries  
• Operating offices                  : 9 (hubs in UK, Ireland, Germany, Nordics)  
[Source: FY22 Annual Report, pp.3-4]  

————————————————————————————————————————  
2. Business Overview (bullets only)


# PDF and PPT

In [23]:
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from pptx import Presentation
from pptx.util import Inches
from PyPDF2 import PdfReader


In [54]:
def text_to_pdf_unicode(text: str, output_path: str):
    """
    Creates a PDF file from plain text with full Unicode support
    using ReportLab's Platypus framework.
    
    Parameters:
    - text: the string content to write into the PDF.
    - output_path: full file path where the PDF will be saved.
    """
    # 1. Prepare the document
    doc = SimpleDocTemplate(output_path, pagesize=letter)
    styles = getSampleStyleSheet()
    body_style = styles['BodyText']
    
    # 2. Build a "story" of flowable objects
    story = []
    for line in text.split('\n'):
        # Paragraph handles Unicode (e.g. “–”, “é”, emojis, etc.) natively
        story.append(Paragraph(line or ' ', body_style))
        # Small spacer between lines
        story.append(Spacer(1, 4))
    
    # 3. Generate the PDF
    doc.build(story)

profile_text = profile["response"]  # fetched from your chain output
text_to_pdf_unicode(profile_text, "/Users/felipesilverio/Documents/GitHub/LangChainTest/output/company_profile_unicode4.pdf")


In [None]:
from pptx.util import Pt
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from typing import Any, Dict, List, Optional

def replace_placeholder_text(file_path: str,placeholder: str,replacement: str,output_path: Optional[str] = None) -> None:
    """
    Replace occurrences of `placeholder` in text elements with `replacement`.

    Modifies the presentation and saves to `output_path` or overwrites original.
    """
    try:
        prs = Presentation(file_path)
    except Exception as e:
        raise IOError(f"Unable to open file {file_path}: {e}")

    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    if placeholder in run.text:
                        run.text = run.text.replace(placeholder, replacement)
                        run.font.size = Pt(5)

    save_path = output_path or file_path
    
    try:
        prs.save(save_path)
    except Exception as e:
        raise IOError(f"Unable to save updated file to {save_path}: {e}")


In [39]:
from PyPDF2 import PdfReader

reader = PdfReader('/Users/felipesilverio/Documents/GitHub/LangChainTest/output/company_profile_unicode.pdf')
reader

all_text = []
for page_num, page in enumerate(reader.pages, start=1):
    try:
        text = page.extract_text() or ''
    except Exception:
        text = ''
    all_text.append(text)

pdf_retrieved = "\n".join(all_text)
pdf_retrieved

'Company Profile – Tour Partner Group Limited\nSources utilised:\n(1) Companies House – “Tour Partner Group Limited, Annual Report & Financial Statements for the\nyear-ended 31-Dec-22”, filed 29-Sep-23 (pdf)\n(2) Tour Partner Group website – “About Us” page, accessed 07-Jun-24\n--------------------------------------------------\n1. Company Snapshot\nPrimary Industry: Travel & Tourism\nIncorporation Year: 2016\nHeadquarters: London, United Kingdom\nEmployees: 230 (FY22, Companies House filing, note 6)\nOperational KPIs (FY22)\n\x7f Passengers handled: 350,000\n\x7f Bed-nights booked: 710,000\n\x7f Source markets served: 70+\n(Sources: (1) Directors’ report & strategic review; (2) corporate website)\n--------------------------------------------------\n2. Business Overview (bullets only)\n\x7f Tour Partner Group Limited is an intermediate holding company that consolidates a portfolio of B2B\ndestination management and group travel brands serving the UK & Ireland, the Nordics and\nContinen

In [47]:
ppt_path = '/Users/felipesilverio/Documents/GitHub/LangChainTest/try2.pptx'
my_prompt = f"""
The following text contains a series of text blocks that are separated by multiples - like:
--------------------------------------------------------------------------------------------------------------------

Bring me back only the code block relative to 1. Company Snapshot, and give me all the information in it in a single line but separated by |

The text is:

{pdf_retrieved}
"""
response = chain_with_sources.invoke(my_prompt)
replace_placeholder_text('/Users/felipesilverio/Documents/GitHub/LangChainTest/backupppt.pptx', 'Company Snapshot', response['response'], ppt_path)

my_prompt = f"""
The following text contains a series of text blocks that are separated by multiples - like:
--------------------------------------------------------------------------------------------------------------------

Bring me back only the code block relative to Business Overview, and give me back exactly as is in the text.

The text is:

{pdf_retrieved}
"""
response = chain_with_sources.invoke(my_prompt)
replace_placeholder_text('/Users/felipesilverio/Documents/GitHub/LangChainTest/backupppt.pptx', 'Business Overview Text', response['response'], ppt_path)
