In [1]:
path = "formtest.pdf"

In [2]:
from lxml import html
import warnings
from pydantic import BaseModel
from typing import Any, Optional
from unstructured.partition.pdf import partition_pdf
warnings.filterwarnings("ignore", category=UserWarning, module="transformers.modeling_utils")

raw_pdf_elements = partition_pdf(filename=path,
                                 extract_images_in_pdf=False,
                                 infer_table_structure=True,
                                 chunking_strategy="by_title",
                                 max_characters=4000,
                                 new_after_n_chars=3800,
                                 combine_text_under_n_chars=2000,
                                 )

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 4,
 "<class 'unstructured.documents.elements.Table'>": 3}

In [4]:
class Element(BaseModel):
    type: str
    text: Any

categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))


table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

3
4


In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

In [6]:
import openai
import os
os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

In [7]:
from langchain_openai import ChatOpenAI
warnings.filterwarnings("ignore", category=UserWarning, module="LangChainDeprecationWarning")

prompt_text="""You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

model = ChatOpenAI(temperature=0,model="gpt-3.5-turbo")

summarize_chain = {"element": lambda x:x} | prompt | model | StrOutputParser()

In [8]:
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

In [9]:
texts = [i.text for i in text_elements]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

In [10]:
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings()
)

store = InMemoryStore()
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))

  warn_deprecated(


In [11]:
from langchain.schema.runnable import RunnablePassthrough

template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0,model="gpt-4")

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [12]:
chain.invoke("What's the name of the passenger?")

'The name of the passenger is Vidhyuth Kasthurirangan.'

In [13]:
chain.invoke("What are the conditions for cancellation policy for timing From 08/04/2024 06:45 PM to 08/04/2024 10:45 PM?")

'The conditions for cancellation policy for timing From 08/04/2024 06:45 PM to 08/04/2024 10:45 PM are a deduction of Rs.997.50 and a refund of Rs.-47.50. The charges are 100% plus a 5% tax.'

In [14]:
chain.invoke("If a man impersonates as a woman and takes seat, what are the consequences?")

'The man will be made to get off the bus without any consideration and no refund will be given.'

In [15]:
chain.invoke("Give me all the details of the passenger?")

"The passenger's name is Vidhyuth Kasthurirangan. He is a 20 year old male. His seat number is U7 and his status is booked. He paid a fare of 950.00. His PNR Number is KMRO47N4B742 and he is traveling on 08/04/2024 from Chennai to Bangalore. The bus type is 2+1 A/C Seater/Sleeper. His mobile number is 9591846600. He is supposed to report at 09:50 PM and his boarding time is 08/04/2024 9:40 PM. He booked his ticket via Redbus. His boarding address is the bus stop shed entrance towards Pallavaram, Chromepet, Chennai and he will be alighting in front of Kalamandir, Marathahalli, Bangalore."