##DEPENDENCY

In [None]:
%pip install pdfplumber
%pip install groq
%pip install --upgrade tiktoken
%pip install -qU langchain-text-splitters

##TABLE EXTRACTION

In [307]:
extracted_text = []
extracted_tables = []

with pdfplumber.open("Syndicated Loan.pdf") as pdf:
    for pg_number in range(len(pdf.pages)):
        page = pdf.pages[pg_number]
        tables = page.find_tables()
        table_bboxes = [i.bbox for i in tables]
        tables = [{'table': i.extract(), 'doctop': i.bbox[1]} for i in tables]
        non_table_words = [word for word in page.extract_words() if not any(
            [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
        
        if len(tables) > 0 and len(non_table_words) > 0:
            extracted_tables = tables
            extracted_text = non_table_words  
        elif len(tables) > 0:
            extracted_tables = tables
        elif len(non_table_words) > 0:
            extracted_text.append(' '.join([i['text'] for i in non_table_words]))    
            
def check_bboxes(word, table_bbox):
    """
    Check whether word is inside a table bbox.
    """
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]


##SCHEMA

In [308]:
# Update the required_schema variable to include the "bank_name" property outside the "items" object
required_schema = {
    "type": "object",
    "properties": {
        "bank_name": {
            "type": "string",
            "description": "Name of the company to which pdf belong to"
        },
        "data": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Name of the lender"
                    },
                    "term_loan_commitment": {
                        "type": "string",
                        "description": "A term loan commitment is a lender's obligation to contribute their portion of the term loan to the borrower"
                    },
                    "total_commitment": {
                        "type": "string",
                        "description": "Total commitment of the lender to the project"
                    },
                    "percentage": {
                        "type": "string",
                        "description": "Percentage of the total commitment of the lender to the project"
                    },
                    "currency": {
                        "type": "string",
                        "description": "Currency of the commitment amount"
                    }
                },
                "required": [
                    "name",
                    "term_loan_commitment",
                    "total_commitment",
                    "percentage",
                    "currency"
                ]
            }
        }
    },
    "required": ["bank_name", "data"]
}


##TOKEN SIZE CALCULATOR

In [309]:
import tiktoken

def encoding_getter(encoding_type: str):
    """
    Returns the appropriate encoding based on the given encoding type (either an encoding string or a model name).
    """
    if "k_base" in encoding_type:
        return tiktoken.get_encoding(encoding_type)
    else:
        return tiktoken.encoding_for_model(encoding_type)

def tokenizer(string: str, encoding_type: str) -> list:
    """
    Returns the tokens in a text string using the specified encoding.
    """
    encoding = encoding_getter(encoding_type)
    tokens = encoding.encode(string)
    return tokens

def token_counter(string: str, encoding_type: str) -> int:
    """
    Returns the number of tokens in a text string using the specified encoding.
    """
    num_tokens = len(tokenizer(string, encoding_type))
    return num_tokens

In [310]:
table_token_size = token_counter(str(extracted_tables), "gpt-3.5-turbo")
text_token_size = token_counter(str(extracted_text), "gpt-3.5-turbo")
total_pdf_token_size = table_token_size + text_token_size

In [311]:
total_token_len = 4092
schema_len = len(str(required_schema))
total_allowed_len = total_token_len - (schema_len + total_pdf_token_size)

total_allowed_len


3162

##PASSING SPLITTED DATA TO LLM

In [312]:
start = 0
end = total_pdf_token_size+1

context = []


if pdf_context_len < total_allowed_len:
    context.append(["text",extracted_text])
    context.append(extracted_tables)
else:
    pass
    #chunk it and store
    
context



[['text',
  ['ABC Bank, Inc Syndicated Loan Split Up Bank wise Total Loan Amount : $40,000,000 Chase: $10,000,000 Bank of America: $3,000,000 Goldman Sachs Bank: $7,000,000 Truist Bank: $20,000,000']],
 []]

In [249]:

from groq import Groq
client = Groq(
  api_key="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"
    #  api_key="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"
)
def get_model_response(system_prompt,user_prompt , schema ,model_name):  
    chat_completion = client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[
        {
            "role": "system",
            "content": str(system_prompt)
        },
        {
            "role": "user",
            "content": "Here is the text to parse: " + str(user_prompt)
        },
        {
            "role": "user",
            # "content": "Here is the ouput JSON Schema , make sure you fill all the fields , the fields should not be empty or null  \n [object Object]" + str(schema)
            "content": "Here is the ouput JSON Schema , make sure you fill all the fields , the fields should not be empty or null  \n [object Object]" + str(schema) + ""
        },
        {
            "role": "user",
            # "content": "Here is the ouput JSON Schema , make sure you fill all the fields , the fields should not be empty or null  \n [object Object]" + str(schema)
            "content": """Generate a chunk from the given table to embed and store it in a vector database such that each chunk contains the meaning for the table. Convert this table to a summary, then give the output as chunks to embed and store it in a vector database. Chunks in JSON format.
                give output with markdown with chunk as heading 
                like 
                ##Chunk 1: 
                ##Chunk 2: 
                output : give only the chunks no extract line of text like Here is the output in markdown format with chunks as headings
                Don't include bank_name in each chunk give it as one seperate chunks
                """

        },
        {
        "role": "assistant",
        "content": "Here is the perfectly correctly formatted JSON"
        }
    ],
    temperature=0,
    max_tokens=8192,
    top_p=1,
    stream=False,
    # response_format={"type": "json_object"},
    stop=None,
)  

    return chat_completion



In [313]:
model_name = "llama3-70B-8192"
# model_name = "gemma-7b-it"
system_prompt = "As a genius expert, your task is to understand the content and provide the parsed objects in json that match the following json_schema:"
user_prompt = "Here is the text to parse:" + str(context)
output_formate = ""
response = get_model_response(system_prompt,context , required_schema,model_name)



llm_response_chunks = response.choices[0].message.content

In [314]:
print(llm_response_chunks)

## Bank Name:
{"bank_name": "ABC Bank, Inc"}

## Chunk 1:
{
  "data": [
    {
      "name": "Chase",
      "term_loan_commitment": "$10,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "25%",
      "currency": "USD"
    }
  ]
}

## Chunk 2:
{
  "data": [
    {
      "name": "Bank of America",
      "term_loan_commitment": "$3,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "7.5%",
      "currency": "USD"
    }
  ]
}

## Chunk 3:
{
  "data": [
    {
      "name": "Goldman Sachs Bank",
      "term_loan_commitment": "$7,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "17.5%",
      "currency": "USD"
    }
  ]
}

## Chunk 4:
{
  "data": [
    {
      "name": "Truist Bank",
      "term_loan_commitment": "$20,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "50%",
      "currency": "USD"
    }
  ]
}


## RAG (MARKDOWN CHUNKING)

In [315]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
     ("##", "chunk"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
mark_down_chunks = markdown_splitter.split_text(llm_response_chunks)
mark_down_chunks


[Document(page_content='{"bank_name": "ABC Bank, Inc"}', metadata={'chunk': 'Bank Name:'}),
 Document(page_content='{\n"data": [\n{\n"name": "Chase",\n"term_loan_commitment": "$10,000,000",\n"total_commitment": "$40,000,000",\n"percentage": "25%",\n"currency": "USD"\n}\n]\n}', metadata={'chunk': 'Chunk 1:'}),
 Document(page_content='{\n"data": [\n{\n"name": "Bank of America",\n"term_loan_commitment": "$3,000,000",\n"total_commitment": "$40,000,000",\n"percentage": "7.5%",\n"currency": "USD"\n}\n]\n}', metadata={'chunk': 'Chunk 2:'}),
 Document(page_content='{\n"data": [\n{\n"name": "Goldman Sachs Bank",\n"term_loan_commitment": "$7,000,000",\n"total_commitment": "$40,000,000",\n"percentage": "17.5%",\n"currency": "USD"\n}\n]\n}', metadata={'chunk': 'Chunk 3:'}),
 Document(page_content='{\n"data": [\n{\n"name": "Truist Bank",\n"term_loan_commitment": "$20,000,000",\n"total_commitment": "$40,000,000",\n"percentage": "50%",\n"currency": "USD"\n}\n]\n}', metadata={'chunk': 'Chunk 4:'})]

## EMBEDDING TO CHROMA DB

In [316]:
db.delete_collection()

In [317]:
# CHROMA Embedding 
from langchain_community.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings



db = Chroma.from_documents(documents=mark_down_chunks, embedding=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"))

##RITRIVAL

In [318]:
llm_prompt_schema  = """{{\"type\":\"object\",\"properties\":{{\"bank_name\":{{\"type\":\"string\",\"description\":\"Name of the company to which the PDF belongs\"}},\"data\":{{\"type\":\"array\",\"items\":{{\"type\":\"object\",\"properties\":{{\"name\":{{\"type\":\"string\",\"description\":\"Name of the lender\"}},\"term_loan_commitment\":{{\"type\":\"string\",\"description\":\"A term loan commitment is a lender's obligation to contribute their portion of the term loan to the borrower\"}},\"total_commitment\":{{\"type\":\"string\",\"description\":\"Total commitment of the lender to the project\"}},\"percentage\":{{\"type\":\"string\",\"description\":\"Percentage of the total commitment of the lender to the project\"}},\"currency\":{{\"type\":\"string\",\"description\":\"Currency of the commitment amount\"}},\"required\":[\"name\",\"term_loan_commitment\",\"total_commitment\",\"percentage\",\"currency\"]}}}},\"required\":[\"bank_name\",\"data\"]}}"""

In [319]:

from langchain.prompts import ChatPromptTemplate, PromptTemplate
QUERY_PROMPT = PromptTemplate(
    input_variables=["questions"],
    template="Extract all the following values :" + llm_prompt_schema
)

In [320]:
#getting only the chunks that are similar to the query for llm to produce the output
from langchain_groq import ChatGroq
from langchain.retrievers.multi_query import MultiQueryRetriever
llm = ChatGroq(temperature=0, groq_api_key="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", model_name="llama3-70b-8192")

retriever = MultiQueryRetriever.from_llm(
    db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)



In [321]:


from langchain_groq import ChatGroq

template = """ As a genius expert, your task is to understand the content and provide the parsed objects in json that match the following json_schema:
        give json for only asked question
        Here is the context :
        context = {context}
        Here is the output JSON Schema , make sure you fill all the fields , the fields should not be empty or null :
        required schema {questions}
         """


prompt = ChatPromptTemplate.from_template(template)

In [323]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

rag_chain = (
              {"context": retriever, "questions": RunnablePassthrough() }
              | prompt
              | llm
              | StrOutputParser()
          )

In [324]:
res = rag_chain.invoke({'questions ' : llm_prompt_schema})  
  

In [325]:
print(res)

Based on the provided context and JSON schema, I will parse the content and provide the output in JSON format. Here is the parsed output:

```
{
  "bank_name": "ABC Bank, Inc",
  "data": [
    {
      "name": "Goldman Sachs Bank",
      "term_loan_commitment": "$7,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "17.5%",
      "currency": "USD"
    },
    {
      "name": "Bank of America",
      "term_loan_commitment": "$3,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "7.5%",
      "currency": "USD"
    },
    {
      "name": "Chase",
      "term_loan_commitment": "$10,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "25%",
      "currency": "USD"
    },
    {
      "name": "Truist Bank",
      "term_loan_commitment": "$20,000,000",
      "total_commitment": "$40,000,000",
      "percentage": "50%",
      "currency": "USD"
    }
  ]
}
```

Note that I've combined the data from all the documents into a single JSON obje

In [326]:
file_path = "./outputs/Syndicated.json"

# Write JSON schema string to file
with open(file_path, "w") as file:
    file.write(res)

print("JSON schema written to file:", file_path)

JSON schema written to file: ./output/Syndicated.json
