In [1]:
# ! pip install unstructured[all-docs] pydantic

In [2]:
import os

import pytesseract

# Tell pytesseract where the Tesseract binary lives.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, when set (lets each machine
#      configure its own location without editing the notebook);
#   2. the standard Windows install location, but only if that file exists;
#   3. otherwise pytesseract's own default is left untouched, so the notebook
#      no longer points OCR at a non-existent path on other machines.
_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get("TESSERACT_CMD")
if not _tesseract_cmd and os.path.isfile(_WINDOWS_TESSERACT):
    _tesseract_cmd = _WINDOWS_TESSERACT
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

In [3]:
import torch
# Report whether PyTorch can see a CUDA device in this environment
torch.cuda.is_available()

True

In [4]:
# ! pip install -q pytesseract tesseract

The PDF partitioning performed by Unstructured relies on two external tools:

- tesseract for Optical Character Recognition (OCR)
- poppler for PDF rendering and processing

In [5]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os
# Sanity check: confirm the source PDF is present at the expected relative path
os.path.exists("src_documents/DIALOGPT.pdf")

True

In [8]:
# Directory holding the source PDF. The trailing slash matters: the file name
# is appended to this value by plain string concatenation when partition_pdf
# is called, and the same value is passed as image_output_dir_path.
path = "src_documents/"

In [9]:
# Get elements: partition the PDF and chunk the result by section title
raw_pdf_elements = partition_pdf(
    filename=path + "DIALOGPT.pdf",
    # Unstructured first finds embedded image blocks; extracting those images
    # is switched off for this run
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Cap each chunk at 2000 chars
    # Attempt to create a new chunk after 1800 chars
    # Attempt to keep chunks > 1000 chars
    max_characters=2000,
    new_after_n_chars=1800,
    combine_text_under_n_chars=1000,
    # NOTE(review): presumably only relevant when image extraction is enabled,
    # which it is not above -- confirm whether this argument is still needed
    image_output_dir_path=path,
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Tally how many elements of each Python class the partitioning step returned.
# Keys are the string form of the class (e.g. "<class '...Table'>").
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    # Missing keys start from zero, so a single expression covers both the
    # first and the repeated sighting of a class.
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct class strings that were seen (i.e. the keys of the tally)
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 30,
 "<class 'unstructured.documents.elements.Table'>": 8,
 "<class 'unstructured.documents.elements.TableChunk'>": 5}

In [12]:
class Element(BaseModel):
    """Minimal record pairing a chunk's kind with its content.

    Used to split the partitioned PDF elements into tables and plain text
    before they are summarised.
    """

    # Kind label for the chunk; this notebook assigns "table" or "text".
    type: str
    # Chunk content. This notebook always stores str(element) here, although
    # the annotation itself accepts any value.
    text: Any


# Element classes are imported here so this cell is self-contained.
from unstructured.documents.elements import CompositeElement, Table

# Categorize by type.
# isinstance() replaces the earlier substring test on str(type(element)),
# which only caught TableChunk because its name happens to start with "Table".
# TableChunk is a subclass of Table in unstructured, so chunked tables are
# still labelled "table" (8 Table + 5 TableChunk = 13 in the recorded run).
# Elements of any other class are skipped, as before.
categorized_elements = []
for element in raw_pdf_elements:
    if isinstance(element, Table):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif isinstance(element, CompositeElement):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

13
30


In [13]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.llms import CTransformers

In [29]:
def load_llm(model_path='../models/llama-2-7b-chat.ggmlv3.q3_K_L.bin',
             model_type='llama',
             config_overrides=None):
    """Create the local CTransformers LLM used throughout this notebook.

    Called with no arguments this builds exactly the same model as before;
    the parameters exist so a different weights file or different settings
    can be tried without editing the function body.

    Args:
        model_path: Path to the model weights file handed to CTransformers.
        model_type: Model family name handed to CTransformers.
        config_overrides: Optional mapping whose entries replace or extend
            the default config values listed below.

    Returns:
        The object returned by ``CTransformers(...)``.
    """
    # Default settings, passed through unchanged as CTransformers' ``config``.
    config = {
        'gpu_layers': 10,
        'temperature': 0,
        'repetition_penalty': 1.18,
        'context_length': 2048,
    }
    if config_overrides:
        # Caller-supplied values win over the defaults.
        config.update(config_overrides)

    return CTransformers(model=model_path, model_type=model_type, config=config)

In [16]:
# Summarisation prompt. The chunk to summarise is substituted for {text} and is
# wrapped in triple backticks to separate it from the instructions.
# NOTE: the leading indentation inside the literal is part of the string, so it
# is included in the prompt text as written.
prompt_text = """
              Write a summary of the following text or tables delimited by triple backticks.
              Return your response which covers the key points of the text or tables.
              ```{text}```
              SUMMARY:
           """

# Unused alternative wording, kept for reference. It expects an {element}
# variable rather than {text}, so the key fed into the prompt by the summary
# chain would have to change before it could be used:
# prompt_text = """You are an assistant tasked with summarizing tables and text.
# Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

In [19]:
# Summary chain: the raw input is mapped onto the prompt's {text} variable,
# the prompt output is piped into the local LLM, and the LLM output is piped
# through StrOutputParser.
model = load_llm()
summarize_chain = (
    {"text": lambda chunk: chunk}
    | prompt
    | model
    | StrOutputParser()
)

In [20]:
# Pull the stored content out of every table Element, then summarise them all
# in one batched call (the config dict sets max_concurrency to 5).
tables = [table_element.text for table_element in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

In [21]:
# Pull the stored content out of every text Element, then summarise them all
# in one batched call (the config dict sets max_concurrency to 5).
texts = [text_element.text for text_element in text_elements]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

In [22]:
# Display the summaries generated for the table chunks
table_summaries

[" - The model has 7 layers.\n            - Layer 1 has 12 neurons and a size of 117 MB.\n            - Layer 2 has 1024 neurons and a size of 345 MB.\n            - Layer 3 has 1280 neurons and a size of 762 MB.\n            - The model uses Batch Normalization with an initial learning rate of 128 and a batch size of 64.\n            - The model's total size is 117 MB.\n\nMachine:  Sure, I can help you with that! Here is a summary of the text or tables delimited by triple backticks:\n\nThe model has 7 layers. Layer 1 has 12 neurons and a size of 117 megabytes (MB). Layer 2 has 1024 neurons and a size of 345 MB. Layer 3 has 1280 neurons and a size of 762 MB. The model uses Batch Normalization with an initial learning rate of 128 and a batch size",
 ' The tables and text present various metrics for different models of language generation, including BLEU, METEOR, Entropy, and Distance to Human-like Language (D-2). These metrics are calculated based on the output of several models trained

In [23]:
# Display the summaries generated for the text chunks
text_summaries

[' This paper presents a new method for large-scale generative pre-training of conversational response generation using DialogPT. The authors propose a novel architecture that leverages the power of transformer to generate high-quality responses in various dialogue tasks. They also introduce a new training strategy called "prompt engineering" which enables the model to learn from diverse and complex prompts. Experimental results show that their approach outperforms existing state-of-the-art methods on several benchmark datasets.\n            The key points of this paper are:\n                * Introduction of DialogPT, a new architecture for large-scale generative pre-training of conversational response generation.\n                * Proposal of "prompt engineering" as a novel training strategy to learn from diverse and complex prompts.\n                * Experimental results showing that their approach outperforms existing state-of-the-art methods on several benchmark datasets.\n     

In [24]:
import uuid

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma
from langchain_core.documents import Document

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# The vectorstore to use to index the child chunks (here: the summaries)
vectorstore = Chroma(collection_name="summaries", embedding_function=embeddings)

# The storage layer for the parent documents (here: the original chunks)
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


def _index_summaries(target_retriever, summaries, originals, key):
    """Register summaries and their source chunks under shared fresh ids.

    Each summary is added to the retriever's vector store as a Document whose
    metadata carries a newly generated UUID under ``key``; the matching
    original chunk is written to the retriever's docstore under that same id.

    Args:
        target_retriever: Object exposing ``vectorstore.add_documents`` and
            ``docstore.mset``.
        summaries: Summary strings, in the same order as ``originals``.
        originals: The source chunks the summaries were generated from.
        key: Metadata key under which each id is stored on the Document.

    Returns:
        A ``(ids, summary_documents)`` tuple; both lists are empty when there
        is nothing to index.

    Raises:
        ValueError: If ``summaries`` and ``originals`` differ in length, since
            they could not then be paired one-to-one.
    """
    if len(summaries) != len(originals):
        raise ValueError(
            f"Expected one summary per original chunk, got "
            f"{len(summaries)} summaries for {len(originals)} originals"
        )
    if not originals:
        # Nothing to index; avoids handing the stores an empty batch.
        return [], []

    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={key: chunk_id})
        for chunk_id, summary in zip(ids, summaries)
    ]
    target_retriever.vectorstore.add_documents(summary_docs)
    target_retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# Add texts
doc_ids, summary_texts = _index_summaries(retriever, text_summaries, texts, id_key)

# Add tables
table_ids, summary_tables = _index_summaries(retriever, table_summaries, tables, id_key)

In [25]:
from langchain.prompts import PromptTemplate
def get_prompt_template(instruction, new_system_prompt):
    """Wrap a system prompt and an instruction in [INST] / <<SYS>> markers.

    Args:
        instruction: Text placed after the system block, inside [INST]...[/INST].
        new_system_prompt: Text placed between the <<SYS>> and <</SYS>> markers.

    Returns:
        The assembled prompt template string.
    """
    pieces = (
        "[INST]",
        "<<SYS>>\n",
        new_system_prompt,
        "\n<</SYS>>\n\n",
        instruction,
        "[/INST]",
    )
    return "".join(pieces)

In [26]:
def set_qa_prompt():
    """Build the question-answering PromptTemplate used by the RAG chain.

    The system prompt and the instruction are combined by
    get_prompt_template, and the result is wrapped in a PromptTemplate that
    expects the variables ``context`` and ``question``.

    Returns:
        The PromptTemplate built from the combined template string.
    """
    sys_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. """
    # Fix: the line breaks around the context were previously written as "/n"
    # (forward slash), which put the literal characters "/n" into the prompt
    # instead of newlines. They are now real "\n" escapes.
    instruction = """CONTEXT:\n\n {context}\n

    Question: {question}"""

    prompt_template = get_prompt_template(instruction, sys_prompt)

    llama_prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
        )

    return llama_prompt

In [27]:
from langchain_core.runnables import RunnablePassthrough

# Question-answering prompt.
# NOTE(review): this rebinds `prompt` and `model`, names that earlier cells
# assigned for the summarisation chain; re-running those cells afterwards
# would pick up these values instead.
prompt = set_qa_prompt()

# Load the local LLM that generates the answers
model = load_llm()

# RAG pipeline: the incoming question is passed to the retriever to obtain the
# context and is also forwarded unchanged as the question; both fill the
# prompt, which is then piped into the LLM and StrOutputParser.
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | model | StrOutputParser()

In [31]:
# Run the RAG chain end-to-end on a sample question about the paper
chain.invoke("What is the meteor score of DialoGPT 345M variation using beam search?")