In [6]:
import os
import subprocess
from pdf2image import convert_from_path, exceptions
from PIL import Image
from io import BytesIO
import base64
from IPython.display import HTML, display
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from gradio_client import Client, handle_file
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain.docstore.document import Document
from langchain_groq import ChatGroq


import google.generativeai as genai
# Read the Gemini API key from the environment so no credential is stored in
# the notebook. NOTE: any key that was previously committed here should be
# revoked and replaced.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel('gemini-1.5-flash')


In [2]:

# Connection settings for the Ollama client, gathered in one place.
ollama_settings = {
    "base_url": 'http://localhost:11434',
    "model": "llava:34b",
    "temperature": 0.0,
}
ollama = Ollama(**ollama_settings)

def convert_to_base64(pil_image):
    """Encode an image as a base64 JPEG string.

    Args:
        pil_image: A PIL image (any object exposing ``mode``, ``convert`` and
            ``save`` works).

    Returns:
        str: The JPEG bytes of the image, base64-encoded and decoded as UTF-8.
    """
    # JPEG cannot store alpha or palette data and Pillow raises OSError for
    # such modes, so anything outside the modes the JPEG writer accepts is
    # converted to RGB first. Images already in a supported mode are untouched.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_modes:
        pil_image = pil_image.convert("RGB")

    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def convert_pdf_to_images(pdf_file, output_folder):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_file: Path of the PDF to rasterise.
        output_folder: Directory that receives ``slide_<n>.png`` files, where
            ``n`` is the 1-based page index. Created if it does not exist.

    Returns:
        list[str]: Paths of the written PNG files, in page order.

    Raises:
        exceptions.PDFInfoNotInstalledError, exceptions.PDFPageCountError,
        exceptions.PDFSyntaxError: Reported on stdout and re-raised.
    """
    try:
        images = convert_from_path(pdf_file)
        # Only create the folder once the PDF has been read successfully, so a
        # broken input does not leave an empty directory behind.
        os.makedirs(output_folder, exist_ok=True)
        image_paths = []
        for page_index, image in enumerate(images, start=1):
            image_path = os.path.join(output_folder, f'slide_{page_index}.png')
            image.save(image_path, 'PNG')
            image_paths.append(image_path)
        return image_paths
    except (exceptions.PDFInfoNotInstalledError, exceptions.PDFPageCountError, exceptions.PDFSyntaxError) as e:
        print(f"Error converting PDF to images: {e}")
        raise


def process_pdf(pdf_file, output_folder):
    """Convert a PDF into page images and report what was written.

    Args:
        pdf_file: Path of the PDF to process.
        output_folder: Directory handed to ``convert_pdf_to_images``.

    Returns:
        The list of image paths produced by ``convert_pdf_to_images``.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    try:
        extracted = convert_pdf_to_images(pdf_file, output_folder)
        print(extracted)
        page_count = len(extracted)
        print(f"Extracted {page_count} images from {pdf_file} to {output_folder}")
    except Exception as err:
        print(f"Error processing PDF {pdf_file}: {err}")
        raise
    return extracted




In [4]:
# Input document and the directory that receives its rendered pages.
file_path = "attention_is_all_you_need.pdf"
output_folder = "extracted_pngimages_fromPDF"

# Bare file name (directory part stripped) of the input document.
filename = os.path.split(file_path)[1]

os.makedirs(output_folder, exist_ok=True)

In [5]:
# Guard clause: only PDF inputs are accepted (extension check is case-insensitive).
if not file_path.lower().endswith('.pdf'):
    raise ValueError("PDF please.")

image_files = process_pdf(file_path, output_folder)

['extracted_pngimages_fromPDF\\slide_1.png', 'extracted_pngimages_fromPDF\\slide_2.png', 'extracted_pngimages_fromPDF\\slide_3.png', 'extracted_pngimages_fromPDF\\slide_4.png', 'extracted_pngimages_fromPDF\\slide_5.png', 'extracted_pngimages_fromPDF\\slide_6.png', 'extracted_pngimages_fromPDF\\slide_7.png', 'extracted_pngimages_fromPDF\\slide_8.png', 'extracted_pngimages_fromPDF\\slide_9.png', 'extracted_pngimages_fromPDF\\slide_10.png', 'extracted_pngimages_fromPDF\\slide_11.png', 'extracted_pngimages_fromPDF\\slide_12.png', 'extracted_pngimages_fromPDF\\slide_13.png', 'extracted_pngimages_fromPDF\\slide_14.png', 'extracted_pngimages_fromPDF\\slide_15.png']
Extracted 15 images from attention_is_all_you_need.pdf to extracted_pngimages_fromPDF


In [8]:
def structure_ocr_with_llm(ocr_text):
    """Ask a Groq-hosted chat model to restructure the raw OCR text of a page.

    Args:
        ocr_text: OCR output for one page; it is interpolated into the prompt.

    Returns:
        str: The model's structured rendering of the OCR text.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    # Credentials come from the environment so they never live in the notebook.
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError("Set the GROQ_API_KEY environment variable before calling structure_ocr_with_llm().")

    prompt = PromptTemplate(
    input_variables=["ocr_text"],
    template="""
    Given is the OCR text from a full page, please return a structured output with out missing any information. Please do not add any information that is not in the OCR text.
    OCR Text:
    {ocr_text}
    """
    )

    llm = ChatGroq(temperature=0, model_name="llama3-8b-8192", groq_api_key=groq_api_key)

    chain = prompt | llm | StrOutputParser()
    # Pass the variable by name so the input is bound to {ocr_text} regardless
    # of its Python type (a bare dict would be mistaken for the variable map).
    structured_ocr = chain.invoke({"ocr_text": ocr_text})

    print("Structured text from OCR:", structured_ocr)
    return structured_ocr

In [9]:
documents = []
structured_ocr_text = []
florence_ocr_client = Client("gokaygokay/Florence-2")
for page_number, image_file in enumerate(image_files, start=1):
    # Run OCR on one page image through the remote Florence-2 endpoint.
    result = florence_ocr_client.predict(
        image=handle_file(image_file),
        task_prompt="OCR",
        text_input=None,
        model_id="microsoft/Florence-2-large",
        api_name="/process_image",
    )
    # NOTE(review): the first element of the result is treated as the OCR
    # output; confirm against the endpoint's return signature.
    ocr_text = result[0]
    print(ocr_text)

    structured_text_fromLLM = structure_ocr_with_llm(ocr_text)
    structured_ocr_text.append(structured_text_fromLLM)

    # Report progress only: printing the whole accumulated list on every page
    # makes the cell output grow quadratically with the page count.
    print(f"Structured OCR: page {page_number}/{len(image_files)} done")

    # metadata = {"filename": filename, "page_number": page_number}
    # doc = Document(page_content="".join(ocr_text), metadata=metadata)
    # documents.append(doc)

Loaded as API: https://gokaygokay-florence-2.hf.space ✔
{'<OCR>': '\nProvided proper attribution is provided, Google hereby grants permission toreproduce the tables and figures in this paper solely for use in journalistic orscholarly works.Attention Is All You NeedAshish Vaswani\'Noam Shawezer\'Nikki Parmar\'Jakob Uszkoreit\'Google BrainGoogle Researchavaswan@google.comnoam@Google.comnikipogogle.comGoogle Researchuszgogle.comLion Jones\'Aidan N. Comer\'Lukasz Kaiser\'Google ResearchUniversity of TorontoGoogle Brain11lion@google.comaidananda, toronto.edulukaszkaiser@google_comIllia Poloshkin\'11lila.poloshkin@gmail.comAbstractThe dominant sequence translation models are based on complex recurrent orconvolutional neural networks that include an encoder and a decoder. The bestperformational neural networks are a new network architecture, the Transformermechanism, and the idea. A new network structure, the Transformormechanism, and its ability to communicate with each other. Based solely o

In [10]:
# Display the structured text collected for each page.
structured_ocr_text

['Here is the structured output:\n\n**Title:** Attention Is All You Need\n\n**Authors:**\n\n1. Ashish Vaswani\n2. Noam Shawezer\n3. Nikki Parmar\n4. Jakob Uszkoreit\n5. Lion Jones\n6. Aidan N. Comer\n7. Lukasz Kaiser\n\n**Affiliations:**\n\n1. Google Brain\n2. Google Research\n3. University of Toronto\n\n**Contact Information:**\n\n1. Ashish Vaswani: avaswan@google.com\n2. Noam Shawezer: noam@Google.com\n3. Nikki Parmar: nikipogogle.com\n4. Jakob Uszkoreit: uszgogle.com\n5. Lion Jones: lion@google.com\n6. Aidan N. Comer: aidananda, toronto.edu\n7. Lukasz Kaiser: lukaszkaiser@google.com\n8. Illia Poloshkin: lila.poloshkin@gmail.com\n\n**Abstract:**\n\nThe dominant sequence translation models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performance is achieved by a new network architecture, the Transformer mechanism, and the idea. A new network structure, the Transformer mechanism, and its ability to communicate with each

In [11]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq


# Generate summaries of text elements
def generate_text_summaries(texts, summarize=False):
    """Summarize text elements for retrieval.

    Args:
        texts: List of str to summarize.
        summarize: When True, each text is summarized by the LLM chain; when
            False, ``texts`` is returned unchanged.

    Returns:
        list: ``[]`` when ``texts`` is empty or None, ``texts`` itself when
        ``summarize`` is False, otherwise one summary string per input text.

    Raises:
        RuntimeError: If summarization is requested and the ``GROQ_API_KEY``
            environment variable is not set.
    """
    # Nothing to do: avoid building the model (and needing credentials) at all.
    if not texts:
        return []
    if not summarize:
        return texts

    # Credentials come from the environment so they never live in the notebook.
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError("Set the GROQ_API_KEY environment variable before requesting summaries.")

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text elements. \
    Give a detailed summary of the text that is well optimized for retrieval. you must not provide something like 'the text describe about or the text is about or Here is a concise summary of the text'. You should be straight to the point with summarization. Please provide a summary of the text: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain. The local is called `llm` so it does not shadow the
    # notebook-level `model` variable.
    llm = ChatGroq(temperature=0, model_name="llama3-8b-8192", groq_api_key=groq_api_key)
    summarize_chain = {"element": lambda x: x} | prompt | llm | StrOutputParser()

    # At most five summaries are requested concurrently.
    return summarize_chain.batch(texts, {"max_concurrency": 5})


# Summarize the structured OCR text of every page (only text is passed here, no tables).
text_summaries = generate_text_summaries(structured_ocr_text,  summarize=True)

In [12]:
# Display the per-page summaries.
text_summaries

['Here is a detailed summary of the text optimized for retrieval:\n\n**Transformer Architecture**: A new network structure, Transformer, replaces recurrent and convolutional neural networks for sequence translation, achieving state-of-the-art performance.\n\n**Key Features**: Transformer mechanism, parallelizable, requires less training time, and eliminates recurrence and convolution.\n\n**Achievements**: Establishes a single-language state-of-the-art BLEU score of 41.8, surpassing previous models.\n\n**Author Contributions**: Ashish, Jakob, and Ashish developed and implemented Transformer models and attention; Niki designed, implemented, and evaluated models; Lukas and Aidan designed and visualized various parts of the research.\n\n**Paper Information**: Paper ID: arXiv.1706.03760, Category: cs.CLI, Date: 2 Aug 2023.\n\nThis summary provides a concise and informative overview of the text, highlighting the key points and achievements of the Transformer architecture, while also mentioni

In [13]:
## TODO: extract tables as images from the PDF

In [14]:
# TODO: then get a summary of each extracted table image from the OpenBMB model
