# PDF image extraction

In [22]:
import fitz
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import io
import base64


def pil_image_to_base64(pil_images, format="PNG"):
    img_base64_list = []
    for image in pil_images:
        buffer = io.BytesIO()
        image.save(buffer, format=format)
        img_bytes = buffer.getvalue()
        img_base64 = base64.b64encode(img_bytes).decode("utf-8")
        img_base64_list.append(img_base64)
    return img_base64_list


def extract_images_from_pdf(pdf_path):
    pdf_document = fitz.open(pdf_path)
    images = []

    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)
        image_list = page.get_images(full=True)

        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image = Image.open(io.BytesIO(image_bytes))

            # image_file = f"extracted_image_{page_number+1}_{image_index+1}.{image_ext}"
            images.append(image)

    return images


def extract_text_from_image(image):
    text = pytesseract.image_to_string(image)
    return text


def find_top_k_length(list_text, top_k=2):
    sorted_indexs = sorted(
        range(len(list_text)), key=lambda i: list_text[i], reverse=True
    )
    top_k_indexs = sorted_indexs[:top_k]
    return top_k_indexs, [images[i] for i in top_k_indexs]


# PDF Extraction testing for detecting the length
pdf_path = "test.pdf"
images = extract_images_from_pdf(pdf_path)
text_length_list = []
for img in images:
    text = extract_text_from_image(img)
    text_length_list.append(len(text))

top_k = 2
top_k_indexs, top_k_images = find_top_k_length(list_text=text_length_list, top_k=top_k)

# Extract valuatble Text from image -  Test_On_MiniCPM with corresponding json prompt

In [30]:
top_k_images_path = []
for i in range(top_k):
    path = f"temp/temp_{i}.jpg"
    top_k_images_path.append(path)
    top_k_images[i].save(path)

In [2]:
import os
import sys
import ollama

prompt_extraction = """Extract all of the text in the image."""


def run_inference(model: str, image_paths: str):
    stream = ollama.chat(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt_extraction,
                "images": [image_paths],
            }
        ],
        stream=True,
    )

    for chunk in stream:
        print(chunk["message"]["content"], end="", flush=True)


run_inference("minicpm-v:8b", "temp/temp_0.jpg")

**Title:**
"未来世界·华语童声”国际青少年艺术展演亮相首届教科文电影遗产国际电影节

**Subtitle:** 
欧洲时报 华侨周末 2023年12月1日-3日

**Date:** 
周五一星期五 日 - 星期六 日 (01 Dec 2023)

**Body Text:**
"未来世界·华语童声”国际青少年艺术展演亮相首届教科文电影遗产国际电影节
(欧洲时报记者/李倩)为纪念联合国教科文组织设立世界电影日1954年12月7日，中国电影资料馆主办的“未来世界·华语童声”国际青少年艺术展演于10月3日在国家大剧院举行。活动以弘扬中华优秀传统文化和传播电影文化为核心，旨在通过音乐、戏剧、舞蹈等艺术形式展示青少年的艺术才华与创造力。

**Additional Text:**
- 李倩：中国未来世界·华语童声艺术展演
  - 这次展演是首次在国家大剧院举办，吸引了来自全国各地的60余位青少年演员参与。
  
- 郑凯：教科文组织副部长李明华致辞
  - 她表示，此次展演活动不仅有助于推广中华优秀传统文化，也有助于促进中外文化交流。

**Images Description:**
The images depict various scenes from the event, including participants performing on stage and engaging with audiences.

- Image captions include:
  - "郑凯：教科文组织副部长李明华致辞"
  - "未来世界·华语童声艺术展演”
  - "青少年演员在舞台上表演"
  - "观众积极参与互动"

**Footer:**
(底部有彩条和色标，表明这是一份报纸的版面)

(Note: The text has been transcribed as accurately as possible from the image, preserving names and titles.)

In [24]:
import requests

top_k_images_base_64 = pil_image_to_base64(top_k_images)

url = "http://localhost:11434/api/generate"
data = {
    "model": "minicpm-v:8b",
    "messages": [
        {
            "role": "user",
            "content": "What's the main information about these photos",
            "images": top_k_images_base_64,
        }
    ],
}


response = requests.post(url, json=data)

# Print the response
if response.status_code == 200:
    print("Response:", response.json())
else:
    print(f"Failed to get a response. Status code: {response.status_code}")

Response: {'model': 'minicpm-v:8b', 'created_at': '2024-09-27T10:28:41.138506Z', 'response': '', 'done': True, 'done_reason': 'load'}


In [1]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2_6",
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
)  # sdpa or flash_attention_2, no eager
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
)

image = Image.open("test_image.jpg").convert("RGB")
question = "What is in the image?"
msgs = [{"role": "user", "content": [image, question]}]

res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
print(res)

## if you want to use streaming, please make sure sampling=True and stream=True
## the model.chat will return a generator
res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=True, stream=True)

generated_text = ""
for new_text in res:
    generated_text += new_text
    print(new_text, flush=True, end="")

: 