In [1]:
import re
import base64
import torch
from markitdown import MarkItDown
from PIL import Image
from io import BytesIO
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from tqdm.notebook import trange

In [2]:
m_path = 'Qwen/Qwen2.5-VL-3B-Instruct'
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    m_path,
    dtype='auto',
    device_map='auto'
)
processor = AutoProcessor.from_pretrained(
    m_path,
    use_fast=True
)

model = torch.compile(
    model,
    mode='max-autotune',
    fullgraph=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [3]:
md = MarkItDown()
result = md.convert(
    './__input__/Учебник_философии_22_августа_ТюмГУ.docx',
    keep_data_uris=True
)

In [4]:
with open('./__output__/study_fies.md', 'w', encoding='utf-8') as f:
    f.writelines(result.text_content)

In [5]:
def get_uri(full_match: str) -> str:
    uri_mask = r'(!\[\]\(data:image/(png|jpeg);base64,)|(\))'
    return re.sub(uri_mask, '', full_match)


def get_image(uri: str) -> Image.Image:
    return Image.open(
        BytesIO(
            base64.b64decode(uri)
        )
    )

In [6]:
image_mask = r'!\[\]\(data:image/(png|jpeg);base64,.+\)'
iters = re.finditer(image_mask, result.text_content)

visited = set()
images = []

for i, img_str in enumerate(iters):
    img_str = img_str.group()

    if img_str not in visited:
        uri = get_uri(img_str)
        img  = get_image(uri)
        img_name = f'image_{i}.png'

        images.append((
            img_name,
            img,
            img_str,
        ))
        visited.add(img_str)

In [7]:
def get_msg(img: Image.Image):
    messages = [
        {
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': img,
                },
                {
                    'type': 'text',
                    'text': 'Translate image to text, save core meaning, include all text from image in your responce'
                },
            ],
        }
    ]

    return messages


def get_input(messages):
    texts = [
        processor.apply_chat_template(
            msg,
            tokenize=False,
            add_generation_prompt=True
        )
        for msg in messages
    ]
    image_inputs, _ = process_vision_info(messages)

    return processor(
        text=texts,
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)


def get_images_str(imgs: list[Image.Image], batch_size=4):
    res = []
    for i in trange(0, len(imgs), batch_size):
        batch = imgs[i:i+batch_size]
        msgs = [
            get_msg(img)
            for img in batch
        ]
        inputs = get_input(msgs)

        generated_ids = model.generate(
            **inputs,
            max_new_tokens=256
        )
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(
                inputs.input_ids,
                generated_ids
            )
        ]
        res.extend((
            t.replace('\n', '. ')
            for t in processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )
        ))

    return res

In [8]:
data = [
    img
    for _, img, _ in images
]
translations = get_images_str(data, batch_size=3)

translations

  0%|          | 0/5 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


['ДНК РОССИИ БИБЛИОТЕКА ПРОЕКТА',
 'The image depicts a document with checkmarks and a gear icon. The document appears to be a checklist or a list of items, indicated by the checkmarks. The gear icon suggests that this is related to settings or configuration, possibly indicating a process for managing or adjusting settings based on the checklist.',
 'The image depicts a document with checkmarks and a gear icon. The document appears to be a checklist or a list of items, indicated by the checkmarks. The gear icon suggests that this is related to settings or configuration, possibly indicating a process for managing or adjusting settings based on the checklist.',
 'И',
 'Новое знание. . Индивидуальное. . Сознание. Личность. I Человек. Тело. Свобода. . Сохранение. . Вера (Надежда). . Размножение. . Коллективное. . Методы научного познания. АСТ. Герменевтика. Феноменология. Диалектика. . Методология. Логика. Аргументация. . IV Язык. . IX История. Современность. Цивилизация. . VIII Природа. К

In [9]:
txt = result.text_content
for (file_name, _, orig_str), text in zip(images, translations):
    new_str = f'![{text}](media/{file_name})'
    txt = txt.replace(orig_str, new_str)

In [10]:
with open('./__output__/study_fies_no_uri.md', 'w', encoding='utf-8') as f:
    f.writelines(txt)

In [11]:
for file_name, img,_ in images:
    img.save(f'./__output__/media/{file_name}')