# AI

In [None]:
# Prepare your env for Qwen/Qwen2.5-VL-3B-Instruct first!
# pip install git+https://github.com/huggingface/transformers accelerate
# pip install minio load-dotenv pdfplumber qwen-vl-utils[decord]==0.0.8

In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from PIL import Image
from minio import Minio
from dotenv import load_dotenv
from utils import get_txt_from_doc, pdf_to_images, clean_text

import pdfplumber
import torch
import re
import pickle
import io
import os
import logging


In [2]:
# Write log records of level INFO and above to app.log.
logging.basicConfig(level=logging.INFO, filename='app.log')

# Populate the process environment via python-dotenv before reading the keys below.
load_dotenv()

# Yandex Cloud object-storage credentials; each is None when the variable is unset.
YANDEX_CLOUD_ACCESS_KEY = os.environ.get("YANDEX_CLOUD_ACCESS_KEY")
YANDEX_CLOUD_SECRET_KEY = os.environ.get("YANDEX_CLOUD_SECRET_KEY")

In [5]:
# S3 bucket that holds the paper registry and the extracted page texts.
BUCKET_NAME = 'rag-project'

# Run on the first CUDA device when one is visible, otherwise on the CPU.
# (Author's note: the notebook was run on an RTX 4070S Ti.)
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
DEVICE

'cuda:0'

# 1. Prepare MinIO, PDF, model

In [4]:
# Vision-language model used to transcribe page images into text.
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

# Load the weights in bfloat16; device placement is delegated to device_map="auto".
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.bfloat16
)

# Default processor shipped with the same checkpoint.
processor = AutoProcessor.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
# S3-compatible client for Yandex Object Storage, connecting over TLS
# with the credentials read from the environment.
client = Minio(
    endpoint="storage.yandexcloud.net",
    secure=True,
    access_key=YANDEX_CLOUD_ACCESS_KEY,
    secret_key=YANDEX_CLOUD_SECRET_KEY,
)

In [7]:
# Download the registry of documents for RAG from S3.
# It is a pickled dict of the form {paper name: file name}.
registry_path = '/content/paper_dict.pkl'
client.fget_object(
    bucket_name=BUCKET_NAME,
    object_name='paper_dict.pkl',
    file_path=registry_path,
)

# NOTE(review): pickle.load can execute arbitrary code - only unpickle objects
# from a bucket you control.
with open(registry_path, "rb") as registry_file:
    paper_dict = pickle.load(registry_file)

paper_dict

{'DeepSeek-V3': '2412.19437.pdf',
 'LIMO: Less is More for Reasoning': '2502.03387.pdf',
 'DeepSeek-R1': '2501.12948v1.pdf',
 'Nature-Inspired Population-Based Evolution of Large Language Models': '2503.01155v1.pdf',
 'START: Self-taught Reasoner with Tools': '2503.04625v2.pdf',
 'Large Language Diffusion Models': '2502.09992v2.pdf',
 'Proving Olympiad Inequalities by Synergizing LLMs and Symbolic Reasoning': '2502.13834v3.pdf',
 'STP: Self-play LLM Theorem Provers with Iterative Conjecturing and Proving': '2502.00212v3.pdf',
 'MathConstruct: Challenging LLM Reasoning with Constructive Proofs': '2502.10197v1.pdf',
 'Training Language Models for Social Deduction with Multi-Agent Reinforcement Learning': '2502.06060v1.pdf',
 'LLMs Can Easily Learn to Reason from Demonstrations Structure, not content, is what matters!': '2502.07374v2.pdf',
 'Competitive Programming with Large Reasoning Models': '2502.06807v2.pdf',
 "MATH-Perturb: Benchmarking LLMs' Math Reasoning Abilities against Hard Pe

# 2. Get text from PDF

In [8]:
# Instruction sent to the model together with every page image.
prompt = (
    "Convert a document from an image to a text format, "
    "add detail information from the charts and table to text, "
    "dont write info not from document."
)

In [9]:
# Pipeline: PDF -> Image -> {page_num: Text} -> S3
#
# For every paper in `paper_dict` (except the ones listed in SKIP_PAPERS):
#   1. render the local file papers/<doc_name> into page images,
#   2. ask the vision-language model to transcribe each page,
#   3. pickle the resulting {page index: text} dict and upload it to the bucket.

# Papers this run does not touch (presumably processed earlier - TODO confirm).
SKIP_PAPERS = {'DeepSeek-V3', 'LIMO: Less is More for Reasoning', 'DeepSeek-R1'}

for paper_name, doc_name in paper_dict.items():
    if paper_name in SKIP_PAPERS:
        continue

    # Paper title with spaces and hyphens turned into underscores; used for logging only.
    safe_paper_name = paper_name.replace(' ', '_').replace('-', '_')

    # File name -> dict/object name, e.g. '2502.03793v2.pdf' -> 'pdf_2502_03793v2'.
    # Strip the extension only when it is really there instead of blindly cutting 4 chars.
    doc_stem = doc_name[:-4] if doc_name.lower().endswith('.pdf') else doc_name
    pdf_name = 'pdf_' + doc_stem.replace('.', '_')

    logging.info('Start ------------------------------------------')
    # Log the actual name of the per-paper dict (the old message printed a literal '"pdf_"+').
    logging.info(f'paper_name: {safe_paper_name}, doc_name: {doc_name}, dict_name: {pdf_name}')

    logging.info('1. Convert PDF page to image')
    images = pdf_to_images(pdf_path=f'papers/{doc_name}')

    # Dict for this paper: key - zero-based page index, value - cleaned page text.
    page_texts = {}
    # Kept for backward compatibility: the dict is still reachable as a global
    # named after the paper (e.g. pdf_2502_03793v2), as in the original notebook.
    globals()[pdf_name] = page_texts

    logging.info('2. Get text from paper whit Qwen2.5-VL-3B-Instruct')
    for pg_num, page in enumerate(images):
        output_text = get_txt_from_doc(
            image=page,
            prompt=prompt,
            model=model,
            processor=processor,
            device=DEVICE,
        )
        # Only the first element of the helper's result is kept for the page.
        page_texts[pg_num] = clean_text(output_text[0])

    logging.info('3. Put dict whits paper content to s3')
    pickle_data = pickle.dumps(page_texts)
    # NOTE(review): the object is a pickle but is stored with a '.pdf' suffix;
    # the key is left unchanged because the check cells download it by this name.
    object_key = pdf_name + '.pdf'

    client.put_object(
        bucket_name=BUCKET_NAME,
        object_name=object_key,
        data=io.BytesIO(pickle_data),
        length=len(pickle_data),
        content_type="application/octet-stream",
    )

# 3. Check

In [10]:
# Fetch the extracted text of one paper back from S3 to inspect it.
name_object = 'pdf_2502_03793v2.pdf'
local_path = f'/content/{name_object}'
client.fget_object(
    bucket_name=BUCKET_NAME,
    object_name=name_object,
    file_path=local_path,
)

# Despite the .pdf suffix the object is a pickle.
# NOTE(review): pickle.load can execute arbitrary code - trusted bucket only.
with open(local_path, "rb") as pickled_pages:
    pdf_2502_03793v2_s3 = pickle.load(pickled_pages)

# Expected to be a dict keyed by page index.
pdf_2502_03793v2_s3

{0: "Abstract\nWhile encoder-only models such as BERT and ModernBERT are ubiquitous in real-world NLP applications, their conventional reliance on task-specific classification heads can limit their applicability compared to decoder-based large language models (LLMs). In this work, we introduce ModernBERT-Large-Instruct, a 0.4B-parameter encoder model that leverages its masked language modeling (MLM) head for generative classification. Our approach employs an intentionally simple training loop and inference mechanism that requires no heavy pre-processing, highly engineered prompting, or architectural modifications. ModernBERT-Large-Instruct exhibits strong zero-shot performance on both classification and knowledge-based tasks, outperforming similarly sized LLMs on MMLU and achieving 93% of Llama-3.1B's MMLU performance with 60% less parameters. We also demonstrate that, when fine-tuned, the generative approach using the MLM head matches or even surpasses traditional classification-head 

In [11]:
# Fetch the extracted text of one paper back from S3 to inspect it.
name_object = 'pdf_2502_07374v2.pdf'
local_path = f'/content/{name_object}'
client.fget_object(
    bucket_name=BUCKET_NAME,
    object_name=name_object,
    file_path=local_path,
)

# Despite the .pdf suffix the object is a pickle.
# NOTE(review): pickle.load can execute arbitrary code - trusted bucket only.
with open(local_path, "rb") as pickled_pages:
    pdf_2502_07374v2_s3 = pickle.load(pickled_pages)

# Expected to be a dict keyed by page index.
pdf_2502_07374v2_s3

{0: 'Abstract\nLarge reasoning models (LRMs) tackle complex reasoning problems by following long chain-of-thoughts (Long CoT) that incorporate reflection, backtracking, and self-validation. However, the training techniques and data requirements to elicit Long CoT remain poorly understood. In this work, we find that a Large Language Model (LLM) can effectively learn Long CoT reasoning through data-efficient supervised fine-tuning (SFT) and parameter-efficient low-rank adaptation (LoRA). With just 17k long CoT training samples, the Qwen2.5-32B-Instruct model achieves significant improvements on a wide range of math and coding benchmarks, including 56.7% (+40.0%) on AIME 2024 and 57.0% (+8.1%) on LiveCodeBench, competitive with the proprietary o1-preview model\'s score of 44.6% and 59.1%.\nMore importantly, we find that the structure of Long CoT is critical to the learning process, whereas the content of individual reasoning steps has minimal impact. Perturbations affecting content, such 