In [None]:
# prepare your env
# pip install -U FlagEmbedding

In [4]:
import io
import logging
import os
import pickle

from dotenv import load_dotenv
# from FlagEmbedding import BGEM3FlagModel
from minio import Minio
# from transformers import AutoTokenizer

# from utils import merge_paragraphs, smart_chunking, split_large_paragraph

In [5]:
# Pull the variables defined in a local .env file into the process environment.
load_dotenv()

# Object-storage credentials come from the environment; they are None when unset.
YANDEX_CLOUD_ACCESS_KEY = os.environ.get("YANDEX_CLOUD_ACCESS_KEY")
YANDEX_CLOUD_SECRET_KEY = os.environ.get("YANDEX_CLOUD_SECRET_KEY")

# Bucket that holds the registry, the per-document pickles and the embeddings.
BUCKET_NAME = "rag-project"

# Send INFO-and-above log records to a file instead of the notebook output.
logging.basicConfig(level=logging.INFO, filename="app_t2e.log")

# 1. Load model and docs

In [5]:
# Text is converted to embeddings with BAAI/bge-m3; the model and its
# tokenizer are both loaded from the same checkpoint identifier.
BGE_M3_CHECKPOINT = "BAAI/bge-m3"

model = BGEM3FlagModel(BGE_M3_CHECKPOINT, use_fp16=True)

tokenizer = AutoTokenizer.from_pretrained(BGE_M3_CHECKPOINT)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [6]:
# Object-storage client for storage.yandexcloud.net over HTTPS, authenticated
# with the key pair read from the environment.
client = Minio(
    endpoint="storage.yandexcloud.net",
    secure=True,
    access_key=YANDEX_CLOUD_ACCESS_KEY,
    secret_key=YANDEX_CLOUD_SECRET_KEY,
)

In [10]:
# Download the registry of documents used for RAG from the bucket and unpickle
# it. The registry is a dict of the form {paper name: file name}.
# NOTE(review): pickle.load can execute code embedded in the file - only use it
# on objects from a bucket you trust.
registry_file = "paper_dict.pkl"

client.fget_object(
    file_path=registry_file,
    object_name=registry_file,
    bucket_name=BUCKET_NAME,
)

with open(registry_file, "rb") as registry_handle:
    paper_dict = pickle.load(registry_handle)

paper_dict

{'DeepSeek-V3': '2412.19437.pdf',
 'LIMO: Less is More for Reasoning': '2502.03387.pdf',
 'DeepSeek-R1': '2501.12948v1.pdf',
 'Nature-Inspired Population-Based Evolution of Large Language Models': '2503.01155v1.pdf',
 'START: Self-taught Reasoner with Tools': '2503.04625v2.pdf',
 'Large Language Diffusion Models': '2502.09992v2.pdf',
 'Proving Olympiad Inequalities by Synergizing LLMs and Symbolic Reasoning': '2502.13834v3.pdf',
 'STP: Self-play LLM Theorem Provers with Iterative Conjecturing and Proving': '2502.00212v3.pdf',
 'MathConstruct: Challenging LLM Reasoning with Constructive Proofs': '2502.10197v1.pdf',
 'Training Language Models for Social Deduction with Multi-Agent Reinforcement Learning': '2502.06060v1.pdf',
 'LLMs Can Easily Learn to Reason from Demonstrations Structure, not content, is what matters!': '2502.07374v2.pdf',
 'Competitive Programming with Large Reasoning Models': '2502.06807v2.pdf',
 "MATH-Perturb: Benchmarking LLMs' Math Reasoning Abilities against Hard Pe

In [18]:
# Pick one paper from the registry to walk through the steps by hand.
sample_paper_title = "DeepSeek-V3"
doc_name = paper_dict[sample_paper_title]

In [29]:
# Replace the dots inside the file stem with underscores and keep the real
# extension, e.g. '2412.19437.pdf' -> '2412_19437.pdf'. os.path.splitext is
# used instead of slicing off the last four characters, so the result does not
# depend on the extension being exactly '.pdf'. Re-running the cell leaves an
# already converted name unchanged.
_stem, _extension = os.path.splitext(doc_name)
doc_name = _stem.replace('.', '_') + _extension

'2412_19437.pdf'

In [26]:
# Display the current value of doc_name.
doc_name

'2412_19437.pdf'

In [None]:
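# Object-name variants being compared for this document:
#   pdf_2412_19437.pdf  (dots in the stem replaced by underscores)
#   pdf_2412.19437.pdf  (original dotted file name)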

In [28]:
# Preview the object name that is requested from the bucket for this document.
f"pdf_{doc_name}"

'pdf_2412.19437.pdf'

In [30]:
# Download the pickled document (object name 'pdf_' + file name) next to the
# notebook, then unpickle it. The loaded object is later iterated with
# .values(), one text per page.
# NOTE(review): pickle.load can execute code embedded in the file - only use it
# on objects from a bucket you trust.
local_copy = doc_name

client.fget_object(
    file_path=local_copy,
    object_name=f"pdf_{doc_name}",
    bucket_name=BUCKET_NAME,
)

with open(local_copy, "rb") as pickled_pages:
    cur_pdf = pickle.load(pickled_pages)

In [31]:
# Display the unpickled document to check what was loaded.
cur_pdf

{0: 'DeepSeek-V3 Technical Report\nDeepSeek-AI\nresearch@deepseek.com\nAbstract\nWe present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its fu

# 2. Pipeline for converting text to embeddings

In [61]:
def _normalize_doc_name(file_name):
    """Return ``file_name`` with the dots of its stem replaced by underscores.

    A trailing ``.pdf`` extension is preserved, e.g.
    ``'2412.19437.pdf' -> '2412_19437.pdf'``. Names without that extension get
    every dot replaced. Unlike a blanket ``replace('_pdf', '.pdf')``, a
    ``_pdf`` fragment in the middle of the name is left alone.
    """
    stem, extension = os.path.splitext(file_name)
    if extension != '.pdf':
        return file_name.replace('.', '_')
    return stem.replace('.', '_') + extension


# Result of the pipeline:
#   {normalized doc name: [chunks, dense embeddings, paper name]}
dict_all_embed = dict()

for paper_name, doc_name in paper_dict.items():

    logging.info('Start ------------------------------------------')
    logging.info('Paper_name: %s, doc_name: %s', paper_name, doc_name)

    doc_name = _normalize_doc_name(doc_name)
    # One name for both the bucket object and the local copy, so the two can
    # no longer drift apart ('pdf' + name vs 'pdf_' + name).
    name_object = 'pdf_' + doc_name
    # NOTE(review): absolute '/content' path kept from the original cell;
    # confirm it exists in the environment where the notebook runs.
    local_path = f'/content/{name_object}'

    logging.info('1. Load from S3')
    client.fget_object(
        bucket_name=BUCKET_NAME,
        object_name=name_object,
        file_path=local_path,
    )

    # NOTE(review): pickle.load can execute code embedded in the file - only
    # use it on objects from a bucket you trust.
    with open(local_path, 'rb') as file:
        cur_pdf = pickle.load(file)

    logging.info('2. Concat full text of paper')
    # Every page is prefixed with the paper name before the pages are joined.
    full_txt_cur_pdf = ''.join(
        f' ({paper_name})' + page for page in cur_pdf.values()
    )
    paragraphs = full_txt_cur_pdf.split('\n')

    logging.info('--- Cnt paragraphs: %s', len(paragraphs))
    # len() of a string counts characters, not words.
    logging.info('--- Char cnt: %s', len(full_txt_cur_pdf))

    logging.info('3. Get chunks')
    chunks_cur_pdf = smart_chunking(
        paragraphs_text=paragraphs,
        tokenizer=tokenizer,
    )

    len_c = len(chunks_cur_pdf)
    logging.info('--- Cnt chunks: %s', len_c)

    logging.info('4. Get embeding')
    output_model = model.encode(
        chunks_cur_pdf,
        batch_size=24,
        max_length=1024,
    )['dense_vecs']

    logging.info('--- Embeddings shape: %s', output_model.shape)
    # The sanity-check dot products need at least two rows; a document that
    # yields a single chunk would otherwise fail on output_model[1].
    if len(output_model) > 1:
        logging.info(
            '--- Similarity first & second chunk: %s',
            output_model[0] @ output_model[1],
        )
        logging.info(
            '--- Similarity first & last chunk: %s',
            output_model[0] @ output_model[-1],
        )

    logging.info('5. Save embedings')
    dict_all_embed[doc_name] = [chunks_cur_pdf, output_model, paper_name]


In [64]:
# One line per document: its key and the shape of its embedding matrix
# (index 1 of the stored entry).
for doc_key, entry in dict_all_embed.items():
    vectors = entry[1]
    print(doc_key, vectors.shape)

2412_19437.pdf (57, 1024)
2502_03387.pdf (169, 1024)
2501_12948v1.pdf (21, 1024)
2503_01155v1.pdf (24, 1024)
2503_04625v2.pdf (38, 1024)
2502_09992v2.pdf (31, 1024)
2502_13834v3.pdf (34, 1024)
2502_00212v3.pdf (28, 1024)
2502_10197v1.pdf (30, 1024)
2502_06060v1.pdf (17, 1024)
2502_07374v2.pdf (21, 1024)
2502_06807v2.pdf (40, 1024)
2502_06453v2.pdf (25, 1024)
2502_03793v2.pdf (16, 1024)


In [63]:
# Inspect the embedding matrix (index 1 of the stored entry) of one document.
dict_all_embed['2502_03387.pdf'][1]

array([[-0.04214 , -0.01968 , -0.03366 , ..., -0.04382 ,  0.04047 ,
         0.00379 ],
       [-0.03223 , -0.00601 , -0.00564 , ..., -0.0243  ,  0.05484 ,
         0.00716 ],
       [-0.03903 ,  0.009674, -0.002354, ..., -0.02846 ,  0.0701  ,
         0.004013],
       ...,
       [-0.001999,  0.02495 , -0.0363  , ..., -0.0241  ,  0.03128 ,
         0.01354 ],
       [-0.01012 ,  0.03992 , -0.02344 , ..., -0.02553 ,  0.03745 ,
         0.0172  ],
       [-0.02953 , -0.02821 , -0.0337  , ..., -0.00504 ,  0.0402  ,
         0.02956 ]], shape=(169, 1024), dtype=float16)

# 3. Upload to S3 and verify the round trip

In [69]:
# Serialize the whole embeddings dictionary and upload it to the bucket as a
# single object.
object_key = 'dict_all_embed.pkl'
pickle_data = pickle.dumps(dict_all_embed)

# put_object is given a readable stream together with its exact byte length,
# so the in-memory bytes are wrapped in io.BytesIO. The call stays the last
# expression of the cell so that its result is displayed.
client.put_object(
    bucket_name=BUCKET_NAME,
    object_name=object_key,
    data=io.BytesIO(pickle_data),
    length=len(pickle_data),
    content_type="application/octet-stream",
)

<minio.helpers.ObjectWriteResult at 0x7f4d6258f2b0>

In [78]:
# Download the uploaded pickle again and load it into dict_all_embed2.
# Each value is the list written by the pipeline loop:
# [chunks, embeddings array, paper name].
# NOTE(review): pickle.load can execute code embedded in the file - only use it
# on objects from a bucket you trust.
object_name = 'dict_all_embed.pkl'
local_copy = f'/content/{object_name}'

client.fget_object(
    file_path=local_copy,
    object_name=object_name,
    bucket_name=BUCKET_NAME,
)

with open(local_copy, 'rb') as pickled_embeddings:
    dict_all_embed2 = pickle.load(pickled_embeddings)

In [79]:
# Round-trip check: the embeddings downloaded from the bucket must equal the
# ones computed in this session. Each dictionary value is a list
# [chunks, embeddings, paper_name], so the arrays at index 1 are compared
# directly; comparing the two lists with == would force a truth test on an
# element-wise array comparison and fail.
check_key = '2412_19437.pdf'
embeddings_computed = dict_all_embed[check_key][1]
embeddings_restored = dict_all_embed2[check_key][1]

# The minimum over the element-wise equality is 1 only when every value matches.
((embeddings_restored == embeddings_computed) * 1).min()

np.int64(1)