In [15]:
! pip3 install requests




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import json
import os

In [None]:
# Character budget for a single chunk; process_text passes it to
# semantic_chunking as `max_chunk_size`.
MAX_CHUNK_SIZE = 3000

In [46]:
! pip install sentence-transformers nltk razdel

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl.metadata (10.0 kB)
Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import numpy as np
from razdel import sentenize
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch


def get_sentence_embeddings(
    sentences: list[str],
    model_name: str = "FacebookAI/xlm-roberta-base",
    batch_size: int = 32,
) -> np.ndarray:
    """Embed each sentence as the mean of its token hidden states.

    Args:
        sentences: Sentences to embed.
        model_name: Hugging Face model identifier used for both the tokenizer
            and the encoder.
        batch_size: Number of sentences encoded per forward pass.

    Returns:
        A 2-D array with one row per input sentence. For an empty input an
        empty ``(0, 0)`` array is returned without loading the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if not sentences:
        # Nothing to embed: avoid the expensive tokenizer/model load.
        return np.empty((0, 0), dtype=np.float32)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()  # make inference mode explicit (no dropout)

    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        # Sentences in a batch are padded to a common length; weight padding
        # positions by zero so each mean covers only the sentence's own tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((token_sum / token_count).numpy())

    return np.concatenate(pooled_batches, axis=0)


def semantic_chunking(text: str, max_chunk_size: int, similarity_threshold: float, context_window: int) -> list[str]:
    """Split ``text`` into chunks of consecutive, semantically related sentences.

    The text is split into sentences, every sentence is embedded, and the
    sentences are then walked in order. A sentence is appended to the current
    chunk when (a) the chunk, including separators, stays within
    ``max_chunk_size`` characters and (b) its highest cosine similarity to the
    neighbouring sentences inside the context window reaches
    ``similarity_threshold``. Otherwise the current chunk is closed and the
    sentence starts a new one.

    Args:
        text: Text to split.
        max_chunk_size: Maximum chunk length in characters, counting the
            separators between sentences. A single sentence longer than this
            limit still becomes a chunk of its own.
        similarity_threshold: Minimum cosine similarity to at least one
            neighbour required to stay in the current chunk.
        context_window: Number of sentences on each side of the current one
            used as neighbours. Values below 1 leave no neighbours, in which
            case the similarity is treated as 0.

    Returns:
        Chunks in document order; sentences inside a chunk are joined with a
        blank line. An empty list if the text contains no sentences.
    """
    separator = "\n\n"

    # Split the text into non-empty, stripped sentences.
    sentences = [s.text.strip() for s in sentenize(text) if s.text.strip()]
    if not sentences:
        return []
    if len(sentences) == 1:
        # A single sentence is always exactly one chunk; no embeddings needed.
        return [sentences[0]]

    embeddings = get_sentence_embeddings(sentences)
    window = max(0, context_window)

    chunks = []
    current_chunk = [sentences[0]]
    current_size = len(sentences[0])

    for i in range(1, len(sentences)):
        # Neighbours within the window, excluding the sentence itself: its
        # similarity to itself is always 1 and would make the threshold
        # check pass unconditionally.
        start_idx = max(0, i - window)
        end_idx = min(len(sentences), i + window + 1)
        neighbour_ids = [j for j in range(start_idx, end_idx) if j != i]

        if neighbour_ids:
            similarities = cosine_similarity([embeddings[i]], embeddings[neighbour_ids])[0]
            max_similarity = float(np.max(similarities))
        else:
            max_similarity = 0.0

        sentence_size = len(sentences[i])
        # Size of the chunk after joining, so the separator is budgeted too.
        joined_size = current_size + len(separator) + sentence_size
        if joined_size <= max_chunk_size and max_similarity >= similarity_threshold:
            current_chunk.append(sentences[i])
            current_size = joined_size
        else:
            # Close the current chunk and start a new one with this sentence.
            chunks.append(separator.join(current_chunk))
            current_chunk = [sentences[i]]
            current_size = sentence_size

    # Flush the last chunk (never empty at this point).
    chunks.append(separator.join(current_chunk))

    return chunks

In [21]:
import re


def process_text(
    text: str,
    source: str = "local_text",
    similarity_threshold: float = 0.80,
    context_window: int = 2,
    output_dir: str = "./chunks",
) -> None:
    """Clean ``text``, split it into semantic chunks and save each chunk as JSON.

    Page-number markers and "city, year" markers are removed first. The
    cleaned text is split with ``semantic_chunking`` using ``MAX_CHUNK_SIZE``
    as the size limit, and every chunk is written to
    ``<output_dir>/chunk_NN.json`` with the keys ``id``, ``source``,
    ``content`` and ``size``.

    Existing files with the same names are overwritten. Files left over from
    an earlier run are not removed, so a run that yields fewer chunks leaves
    stale higher-numbered files in the directory.

    Args:
        text: Raw text to process.
        source: Value stored in the ``source`` field of every chunk.
        similarity_threshold: Passed through to ``semantic_chunking``.
        context_window: Passed through to ``semantic_chunking``.
        output_dir: Directory for the chunk files; created if missing.
    """
    # Strip markers that carry no content.
    text = re.sub(r'Страница \d+ из \d+', '', text)  # page numbers
    text = re.sub(r'г\. [А-Яа-яЁё-]+, \d{4}', '', text)  # city and year
    text = text.strip()

    chunks = semantic_chunking(text, MAX_CHUNK_SIZE, similarity_threshold, context_window)

    os.makedirs(output_dir, exist_ok=True)
    for i, chunk in enumerate(chunks, 1):
        chunk_id = f"chunk_{i:02d}"
        chunk_data = {
            "id": chunk_id,
            "source": source,
            "content": chunk,
            "size": len(chunk)
        }
        # ensure_ascii=False keeps non-ASCII text readable in the JSON file.
        with open(os.path.join(output_dir, f"{chunk_id}.json"), "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=2)

In [20]:
! pip3 install PyPDF2




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import PyPDF2
def read_text_from_pdf(pdf_file_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file_path: Path to the PDF file.

    Returns:
        The text of all pages joined with newlines, or an empty string when
        the path is not an existing regular file or does not end in ``.pdf``
        (case-insensitive).
    """
    # isfile (not exists) so that a directory named "*.pdf" is rejected here
    # instead of failing inside open().
    if not os.path.isfile(pdf_file_path) or not pdf_file_path.lower().endswith(".pdf"):
        return ""
    with open(pdf_file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Defensive: treat a page that yields no text as an empty string.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    return "\n".join(page_texts)

In [17]:
# Make sure the output directory for the chunk files exists.
os.makedirs('./chunks', exist_ok=True)
# Extract the text of the source PDF; read_text_from_pdf returns "" when the
# path does not exist or does not end in ".pdf".
text = read_text_from_pdf('monetaoffer.pdf')

In [31]:
# Write the extracted text to ./res.txt as UTF-8 (overwrites an existing file).
with open('./res.txt', 'w', encoding="utf-8") as f:
    f.write(text)

In [22]:
# Clean the extracted text, split it into chunks and write them to ./chunks
# as JSON files (source label defaults to "local_text").
process_text(text)