In [None]:
from typing import List

from langchain_core.documents import Document

In [None]:
# Relative path to the PDF that the cells below load and split into sections.
file_path = '../../rzd_data/2Положение_о_молодом_специалисте_ОАО.pdf'

In [None]:
from langchain_community.document_loaders import PDFMinerLoader

# Load the PDF with PDFMinerLoader and show the first returned Document as
# this cell's output.
loader = PDFMinerLoader(file_path)
docs = loader.load()
docs[0]

In [None]:
import re
from pypdf import PdfReader
from uuid import uuid4
from langchain_community.document_loaders import UnstructuredPDFLoader
# NOTE(review): nothing visible in this notebook reads this loader —
# extract_sections below builds its own PDFMinerLoader. The assignment also
# rebinds the `loader` name set in the earlier cell; confirm it is still needed.
loader = UnstructuredPDFLoader(file_path)

# Patterns used by extract_sections, compiled once at definition time.
# A top-level section starts at a line beginning with "Раздел <N>.".
_TOP_SECTION_SPLIT_RE = re.compile(r'\n(?=Раздел \d+\.)')
# A top-level heading runs up to the first "<N>.<M>." line (or the end of the chunk).
_TOP_SECTION_NAME_RE = re.compile(r'(Раздел \d+\..+?)(?=\n\d+\.\d+\.|\Z)', re.DOTALL)
# A subsection starts at a line beginning with "<N>.<M>.".
_SUBSECTION_SPLIT_RE = re.compile(r'\n(?=\d+\.\d+\.)')
# The "<N>.<M>." number that opens a subsection.
_SUBSECTION_NUMBER_RE = re.compile(r'(\d+\.\d+\.)')


def extract_sections(pdf_path, act_name: str = 'Коллективный договор') -> List[dict]:
    """Split a PDF into numbered subsections grouped by top-level section.

    The extracted text is cut into top-level sections at lines that start with
    "Раздел <N>.", and every such section is cut into subsections at lines that
    start with "<N>.<M>.". Chunks that do not begin with a "Раздел <N>."
    heading (for example text before the first section) are skipped, and a
    section without any "<N>.<M>." line yields no records.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PDFMinerLoader``.
        act_name: Value stored under ``'act_name'`` in every record. Defaults
            to the value that used to be hard-coded, so existing callers get
            the same records as before.

    Returns:
        A list with one dict per subsection, holding the keys ``'content'``
        (subsection text without its number), ``'top_level_section'``
        (stripped heading of the enclosing section), ``'section_number'``
        (e.g. ``'1.2.'``), ``'act_name'`` and ``'act_id'``. ``'act_id'`` is a
        UUID4 string generated once per call and shared by all its records.
        The list is empty when the loader returns no text or nothing matches.
    """
    docs = PDFMinerLoader(pdf_path).load()
    act_id = str(uuid4())

    # Use the text of every returned document, not only the first one: for a
    # single document this equals docs[0].page_content, for several documents
    # (e.g. one per page) no text is silently dropped, and for an empty result
    # it yields '' instead of raising IndexError.
    text = '\n'.join(doc.page_content for doc in docs)

    all_sections = []
    for top_section in _TOP_SECTION_SPLIT_RE.split(text):
        top_match = _TOP_SECTION_NAME_RE.match(top_section)
        if not top_match:
            # Chunk does not start with a "Раздел <N>." heading.
            continue
        top_section_name = top_match.group(1).strip()

        # The first piece is the heading itself, so only the rest are subsections.
        for subsection in _SUBSECTION_SPLIT_RE.split(top_section)[1:]:
            number_match = _SUBSECTION_NUMBER_RE.match(subsection)
            if not number_match:
                continue
            section_number = number_match.group(1)
            all_sections.append({
                'content': subsection[len(section_number):].strip(),
                'top_level_section': top_section_name,
                'section_number': section_number,
                'act_name': act_name,
                'act_id': act_id,
            })

    return all_sections


sections = extract_sections(file_path)

# Build one Document per section. The section text goes into page_content
# only: passing the whole record as metadata would also copy the text under
# the 'content' key and store every section twice in the vector store.
docs = [
    Document(
        page_content=section['content'],
        metadata={key: value for key, value in section.items() if key != 'content'},
    )
    for section in sections
]

In [None]:
# Number of documents currently bound to `docs`.
len(docs)

In [None]:
# Preview every document: its section number, its top-level section and the
# first 100 characters of its text, followed by an empty separator line.
for doc in docs:
    print(
        f"Секция {doc.metadata['section_number']}",
        f"Верхнеуровневый раздел: {doc.metadata['top_level_section']}",
        f"Содержание: {doc.page_content[:100]}...",
        "",
        sep="\n",
    )

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model passed to the Chroma vector store below as its embedding function.
# NOTE(review): the "-instruct" e5 models are documented as expecting an
# instruction prefix on queries — confirm whether retrieval queries need it.
embedding_function = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")

In [None]:
from langchain_chroma import Chroma

# Chroma collection "rzd_base", embedded with `embedding_function` and
# persisted under ./chroma_data.
vectorstore = Chroma(
    collection_name="rzd_base",
    embedding_function=embedding_function,
    persist_directory='./chroma_data',
)

In [None]:
# Index the section documents in the vector store.
# NOTE(review): no ids are passed, so re-running this cell presumably appends
# the same documents again to the persisted collection — confirm and
# deduplicate if needed.
vectorstore.add_documents(documents=docs)
# Expose the store as a retriever; 'k' requests up to 50 results per query.
retriever = vectorstore.as_retriever(search_kwargs={'k': 50})