Как создать свою БД:  
- Установите PostgreSQL и расширение pgvector  
- С помощью команд ниже подключите расширение vector и создайте таблицу documents

PostgreSQL: (в cmd)  
psql -h localhost -d appdb -U appuser  
CREATE EXTENSION IF NOT EXISTS vector;  
CREATE TABLE documents (id SERIAL PRIMARY KEY, doc_id TEXT, chunk_id INT, content TEXT, embedding vector(1024), metadata JSONB);  

Также создайте таблицу chat_history для будущего логирования запросов.  
CREATE TABLE chat_history (id SERIAL PRIMARY KEY, user_id TEXT, doc_id TEXT, user_message TEXT, rephrased_message TEXT, assistant_message TEXT, timestamp TIMESTAMP, sources_ids TEXT, chunks TEXT); 

In [None]:
# Config
# Each setting can be overridden through an environment variable, so that
# machine-specific paths and database credentials do not have to be edited
# (or committed) in the notebook. The fallbacks are the original values.
import os

# Folder whose entries are ingested by load_docs().
FOLDER_PATH = os.environ.get("RAG_FOLDER_PATH", "/wrk/data")
# Connection string handed to psycopg2.connect().
DB_CONN = os.environ.get(
    "RAG_DB_CONN",
    "dbname=appdb user=appuser password=secret port=5433 host=10.101.10.106",
)
# Local directory the embedding tokenizer and model are loaded from.
MODEL_PATH = os.environ.get(
    "RAG_MODEL_PATH",
    "/wrk/models/models--intfloat--multilingual-e5-large-instruct/snapshots/274baa43b0e13e37fafa6428dbc7938e62e5c439",
)

In [2]:
# Model uploading
from transformers import AutoTokenizer, AutoModel
import torch

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the encoder from MODEL_PATH and move the encoder
# onto the selected device.
emb_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
emb_model = AutoModel.from_pretrained(MODEL_PATH).to(device)

# Switch to eval mode; kept as the last expression of the cell so the
# notebook still displays the returned module.
emb_model.eval()

  from .autonotebook import tqdm as notebook_tqdm


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, eleme

In [None]:
# Loading, parsing and creating embeddings functions
import torch.nn.functional as F
import re
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import granite_picture_description
from docling.chunking import HybridChunker
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer

# Text extractor config
# Options for docling's PDF pipeline; they are handed to the DocumentConverter
# built below.
pipeline_options = PdfPipelineOptions()
# OCR is switched off (the Tesseract setup further down is kept commented out).
pipeline_options.do_ocr = False
# Table structure extraction is switched on, with cell matching enabled.
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
# ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True, lang = ['rus'])
# pipeline_options.ocr_options = ocr_options
# Formula enrichment and picture description are both switched on.
pipeline_options.do_formula_enrichment = True
pipeline_options.do_picture_description = True
# NOTE(review): this assigns the imported granite_picture_description object
# itself, so the prompt override on the next line presumably mutates that
# shared object as well — confirm whether a copy is wanted.
pipeline_options.picture_description_options = granite_picture_description
pipeline_options.picture_description_options.prompt = ("Describe the picture.")
pipeline_options.images_scale = 1
pipeline_options.generate_picture_images = True

# Converter that applies the options above to PDF input; no options are
# registered here for any other input format.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
        pipeline_options=pipeline_options)
    }
)

# Chunking config
# MAX_LENGTH is the tokenizer max_length used by embed(). CHUNK_SIZE and
# OVERLAP are the defaults of chunk_text() and are counted in characters,
# because chunk_text() slices the text string directly.
# NOTE(review): embed() tokenizes with truncation=True, so a chunk that encodes
# to more than MAX_LENGTH tokens is presumably embedded only partially —
# confirm that the two limits are compatible.
MAX_LENGTH = 512
CHUNK_SIZE = 1500
OVERLAP = 200

# Ignoring useless tokens
def average_pool(last_hidden_states, attention_mask):
    """Average the token vectors of each sequence, weighted by the attention mask.

    The mask is expanded over the hidden dimension and cast to float, so a
    position whose mask value is 0 adds nothing to the sum. The divisor is
    clamped to 1e-9, which makes a fully masked row come out as zeros instead
    of dividing by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
    token_counts = torch.clamp(weights.sum(dim=1), min=1e-9)
    weighted_total = (last_hidden_states * weights).sum(dim=1)
    return weighted_total / token_counts
    
# Creating embeddings from text
def embed(text: str):
    """Return the L2-normalized embedding of ``text`` as a NumPy vector.

    The text is prefixed with "passage: ", tokenized with truncation at
    MAX_LENGTH, run through the embedding model without gradient tracking,
    mean-pooled with ``average_pool`` and normalized to unit length.

    NOTE(review): the "passage: " prefix looks like the convention of the
    non-instruct E5 models, while MODEL_PATH points at an "-instruct"
    snapshot — check the model card, and keep the query side consistent
    with whatever is used here.
    """
    encoded = emb_tokenizer(
        "passage: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    ).to(device)
    with torch.no_grad():
        hidden = emb_model(**encoded).last_hidden_state
        pooled = average_pool(hidden, encoded["attention_mask"])
        unit = F.normalize(pooled, p=2, dim=1)
    # Only one text is encoded per call, so take row 0 and move it to the host.
    vector = unit[0].cpu().numpy()
    return vector

# Getting text from pdf
def load_doc(path):
    """Convert the file at ``path`` with the module-level converter and return its ``document``."""
    return converter.convert(path).document

# Chunking extracted text
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split ``text`` into overlapping character windows.

    Windows start every ``chunk_size - overlap`` characters and are at most
    ``chunk_size`` characters long, so consecutive windows share ``overlap``
    characters. The last window may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum window length, in characters.
        overlap: Number of characters shared by consecutive windows.

    Returns:
        The list of windows in order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``. The
            window start would then never move forward, so the loop would
            not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [None]:
# Saving data in PostgreSQL function
import psycopg2
import json

# One module-level connection and cursor, opened when this cell runs and
# reused by every save_chunk() call. No close() is visible in this notebook.
conn = psycopg2.connect(DB_CONN)
cur = conn.cursor()

def save_chunk(doc_id, chunk_id, text, metadata=None):
    """Embed ``text`` and insert it as one row of the ``documents`` table.

    Args:
        doc_id: Value stored in the ``doc_id`` column.
        chunk_id: Value stored in the ``chunk_id`` column.
        text: Chunk text; stored in ``content`` and passed to ``embed``.
        metadata: Optional JSON-serializable dict stored in ``metadata``.
            When omitted or ``None``, an empty object is stored.

    Raises:
        Exception: Whatever ``embed`` or the database driver raises. If the
            INSERT or the commit fails, the transaction is rolled back
            before the error is re-raised.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    payload = {} if metadata is None else metadata
    vector = embed(text)
    try:
        cur.execute(
            """
            INSERT INTO documents (doc_id, chunk_id, content, embedding, metadata) VALUES (%s, %s, %s, %s, %s)
            """,
            (doc_id, chunk_id, text, vector.tolist(), json.dumps(payload)),
        )
        conn.commit()
    except Exception:
        # Without a rollback the shared connection stays in an aborted
        # transaction and every later INSERT on it fails as well.
        conn.rollback()
        raise

In [None]:
# Let's go!
import os
from datetime import datetime

def load_docs(folder_path):
    """Convert, chunk and store every entry of ``folder_path``.

    Each entry is converted with ``load_doc``, exported to text, split with
    ``chunk_text`` and saved chunk by chunk with ``save_chunk``. Every chunk
    is prefixed with a header naming the source file. Entries that fail to
    convert are reported and skipped.

    Args:
        folder_path: Directory whose direct entries are processed
            (subdirectories are not descended into).
    """
    for filename in os.listdir(folder_path):
        path = os.path.join(folder_path, filename)

        try:
            doc = load_doc(path)
        except Exception as exc:
            # Not a bare ``except``: an interrupt must still stop the run.
            # The cause is printed because a failed conversion is not always
            # an unsupported format.
            print(f"Формат файла {filename} не поддерживается. ({exc})")
            continue

        chunks = chunk_text(doc.export_to_text())
        header = f'Отрывок из документа: "{filename}"\n\n'

        # Describe the file that was actually read. A hard-coded folder here
        # made getmtime() fail for any other ``folder_path``. The metadata is
        # the same for every chunk of a document, so it is built once.
        modified = datetime.fromtimestamp(os.path.getmtime(path)).strftime("%d.%m.%Y")
        metadata = {"путь": path, "дата_последнего_изменения": modified}

        for i, chunk in enumerate(chunks):
            save_chunk(
                doc_id=filename,
                chunk_id=i,
                text=header + chunk,
                metadata=metadata,
            )

        print(f"{filename}: {len(chunks)} чанков сохранено")

Если в папке лежат документы:

In [None]:
# Ingest every entry found directly inside FOLDER_PATH.
load_docs(FOLDER_PATH)

Если в папке есть другие папки с документами:

In [None]:
# for document in os.listdir(FOLDER_PATH):
#     main_path = os.path.join(FOLDER_PATH, document)
#     load_docs(main_path)#FOLDER_PATH

После загрузки документов проиндексируйте чанки с помощью команды ниже.  
Если вы будете добавлять новые документы, не забывайте удалять индексы и создавать их заново (чтобы проиндексировать все документы).  

CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops);  
    --NOTICE:  ivfflat index created with little data  
    --DETAIL:  This will cause low recall.  
    --HINT:  Drop the index until the table has more data.  
    (DROP INDEX IF EXISTS name)

Затем необходимо добавить колонку и индексы для алгоритма BM25:  
ALTER TABLE documents ADD COLUMN tsv tsvector;  
UPDATE documents SET tsv = to_tsvector('russian', content);  
CREATE INDEX idx_documents_tsv ON documents USING gin(tsv);  

Теперь необходимо создать таблицы для будущего data layer приложения:

CREATE TABLE users (
    "id" UUID PRIMARY KEY,
    "identifier" TEXT NOT NULL UNIQUE,
    "metadata" JSONB NOT NULL,
    "createdAt" TEXT
);

CREATE TABLE IF NOT EXISTS threads (
    "id" UUID PRIMARY KEY,
    "createdAt" TEXT,
    "name" TEXT,
    "userId" UUID,
    "userIdentifier" TEXT,
    "tags" TEXT[],
    "metadata" JSONB,
    FOREIGN KEY ("userId") REFERENCES users("id") ON DELETE CASCADE
);

CREATE TABLE IF NOT EXISTS steps (
    "id" UUID PRIMARY KEY,
    "name" TEXT NOT NULL,
    "type" TEXT NOT NULL,
    "threadId" UUID NOT NULL,
    "parentId" UUID,
    "streaming" BOOLEAN NOT NULL,
    "waitForAnswer" BOOLEAN,
    "isError" BOOLEAN,
    "metadata" JSONB,
    "tags" TEXT[],
    "input" TEXT,
    "output" TEXT,
    "createdAt" TEXT,
    "command" TEXT,
    "start" TEXT,
    "end" TEXT,
    "generation" JSONB,
    "showInput" TEXT,
    "language" TEXT,
    "indent" INT,
    "defaultOpen" BOOLEAN,
    FOREIGN KEY ("threadId") REFERENCES threads("id") ON DELETE CASCADE
);

CREATE TABLE IF NOT EXISTS elements (
    "id" UUID PRIMARY KEY,
    "threadId" UUID,
    "type" TEXT,
    "url" TEXT,
    "chainlitKey" TEXT,
    "name" TEXT NOT NULL,
    "display" TEXT,
    "objectKey" TEXT,
    "size" TEXT,
    "page" INT,
    "language" TEXT,
    "forId" UUID,
    "mime" TEXT,
    "props" JSONB,
    FOREIGN KEY ("threadId") REFERENCES threads("id") ON DELETE CASCADE
);

CREATE TABLE IF NOT EXISTS feedbacks (
    "id" UUID PRIMARY KEY,
    "forId" UUID NOT NULL,
    "threadId" UUID NOT NULL,
    "value" INT NOT NULL,
    "comment" TEXT,
    FOREIGN KEY ("threadId") REFERENCES threads("id") ON DELETE CASCADE
);

Строка подключения к базе данных: postgresql+psycopg2://appuser:secret@10.101.10.106:5433/appdb