Как создать свою БД:  
- Скачайте PostgreSQL и pgvector  
- С помощью команд ниже подключите расширение vector и создайте таблицу documents_e5

PostgreSQL: (в cmd)  
psql -h localhost -d appdb -U appuser  
CREATE EXTENSION IF NOT EXISTS vector;  
CREATE TABLE documents_e5 (id SERIAL PRIMARY KEY, doc_id TEXT, chunk_id INT, content TEXT, embedding vector(1024), metadata JSONB);  

Также создайте таблицу chat_history для будущего логирования запросов.  
CREATE TABLE chat_history (id SERIAL PRIMARY KEY, user_id TEXT, doc_id TEXT, user_message TEXT, rephrased_message TEXT, assistant_message TEXT, timestamp TIMESTAMP, sources_ids TEXT); 

In [None]:
# Config
FOLDER_PATH = "/wrk/data/raw/Документы" # Path to the folder with the documents
# NOTE(review): the password is stored in plain text here; consider reading it from the environment.
DB_CONN = "dbname=appdb user=appuser password=secret port=5432 host=rag-data" # Connection string for the database created above
MODEL_PATH = "/wrk/models/embedding_models/models--intfloat--multilingual-e5-large-instruct/snapshots/274baa43b0e13e37fafa6428dbc7938e62e5c439" # Path to your embedding model

In [3]:
# Model loading
from transformers import AutoTokenizer, AutoModel
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and encoder are both loaded from the same local snapshot.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH).to(device)

# Switch to inference mode (kept as the last expression of the cell).
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, eleme

In [None]:
# Loading, parsing and creating embeddings functions
import pdfplumber
from docx import Document
import torch.nn.functional as F
import textract
import re

MAX_LENGTH = 512 # Maximum number of tokens passed to the tokenizer in embed(); longer inputs are truncated
CHUNK_SIZE = 1500 # Number of characters per chunk (a piece of a document)
OVERLAP = 200 # Number of characters shared by neighbouring chunks (so that the model sees context)

# Ignoring useless tokens
def average_pool(last_hidden_states, attention_mask):
    """Average the hidden states over dimension 1, weighted by the mask.

    Positions where ``attention_mask`` is 0 contribute nothing to the sum
    and are not counted in the divisor.

    Args:
        last_hidden_states: Tensor of hidden states; the mask is broadcast
            to its full size after gaining a trailing dimension.
        attention_mask: Tensor with one entry per position of
            ``last_hidden_states`` (without the last dimension).

    Returns:
        Tensor with dimension 1 reduced away.
    """
    weights = attention_mask.unsqueeze(-1).expand_as(last_hidden_states).float()
    weighted_total = (last_hidden_states * weights).sum(dim=1)
    # The lower bound keeps the division finite when a row is fully masked.
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total
    
# Creating embeddings from text
def embed(text: str):
    """Turn a piece of text into a single L2-normalised embedding vector.

    The text is tokenized (truncated to ``MAX_LENGTH`` tokens), run through
    the module-level ``model`` without gradient tracking, mean-pooled with
    ``average_pool`` and normalised.

    Args:
        text: Text to embed.

    Returns:
        NumPy array holding the embedding of ``text`` (first row of the
        batch), moved to the CPU.
    """
    encoded = tokenizer(
        text,
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = average_pool(hidden_states, encoded['attention_mask'])
        unit_vectors = F.normalize(pooled, p=2, dim=1)

    first_row = unit_vectors[0]
    return first_row.cpu().numpy()

# Getting text from pdf
def load_pdf(path):
    """Extract the text of a PDF file with pdfplumber.

    Args:
        path: Path of the PDF file.

    Returns:
        The text of all pages, each followed by a newline. Pages for which
        ``extract_text()`` returns nothing are skipped.
    """
    with pdfplumber.open(path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(extracted + "\n" for extracted in page_texts if extracted)

# Getting text from docx
def load_docx(path):
    """Extract the paragraph text of a .docx file.

    Args:
        path: Path of the .docx file.

    Returns:
        The text of all paragraphs joined with newlines; paragraphs that
        are empty or whitespace-only are left out.
    """
    paragraphs = Document(path).paragraphs
    non_blank = (para.text for para in paragraphs if para.text.strip())
    return "\n".join(non_blank)

# Getting text from doc
def load_doc(path):
    """Extract the text of a legacy .doc file via textract.

    Args:
        path: Path of the .doc file.

    Returns:
        The extracted bytes decoded as UTF-8, with leading and trailing
        whitespace removed.
    """
    raw_bytes = textract.process(path)
    return raw_bytes.decode("utf-8").strip()

# Cleaning text
def clean_text(text: str) -> str:
    """Normalise extracted document text before chunking.

    Steps, in order:
      1. Runs of three or more ``-``, ``=`` or ``*`` become one space.
      2. A space is inserted after ``. , ! ? ; :`` when the next character
         is not whitespace.
      3. Every run of whitespace (including newlines) collapses to a single
         space, and the result is stripped.

    Whitespace is collapsed last so that the spaces introduced by step 1 do
    not leave multi-space gaps in the output.

    NOTE(review): step 2 also splits punctuation inside numbers, e.g.
    "3.14" becomes "3. 14" -- confirm this is acceptable for the corpus.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The cleaned, single-line text.
    """
    text = re.sub(r'[-=*]{3,}', ' ', text)
    text = re.sub(r'([.,!?;:])([^\s])', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Chunking extracted text
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split text into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart.
    Splitting stops as soon as a window reaches the end of the text, so the
    last chunk is never just a repeat of the previous chunk's tail.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        List of chunks in order of appearance; empty when ``text`` is empty.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``; the
            window would never advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This window already covers the rest of the text; another one
            # would contain only characters that are already in this chunk.
            break
        start += step
    return chunks

In [5]:
# Saving data in PostgreSQL function
import psycopg2
import json

# Module-level connection and cursor, opened once with the DB_CONN settings;
# save_chunk() below executes and commits through these two objects.
conn = psycopg2.connect(DB_CONN)
cur = conn.cursor()

def save_chunk(doc_id, chunk_id, text, metadata=None):
    """Embed one chunk and insert it as a row of ``documents_e5``.

    The row is committed immediately. If the insert fails, the transaction
    is rolled back so that the shared connection stays usable, and the
    database error is re-raised.

    Args:
        doc_id: Identifier of the source document.
        chunk_id: Position of the chunk within the document.
        text: Chunk content; also the input to ``embed``.
        metadata: Optional JSON-serialisable dict stored in the ``metadata``
            column. ``None`` (the default) stores an empty object.

    Raises:
        psycopg2.Error: If the insert or the commit fails.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if metadata is None:
        metadata = {}

    emb = embed(text)
    insert_sql = """
        INSERT INTO documents_e5 (doc_id, chunk_id, content, embedding, metadata) VALUES (%s, %s, %s, %s, %s)
        """
    try:
        cur.execute(
            insert_sql,
            (doc_id, chunk_id, text, emb.tolist(), json.dumps(metadata))
        )
        conn.commit()
    except psycopg2.Error:
        # Without a rollback every later statement on this connection would
        # fail because the transaction is left in an aborted state.
        conn.rollback()
        raise

In [6]:
# Let's go!
import os
from datetime import datetime

def load_docs(folder_path):
    """Extract, clean, chunk and store every supported file in a folder.

    Walks the direct entries of ``folder_path`` (no recursion), picks a
    loader by extension (.pdf, .docx, .doc), cleans and chunks the text and
    stores each chunk through ``save_chunk`` with the file name as
    ``doc_id``. Entries with any other extension are reported and skipped.
    A summary line is printed per processed file.

    Args:
        folder_path: Directory whose entries are ingested.
    """
    for filename in os.listdir(folder_path):
        path = os.path.join(folder_path, filename)
        ext = os.path.splitext(filename)[1].lower()

        if ext == ".pdf":
            text = load_pdf(path)
        elif ext == ".docx":
            text = load_docx(path)
        elif ext == ".doc":
            text = load_doc(path)
        else:
            print(f"Формат файла {filename} не поддерживается.")
            continue

        chunks = chunk_text(clean_text(text))

        # The metadata is the same for every chunk of a file, so it is built
        # once per file instead of once per chunk.
        modified = datetime.fromtimestamp(os.path.getmtime(path))
        metadata = {
            "путь": path,
            "дата_последнего_изменения": modified.strftime("%d.%m.%Y"),
        }

        for i, chunk in enumerate(chunks):
            save_chunk(
                doc_id=filename,
                chunk_id=i,
                text=chunk,
                metadata=metadata,
            )

        print(f"{filename}: {len(chunks)} чанков сохранено")

In [None]:
# Ingest every supported document found in FOLDER_PATH.
# NOTE(review): save_chunk() does a plain INSERT and documents_e5 has no
# uniqueness constraint on (doc_id, chunk_id), so running this cell again
# appears to insert the same chunks a second time -- confirm before re-running.
load_docs(FOLDER_PATH)

После загрузки документов проиндексируйте чанки с помощью команды ниже.  
Если вы будете добавлять новые документы, не забывайте сбрасывать индексы и создавать их заново (чтобы проиндексировать все документы)  

CREATE INDEX ON documents_e5 USING ivfflat (embedding vector_cosine_ops);  
    --NOTICE:  ivfflat index created with little data  
    --DETAIL:  This will cause low recall.  
    --HINT:  Drop the index until the table has more data.  
    (DROP INDEX IF EXISTS name)