PostgreSQL setup (one-time, run in psql before executing the notebook):  
psql -h localhost -d appdb -U appuser  
CREATE EXTENSION IF NOT EXISTS vector;  
CREATE TABLE documents_ruBert (id SERIAL PRIMARY KEY, doc_id TEXT, chunk_id INT, content TEXT, embedding vector(1024), metadata JSONB);  
CREATE INDEX ON documents_ruBert USING ivfflat (embedding vector_cosine_ops);  
    --NOTICE:  ivfflat index created with little data  
    --DETAIL:  This will cause low recall.  
    --HINT:  Drop the index until the table has more data.  

In [None]:
# Config
# Each setting can be overridden through an environment variable so that
# credentials and machine-specific paths do not have to be edited (or committed)
# in the notebook itself. The literals below are only the fallback defaults.
import os

# Root folder; the driver cell treats each entry inside it as a folder of documents.
FOLDER_PATH = os.environ.get("RAG_FOLDER_PATH", "/wrk/data/raw/Рефераты")
# Connection string handed to psycopg2.connect().
DB_CONN = os.environ.get(
    "RAG_DB_CONN",
    "dbname=appdb user=appuser password=secret port=5432 host=rag-data",
)
# Path passed to from_pretrained() for both the tokenizer and the model.
MODEL_PATH = os.environ.get(
    "RAG_MODEL_PATH",
    "/wrk/models/embedding_models/models--ai-forever--ruBert-large/snapshots/efdc76b4678bc5c9a51642a4a5364371a89cea96",
)

In [2]:
# Model loading
from transformers import AutoTokenizer, AutoModel
import torch

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and model are both loaded from MODEL_PATH; the model is moved to
# the selected device right away.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH).to(device)

# Switch to evaluation mode. Kept as the last expression, so its return value
# (the model) is what the cell displays.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(120138, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1,

In [3]:
# Functions for loading, parsing and embedding documents
import pdfplumber
from docx import Document
import torch.nn.functional as F
import textract

# Token limit passed to the tokenizer in embed(); longer inputs are truncated.
MAX_LENGTH = 512
# Default chunk length used by chunk_text(), counted in characters (it slices a str).
# NOTE(review): a 1500-character chunk may tokenize to more than MAX_LENGTH tokens,
# in which case its tail would not contribute to the embedding - TODO confirm.
CHUNK_SIZE = 1500
# Default number of characters shared by two consecutive chunks in chunk_text().
OVERLAP = 200

# Mean pooling that ignores masked-out positions
def average_pool(last_hidden_states, attention_mask):
    """Average the hidden states along dim 1, counting only unmasked positions.

    Args:
        last_hidden_states: tensor of per-position hidden states; dim 1 is the
            one that gets reduced.
        attention_mask: tensor with one entry per position (1 = keep, 0 = ignore);
            it is given a trailing axis and converted to float before use.

    Returns:
        The masked sum over dim 1 divided by the number of kept positions.
        The divisor is clamped to at least 1e-9, so a fully masked row gives
        zeros instead of a division by zero.
    """
    # A trailing singleton axis lets the mask broadcast over the hidden size.
    weights = attention_mask.unsqueeze(-1).float()
    masked_total = (last_hidden_states * weights).sum(dim=1)
    kept_positions = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / kept_positions
    
# Creating embeddings from text
def embed(text: str):
    """Turn a piece of text into a single L2-normalized embedding vector.

    The text is tokenized with truncation at MAX_LENGTH tokens, run through the
    model without gradient tracking, mean-pooled with average_pool() using the
    attention mask, and normalized to unit L2 norm.

    Args:
        text: the text to embed.

    Returns:
        A NumPy array holding the first (and, for a single input string,
        presumably only) row of the normalized batch, copied to the CPU.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = average_pool(hidden_states, encoded["attention_mask"])
        unit_vectors = F.normalize(pooled, p=2, dim=1)

    first_row = unit_vectors[0]
    return first_row.cpu().numpy()

# Getting text from pdf
def load_pdf(path):
    """Extract the text of a PDF file page by page.

    Args:
        path: location of the PDF file.

    Returns:
        The text of every page for which extract_text() returned something
        truthy, each followed by a newline, concatenated in page order.
        An empty string when no page yields text.
    """
    with pdfplumber.open(path) as pdf:
        # Extraction happens while the file is still open.
        per_page = [page.extract_text() for page in pdf.pages]
    # Pages without extractable text (None or "") are skipped.
    return "".join(f"{piece}\n" for piece in per_page if piece)

# Getting text from docx
def load_docx(path):
    """Extract the text of a .docx file.

    Args:
        path: location of the .docx file.

    Returns:
        The text of all paragraphs that are not empty or whitespace-only,
        joined with newlines in document order.
    """
    kept = []
    for paragraph in Document(path).paragraphs:
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)

# Getting text from doc
def load_doc(path):
    """Extract the text of a .doc file via textract.

    Args:
        path: location of the file.

    Returns:
        The output of textract.process() decoded as UTF-8, with leading and
        trailing whitespace removed.
    """
    raw_bytes = textract.process(path)
    return raw_bytes.decode("utf-8").strip()

# Chunking extracted text
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split text into overlapping, fixed-size character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so each
    window repeats the last ``overlap`` characters of the previous one.

    Args:
        text: the string to split.
        chunk_size: maximum number of characters per chunk.
        overlap: number of characters shared by neighbouring chunks.

    Returns:
        A list of chunks in order of appearance; empty for empty text.
        Chunking stops as soon as a chunk reaches the end of the text, so no
        trailing chunk consisting only of already-covered overlap is produced.

    Raises:
        ValueError: if ``overlap >= chunk_size``. The window would then never
            move forward and the loop would append chunks until memory runs out.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # This chunk already contains the end of the text; another window would
        # only repeat its tail.
        if end >= len(text):
            break
    return chunks

In [4]:
# Function for saving data to PostgreSQL
import psycopg2
import json

# Module-level connection and cursor, opened once when this cell runs and
# reused by every save_chunk() call.
# NOTE(review): neither is closed anywhere in the cells visible here.
conn = psycopg2.connect(DB_CONN)
cur = conn.cursor()

def save_chunk(doc_id, chunk_id, text, metadata=None):
    """Embed one chunk and insert it into the documents_ruBert table.

    Uses the module-level ``cur``/``conn`` and commits after each row.

    Args:
        doc_id: identifier of the source document.
        chunk_id: index of the chunk within its document.
        text: chunk content; stored as-is and passed to embed().
        metadata: optional dict stored as JSON; defaults to an empty dict.

    Raises:
        Exception: whatever the insert or commit raises is re-raised after the
            transaction has been rolled back.
    """
    # None instead of a shared mutable ``{}`` default.
    if metadata is None:
        metadata = {}

    emb = embed(text)
    query = """
        INSERT INTO documents_ruBert (doc_id, chunk_id, content, embedding, metadata) VALUES (%s, %s, %s, %s, %s)
        """
    params = (doc_id, chunk_id, text, emb.tolist(), json.dumps(metadata))

    try:
        cur.execute(query, params)
        conn.commit()
    except Exception:
        # Without a rollback the shared connection stays in a failed
        # transaction and every later statement on it is rejected.
        conn.rollback()
        raise

In [5]:
# Let's go!
import os

def load_docs(folder_path):
    """Load, chunk and store every supported document found in a folder.

    Each entry of ``folder_path`` is dispatched on its lower-cased extension
    (.pdf, .docx, .doc). Entries with any other extension are reported and
    skipped. The extracted text is split with chunk_text() and every chunk is
    written with save_chunk(), using the file name as ``doc_id`` and the chunk
    index as ``chunk_id``. A summary line is printed per processed file.

    Args:
        folder_path: directory whose direct entries are processed.
    """
    # Extension -> text extractor.
    extractors = {
        ".pdf": load_pdf,
        ".docx": load_docx,
        ".doc": load_doc,
    }

    for filename in os.listdir(folder_path):
        path = os.path.join(folder_path, filename)
        _, raw_ext = os.path.splitext(filename)
        extractor = extractors.get(raw_ext.lower())

        if extractor is None:
            print(f"Формат файла {filename} не поддерживается.")
            continue

        chunks = chunk_text(extractor(path))

        for chunk_index, chunk in enumerate(chunks):
            save_chunk(
                doc_id=filename,
                chunk_id=chunk_index,
                text=chunk,
                metadata={"path": path},
            )

        print(f"{filename}: {len(chunks)} чанков сохранено")

In [None]:
# Ingest every sub-folder of FOLDER_PATH.
for folder in os.listdir(FOLDER_PATH):
    main_path = os.path.join(FOLDER_PATH, folder)
    # load_docs() calls os.listdir() on its argument, which raises for a
    # regular file; skip such entries instead of aborting the whole run.
    if not os.path.isdir(main_path):
        print(f"Пропущено (не каталог): {folder}")
        continue
    load_docs(main_path)

    