In [2]:
# Config
# Every setting can be overridden through an environment variable, so credentials
# and machine-specific paths do not have to be edited inside the notebook.
# The fallbacks are the values this notebook originally hard-coded.
import os

# libpq-style connection string handed to psycopg2.connect().
# NOTE(review): the fallback still embeds a password — set RAG_DB_CONN and drop
# the literal before sharing or committing this notebook.
DB_CONN = os.environ.get(
    "RAG_DB_CONN",
    "dbname=appdb user=appuser password=secret port=5432 host=rag-data",
)

# Local snapshot directory of the embedding model (tokenizer and weights).
EMB_MODEL_PATH = os.environ.get(
    "RAG_EMB_MODEL_PATH",
    "/wrk/models/embedding_models/models--intfloat--multilingual-e5-large-instruct/snapshots/274baa43b0e13e37fafa6428dbc7938e62e5c439",
)

# GGUF file loaded by llama_cpp.Llama.
LLM_MODEL_PATH = os.environ.get(
    "RAG_LLM_MODEL_PATH",
    "/wrk/models/llms/models--RefalMachine--RuadaptQwen2.5-7B-Lite-Beta-GGUF/snapshots/68ae9dff37a839f3441b9383519cffc4f7d829dd/FP16.gguf",
)

# Default number of rows fetched per retrieval query.
TOP_K = int(os.environ.get("RAG_TOP_K", "7"))

In [36]:
# Debug probe: prints whether the `Llama` class has an attribute named "GPU".
# NOTE(review): this cell's execution count (36) is higher than the cells that
# follow it (31-34), so it was run out of order. Presumably a leftover check for
# GPU support — confirm it is still needed, otherwise delete the cell.
from llama_cpp import Llama
print(hasattr(Llama, "GPU"))

False


In [31]:
# Loading models
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from llama_cpp import Llama

# Release unused memory held by PyTorch's CUDA caching allocator before loading.
torch.cuda.empty_cache()

# Use the GPU for the embedding model when PyTorch can see one, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Embedding model: tokenizer plus encoder, moved to `device` and put in eval mode.
emb_tokenizer = AutoTokenizer.from_pretrained(EMB_MODEL_PATH)
emb_model = AutoModel.from_pretrained(EMB_MODEL_PATH).to(device).eval()

# Generator LLM loaded from the GGUF file through llama.cpp.
# NOTE(review): only `main_gpu` is passed; in llama-cpp-python GPU offload is
# controlled by `n_gpu_layers`, which is presumably still at its default here —
# confirm whether the model is actually meant to run on the GPU.
# NOTE(review): the context window is 3000 tokens, while `ask_llm` asks for
# max_tokens=3500 by default.
llm_model = Llama(model_path=LLM_MODEL_PATH, main_gpu=0, n_ctx=3000)

llama_model_loader: loaded meta data with 26 key-value pairs and 339 tensors from /wrk/models/llms/models--RefalMachine--RuadaptQwen2.5-7B-Lite-Beta-GGUF/snapshots/68ae9dff37a839f3441b9383519cffc4f7d829dd/FP16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = RuadaptQwen2.5 7B Lite Beta
llama_model_loader: - kv   3:                           general.finetune str              = Lite-Beta
llama_model_loader: - kv   4:                           general.basename str              = RuadaptQwen2.5
llama_model_loader: - kv   5:                         general.size_label str              = 7B
llama_model_loader: - kv   6:           

In [32]:
# Functions for creating embeddings
import torch.nn.functional as F

# Token limit passed to the embedding tokenizer in `embed`; longer inputs are truncated.
MAX_LENGTH = 512

# Masked mean pooling: positions with attention_mask == 0 do not contribute.
def average_pool(last_hidden_states, attention_mask):
    """Average hidden states along dim 1, weighting each position by its mask value.

    Args:
        last_hidden_states: tensor of hidden states; assumed to be
            (batch, seq_len, hidden) — the reduction runs over dim 1.
        attention_mask: tensor with one fewer trailing dimension than
            `last_hidden_states`; it is broadcast over the last dimension.

    Returns:
        Tensor with dim 1 reduced away. The divisor is clamped to at least
        1e-9, so a fully masked row yields zeros instead of NaN.
    """
    weights = attention_mask[..., None].expand(last_hidden_states.size()).float()
    weighted_total = (last_hidden_states * weights).sum(dim=1)
    token_counts = torch.clamp(weights.sum(dim=1), min=1e-9)
    return weighted_total / token_counts

# Text -> embedding vector
def embed(text: str):
    """Encode `text` with the embedding model and return one L2-normalized vector.

    The text is tokenized with truncation at MAX_LENGTH tokens, run through
    `emb_model` without gradient tracking, mean-pooled over the attention mask
    with `average_pool`, and normalized to unit L2 norm.

    Args:
        text: the string to embed.

    Returns:
        NumPy array holding the first (only) row of the pooled batch, on CPU.
    """
    encoded = emb_tokenizer(
        text,
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        hidden_states = emb_model(**encoded).last_hidden_state
        pooled = average_pool(hidden_states, encoded['attention_mask'])
        unit_vectors = F.normalize(pooled, p=2, dim=1)

    # The batch holds a single text, so take row 0 and hand it back as NumPy.
    return unit_vectors[0].cpu().numpy()

In [33]:
# Searching relevant documents
import psycopg2
import json

# Open the database connection described by DB_CONN and a cursor on it; both are
# kept at module level so later cells can reuse them.
conn = psycopg2.connect(dsn=DB_CONN)
cur = conn.cursor()

def search_context(query, top_k=TOP_K):
    """Return the `content` of the `top_k` stored rows closest to `query`.

    The query is embedded with `embed`, serialized as a JSON list and compared
    against the `embedding` column of `documents_e5` with the `<->` operator
    (presumably pgvector's distance operator — confirm against the schema).

    Args:
        query: text to search for.
        top_k: maximum number of rows to return (SQL LIMIT).

    Returns:
        List of `content` values, closest first.

    Raises:
        psycopg2.Error: re-raised after the connection has been rolled back.
    """
    query_vector = json.dumps(embed(query).tolist())
    sql = """
        SELECT content, metadata FROM documents_e5 ORDER BY embedding <-> %s LIMIT %s
        """
    try:
        # A cursor per call: a failure here cannot leave a shared cursor in a bad state.
        with conn.cursor() as query_cur:
            query_cur.execute(sql, (query_vector, top_k))
            rows = query_cur.fetchall()
    except psycopg2.Error:
        # A failed statement aborts the connection's open transaction; without a
        # rollback every later query on `conn` would fail until the kernel restarts.
        conn.rollback()
        raise
    # Only the text is returned; the selected `metadata` column is not used here.
    return [row[0] for row in rows]

In [None]:
# Smoke test of retrieval alone. The bare call is the cell's last expression, so
# the returned list of chunk texts is shown as the cell output.
# NOTE(review): "Вопрос" ("Question") looks like a placeholder query.
search_context("Вопрос")

In [34]:
# Asking LLM
def ask_llm(question, context, max_tokens=3500, temperature=0.7, top_p=0.9):
    """Ask the LLM to answer `question` using `context` and return the stripped text.

    The completion budget is clamped to the room left in the model's context
    window after the prompt. Previously max_tokens was always 3500, which is
    more than the 3000-token window the model is loaded with in this notebook.

    Args:
        question: the user's question, inserted into the prompt verbatim.
        context: retrieved text, inserted into the prompt verbatim.
        max_tokens: upper bound on generated tokens (before clamping).
        temperature: sampling temperature passed to the model.
        top_p: nucleus-sampling threshold passed to the model.

    Returns:
        Text of the first completion choice, with surrounding whitespace removed.

    Raises:
        ValueError: if the prompt alone fills or exceeds the context window.
    """
    # Same text as before, written line by line so the whitespace-only lines
    # of the template stay explicit.
    prompt = (
        "Ты - умный ассистент, помогающий сотрудникам ответить на вопросы. Используй приведённый контекст для ответа на вопросы.\n"
        "    \n"
        "    \n"
        "    Контекст:\n"
        f"    {context}\n"
        "    \n"
        "    \n"
        f"    Вопрос: {question}\n"
        "    Ответ:"
    )

    window = llm_model.n_ctx()
    prompt_len = len(llm_model.tokenize(prompt.encode("utf-8")))
    room = window - prompt_len
    if room <= 0:
        raise ValueError(
            f"Prompt needs {prompt_len} tokens but the context window holds only "
            f"{window}; reduce the retrieved context (e.g. a smaller top_k)."
        )

    completion = llm_model(
        prompt,
        max_tokens=min(max_tokens, room),
        temperature=temperature,
        top_p=top_p,
    )
    return completion['choices'][0]['text'].strip()

def answer_question(question: str):
    """Answer `question` with retrieval-augmented generation.

    The chunks returned by `search_context` are joined with blank lines and
    passed to `ask_llm` as the context, together with the question.

    Returns:
        Whatever `ask_llm` returns for that question and context.
    """
    retrieved = search_context(question)
    return ask_llm(question, "\n\n".join(retrieved))

In [None]:
# End-to-end demo: retrieve context for the question, ask the LLM, print the answer.
# NOTE(review): "Вопрос" ("Question") looks like a placeholder query.
q = "Вопрос"
print(answer_question(q))