## RAG система



In [None]:
!pip install pandas faiss-cpu nltk pymorphy3 sentence-transformers transformers flask pyngrok

In [None]:
import os
import glob
import re
import difflib

import pandas as pd
import numpy as np
import faiss

import nltk
from nltk.corpus import stopwords
import pymorphy3

from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Make sure the NLTK Russian stop-word list is available; download it on
# first use.
try:
    _ = stopwords.words('russian')
except LookupError:
    nltk.download('stopwords')

# --- Load the knowledge base ---
DATA_DIR = '.../data'  # set this to the path of your data folder
PREFERRED = 'lx.xlsx'

xlsx_path = None
preferred_path = os.path.join(DATA_DIR, PREFERRED)
if os.path.isfile(preferred_path):
    xlsx_path = preferred_path
else:
    # Fall back to the alphabetically first workbook found in DATA_DIR.
    cands = sorted(glob.glob(os.path.join(DATA_DIR, '*.xlsx')))
    if cands:
        xlsx_path = cands[0]

# Explicit raises instead of `assert`: assertions are removed under
# `python -O`, which would let a missing file or column slip through.
if xlsx_path is None:
    raise FileNotFoundError('Не найден ни один .xlsx в data/')

print('Используем файл:', xlsx_path)

df = pd.read_excel(xlsx_path)
if not {'question', 'content', 'category'}.issubset(df.columns):
    raise ValueError('Ожидаются столбцы question, content, category')

# Missing cells get a default first, then everything is coerced to str so
# that numeric or date cells cannot break later string operations.
questions = df['question'].fillna('').astype(str).tolist()
contents = df['content'].fillna('').astype(str).tolist()
categories = df['category'].fillna('прочее').astype(str).tolist()

# Short names (abbreviations) and their expansions.
# NOTE(review): 'ТК' is expanded to 'трудовой договор'; it commonly stands
# for 'Трудовой кодекс' -- confirm the intended expansion.
abbreviations = {
    'лк': 'личный кабинет',
    'БиР': 'Беременность и роды',
    'зп': 'заработная плата',
    'НДФЛ': 'Налог на доходы физических лиц',
    'СТД': 'срочный трудовой договор',
    'ТК': 'трудовой договор',
    'АО': 'авансовый отчет',
    'SLA': 'сроки',
    'ЭЦП': 'электронная цифровая подпись',
    'КР': 'кадровый резерв',
}

# Resources used by the text normalization below.
stop_words = set(stopwords.words('russian'))
morph = pymorphy3.MorphAnalyzer()


def collect_vocabulary(texts):
    """Return the set of lemmas of every word occurring in *texts*.

    Each item is converted to ``str`` and lower-cased, split into word
    tokens, and every token is replaced by the normal form of its first
    morphological parse.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return {
        morph.parse(token)[0].normal_form
        for text in texts
        for token in word_pattern.findall(str(text).lower())
    }


# Lemma vocabulary built from both the questions and the answer texts.
vocabulary = collect_vocabulary(questions + contents)

# предобработка текста 

def preprocess_text(text: str) -> str:
    """Normalize *text* into a space-separated string of unique lemmas.

    Steps: lower-case, expand known abbreviations, replace punctuation with
    spaces, collapse whitespace, lemmatize every word, fuzzy-correct words
    that are unknown to the vocabulary, and drop stop words.

    Args:
        text: Raw text; non-string values are converted with ``str``.

    Returns:
        The unique lemmas joined by single spaces, in order of first
        appearance. An empty string if no words remain.
    """
    text = str(text).lower()
    for abbr, desc in abbreviations.items():
        # The text is already lower-cased while the keys are mixed-case,
        # hence the case-insensitive match.
        text = re.sub(r'\b' + re.escape(abbr) + r'\b', desc, text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    lemmas = []
    for word in text.split():
        normal = morph.parse(word)[0].normal_form
        # `vocabulary` stores lemmas, so an inflected word must be looked up
        # by its lemma too; only genuinely unknown words (likely typos) pay
        # for the fuzzy scan over the whole vocabulary.
        if word not in vocabulary and normal not in vocabulary:
            match = difflib.get_close_matches(word, vocabulary, n=1, cutoff=0.8)
            if match:
                normal = morph.parse(match[0])[0].normal_form
        if normal not in stop_words:
            lemmas.append(normal)
    # dict.fromkeys de-duplicates while keeping first-seen order. Joining a
    # set would produce a word order that changes between interpreter runs
    # (string hash randomization) and therefore non-reproducible embeddings.
    return ' '.join(dict.fromkeys(lemmas))

# Normalize every stored question once; these strings are what gets embedded.
processed_questions = [preprocess_text(q) for q in questions]

# --- Embedding model and vector index ---
embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = embedder.encode(processed_questions, convert_to_numpy=True)

# Flat L2 index over the question embeddings.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# --- Generator ---
# The checkpoint is GPT-2 based (a causal language model), so it is loaded
# through the 'text-generation' task rather than the seq2seq
# 'text2text-generation' task. Task and model id match the API cell.
generator = pipeline('text-generation', model='ai-forever/rugpt3small_based_on_gpt2')

print('Готово: загружены данные, построены эмбеддинги и индекс.')


In [None]:
from typing import Optional, List

def rag(query: str, top_k: int = 2, category_filter: Optional[str] = None, distance_threshold: float = 1.0) -> str:
    """Answer *query* from the knowledge base: retrieve context, then generate.

    Args:
        query: Raw user question.
        top_k: Maximum number of knowledge-base entries used as context.
        category_filter: When set to anything other than 'Все категории',
            only entries whose category equals this value are searched.
        distance_threshold: Largest L2 distance (as reported by FAISS) at
            which a retrieved entry is still accepted.

    Returns:
        The generated answer text, or a fixed hand-off message when the
        category has no data or no entry is close enough.
    """
    not_found = 'Извините, я не нашел подходящий ответ. Соединяю с оператором.'

    # Preliminary normalization; a query with no meaningful words (or a
    # non-positive top_k) cannot be searched.
    processed_query = preprocess_text(query)
    if not processed_query or top_k < 1:
        return not_found

    # Category filter.
    if category_filter and category_filter != 'Все категории':
        filtered_indices = [i for i, c in enumerate(categories) if c == category_filter]
        if not filtered_indices:
            return 'Нет данных для указанной категории. Соединяю с оператором.'
        filtered_contents = [contents[i] for i in filtered_indices]
        # Temporary index restricted to the selected category.
        search_index = faiss.IndexFlatL2(embeddings.shape[1])
        search_index.add(embeddings[filtered_indices])
    else:
        # No filter: reuse the prebuilt global index instead of rebuilding
        # an identical one on every call.
        filtered_contents = contents
        search_index = index

    k = min(top_k, search_index.ntotal)
    if k == 0:
        return not_found

    # Search.
    q_emb = embedder.encode([processed_query], convert_to_numpy=True)
    distances, idxs = search_index.search(q_emb, k)

    # Apply the threshold to every hit, not only the closest one, so distant
    # entries do not pollute the context. FAISS pads missing results with -1.
    retrieved = [
        str(filtered_contents[i])
        for dist, i in zip(distances[0], idxs[0])
        if i >= 0 and dist <= distance_threshold
    ]
    if not retrieved:
        return not_found

    context = ' '.join(retrieved)
    prompt = f"Контекст: {context}\nВопрос: {query}\nОтвет:"

    out = generator(prompt, max_new_tokens=150, num_return_sequences=1)
    answer = out[0]['generated_text']
    # A causal-LM pipeline echoes the prompt in front of the continuation;
    # return only the newly generated part.
    if answer.startswith(prompt):
        answer = answer[len(prompt):]
    return answer.strip()

print('Функция rag() готова.')


In [None]:
# Example queries: read questions from stdin and print the answer for each;
# entering 'q' ends the loop.
while True:
    query = input()
    if query == 'q':
        break
    print(f'{query}: {rag(query)}')


___________________

API

In [None]:
import os
import re
import difflib
import pandas as pd
import faiss
import nltk
from nltk.corpus import stopwords
import pymorphy3
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from flask import Flask, request, jsonify
from pyngrok import ngrok

In [None]:
# --- ngrok setup ---
# Prefer the NGROK_AUTH_TOKEN environment variable so the secret does not
# have to be stored in the notebook; the literal stays as the fallback.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "Token")  # https://ngrok.com/
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# --- NLP preparation ---
# Make sure the NLTK Russian stop-word list is available.
try:
    _ = stopwords.words('russian')
except LookupError:
    nltk.download('stopwords')

DATA_DIR = '.../data'  # set this to the path of your data folder
PREFERRED = 'lx.xlsx'

xlsx_path = os.path.join(DATA_DIR, PREFERRED)
if not os.path.isfile(xlsx_path):
    # FileNotFoundError is a subclass of Exception, so existing handlers
    # still catch it.
    raise FileNotFoundError("Файл не найден")

df = pd.read_excel(xlsx_path)
# Fail early with a clear message instead of a KeyError further down.
if not {'question', 'content', 'category'}.issubset(df.columns):
    raise ValueError('Ожидаются столбцы question, content, category')

# Missing cells get a default first, then everything is coerced to str so
# that numeric or date cells cannot break later string operations.
questions = df['question'].fillna('').astype(str).tolist()
contents = df['content'].fillna('').astype(str).tolist()
categories = df['category'].fillna('прочее').astype(str).tolist()

# Abbreviations and their expansions.
# NOTE(review): 'ТК' is expanded to 'трудовой договор'; it commonly stands
# for 'Трудовой кодекс' -- confirm the intended expansion.
abbreviations = {
    'лк': 'личный кабинет',
    'БиР': 'Беременность и роды',
    'зп': 'заработная плата',
    'НДФЛ': 'Налог на доходы физических лиц',
    'СТД': 'срочный трудовой договор',
    'ТК': 'трудовой договор',
    'АО': 'авансовый отчет',
    'SLA': 'сроки',
    'ЭЦП': 'электронная цифровая подпись',
    'КР': 'кадровый резерв',
}

stop_words = set(stopwords.words('russian'))
morph = pymorphy3.MorphAnalyzer()

def collect_vocabulary(texts):
    """Return the set of lemmas of every word occurring in *texts*.

    Each item is converted to ``str`` and lower-cased, split into word
    tokens, and every token is replaced by the normal form of its first
    morphological parse.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return {
        morph.parse(token)[0].normal_form
        for text in texts
        for token in word_pattern.findall(str(text).lower())
    }


# Lemma vocabulary built from both the questions and the answer texts.
vocabulary = collect_vocabulary(questions + contents)

def preprocess_text(text: str) -> str:
    """Normalize *text* into a space-separated string of unique lemmas.

    Steps: lower-case, expand known abbreviations, replace punctuation with
    spaces, collapse whitespace, lemmatize every word, fuzzy-correct words
    that are unknown to the vocabulary, and drop stop words.

    Args:
        text: Raw text; non-string values are converted with ``str``.

    Returns:
        The unique lemmas joined by single spaces, in order of first
        appearance. An empty string if no words remain.
    """
    text = str(text).lower()
    for abbr, desc in abbreviations.items():
        # The text is already lower-cased while the keys are mixed-case,
        # hence the case-insensitive match.
        text = re.sub(r'\b' + re.escape(abbr) + r'\b', desc, text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    lemmas = []
    for word in text.split():
        normal = morph.parse(word)[0].normal_form
        # `vocabulary` stores lemmas, so an inflected word must be looked up
        # by its lemma too; only genuinely unknown words (likely typos) pay
        # for the fuzzy scan over the whole vocabulary.
        if word not in vocabulary and normal not in vocabulary:
            match = difflib.get_close_matches(word, vocabulary, n=1, cutoff=0.8)
            if match:
                normal = morph.parse(match[0])[0].normal_form
        if normal not in stop_words:
            lemmas.append(normal)
    # dict.fromkeys de-duplicates while keeping first-seen order. Joining a
    # set would produce a word order that changes between interpreter runs
    # (string hash randomization) and therefore non-reproducible embeddings.
    return ' '.join(dict.fromkeys(lemmas))

# Normalized form of every stored question; these strings get embedded.
processed_questions = list(map(preprocess_text, questions))

# --- FAISS index ---
# Embed the normalized questions and store the vectors in a flat L2 index.
embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = embedder.encode(processed_questions, convert_to_numpy=True)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# --- GPT generator ---
# The model is loaded through the "text-generation" pipeline task; judging
# by its name it is a GPT-2 based model.
generator = pipeline(task="text-generation", model="ai-forever/rugpt3small_based_on_gpt2")

print('RAG готов к работе!')

# --- Flask API ---
app = Flask(import_name=__name__)

@app.route('/ask', methods=['POST'])
def ask():
    """Handle ``POST /ask``: look up the closest stored question and answer it.

    Expects a JSON object with a ``question`` field. Responds with
    ``{'answer': ...}``: the generator's output seeded with the matched
    content, or the operator hand-off message when no stored question is
    close enough. A body that is not a JSON object yields HTTP 400 with an
    ``error`` field.
    """
    operator_message = "Перевожу на оператора"
    threshold = 0.5  # largest accepted L2 distance reported by FAISS

    # silent=True returns None instead of raising on a missing or invalid
    # JSON body, so the error response can be produced here.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Expected a JSON object with a "question" field'}), 400
    question = data.get('question') or ''

    # Find the nearest stored question.
    proc_q = preprocess_text(question)
    if not proc_q:
        # Nothing meaningful to search for.
        return jsonify({'answer': operator_message})

    q_emb = embedder.encode([proc_q], convert_to_numpy=True)
    D, I = index.search(q_emb, k=1)
    best_idx = int(I[0][0])

    # Similarity threshold check. The hand-off message is returned as is;
    # it must not be fed to the generator. FAISS reports -1 when the index
    # holds no vectors.
    if best_idx < 0 or D[0][0] > threshold:
        return jsonify({'answer': operator_message})

    answer = str(contents[best_idx])

    # Answer generation, seeded with the retrieved content.
    # To return the retrieved text without generation, use
    # `return jsonify({'answer': answer})` here instead.
    gpt_answer = generator(answer, max_new_tokens=200)[0]['generated_text']
    return jsonify({'answer': gpt_answer})

# --- Start ngrok ---
# Open a tunnel to local port 5000 and print its public URL
# (per the message, presumably consumed by a PyQt client).
public_url = ngrok.connect(5000)
print("Публичный URL для PyQt:", public_url)

# --- Start Flask ---
# Serve the app on the same local port the tunnel points to.
app.run(port=5000)