<a href="https://colab.research.google.com/github/DANIELVSHVL/ITMO_EXAM/blob/main/QA_BOT_ITMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # Установим необходимые библиотеки и зависимости

In [5]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [6]:
%pip install python-telegram-bot



In [7]:
!pip install -q playwright nest_asyncio
!playwright install --with-deps # Install browsers and dependencies
import nest_asyncio
nest_asyncio.apply()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling dependencies...
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,929 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main a

# ПАРСИНГ

In [8]:
import re
import time
import json
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlsplit, urlunsplit
from pathlib import Path

# --- Сохранение на Google Drive ---
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    BASE_DIR = Path("/content/drive/MyDrive/MyDrive_ITMO/BOT")
except Exception:
    # если не Colab — падаем на локальный каталог
    BASE_DIR = Path("data")

PDF_DIR = BASE_DIR / "files"
PDF_DIR.mkdir(parents=True, exist_ok=True)

FILE_EXTS = (".pdf", ".doc", ".docx", ".xls", ".xlsx", ".csv")

# --- HTTP ---
CONNECT_TIMEOUT, READ_TIMEOUT = 10, 20
HEADERS = {"User-Agent": "Mozilla/5.0"}

def get_html(url):
    try:
        r = requests.get(url, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT), headers=HEADERS)
        r.raise_for_status()
        if not r.encoding:
            r.encoding = r.apparent_encoding or "utf-8"
        return r.text
    except Exception as e:
        print(f"[ERR] {url}: {e}")
        return ""

def download(url, path):
    try:
        r = requests.get(url, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT), headers=HEADERS)
        r.raise_for_status()
        with open(path, "wb") as f:
            f.write(r.content)
        return True
    except Exception as e:
        print(f"[DL-ERR] {url}: {e}")
        return False

# --- Нормализация URL (убираем ?query и #fragment, чтобы меньше дублей) ---
def normalize_url(base, href):
    absu = urljoin(base, (href or "").strip())
    parts = list(urlsplit(absu))
    parts[3] = ""  # query
    parts[4] = ""  # fragment
    return urlunsplit(parts)

# --- Текст из PDF ---
def pdf_to_lines(path: Path) -> list[str]:
    try:
        import fitz
    except ImportError:
        return []
    lines = []
    try:
        doc = fitz.open(path)
        for i, page in enumerate(doc, 1):
            text = page.get_text()
            for ln in text.splitlines():
                t = ln.strip()
                if 3 < len(t) < 240:
                    lines.append(f"[p.{i}] {t}")
    except Exception:
        return []
    return lines

# --- Текст из DOCX ---
def docx_to_lines(path: Path) -> list[str]:
    try:
        from docx import Document
    except ImportError:
        return []
    lines = []
    try:
        doc = Document(path)
        for p in doc.paragraphs:
            t = p.text.strip()
            if 3 < len(t) < 240:
                lines.append(t)
        for tbl in doc.tables:
            for row in tbl.rows:
                cells = [c.text.strip() for c in row.cells]
                if any(cells):
                    s = " | ".join(cells)
                    if 3 < len(s) < 240:
                        lines.append(s)
    except Exception:
        return []
    return lines

# --- Контакты (mailto/tel + из текста) ---
MAIL_RE  = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(r"\+?\d[\d\-\s()]{8,}\d")

def extract_contacts_from_soup(soup: BeautifulSoup, page_url: str):
    contacts = []
    # mailto / tel
    for a in soup.select('a[href^="mailto:"], a[href^="tel:"]'):
        href = a.get("href", "")
        label = a.get_text(" ", strip=True)
        if href.startswith("mailto:"):
            contacts.append({"type":"email","value":href.split(":",1)[1],"label":label,"url":page_url})
        elif href.startswith("tel:"):
            contacts.append({"type":"phone","value":href.split(":",1)[1],"label":label,"url":page_url})
    # из текста (на случай, если нет href)
    txt = soup.get_text(" ", strip=True)
    for m in MAIL_RE.finditer(txt):
        contacts.append({"type":"email","value":m.group(0),"label":"","url":page_url})
    for m in PHONE_RE.finditer(txt):
        raw = m.group(0)
        if len(re.sub(r"\D","",raw)) >= 9:
            contacts.append({"type":"phone","value":raw,"label":"","url":page_url})
    # дедуп по (type,value,url)
    uniq = {}
    for c in contacts:
        uniq[(c["type"], c["value"], c["url"])] = c
    return list(uniq.values())

# --- Сбор файлов с одной страницы + HTML текст ---
def find_resources_and_html(url, visited_files):
    soup = BeautifulSoup(get_html(url), "html.parser")
    html_text = []

    # Собираем текст со страницы (чуть шире: заголовки тоже полезны)
    for tag in soup.find_all(["h1", "h2", "h3", "p", "li", "div", "span", "td", "th"]):
        t = tag.get_text(" ", strip=True)
        if 3 < len(t) < 240:
            html_text.append(t)

    found_files = []
    for a in soup.find_all("a", href=True):
        href = normalize_url(url, a["href"].strip())
        if not any(href.lower().endswith(ext) for ext in FILE_EXTS):
            continue
        if href not in visited_files:
            visited_files.add(href)
            found_files.append({"url": href, "text": a.get_text(" ", strip=True) or ""})

    # Контакты
    contacts = extract_contacts_from_soup(soup, url)

    # Ссылки для обхода (как было, но нормализуем)
    links = [
        (a.get_text(" ", strip=True) or "", normalize_url(url, a["href"].strip()))
        for a in soup.find_all("a", href=True)
    ]

    return found_files, html_text, links, contacts

# --- Рекурсивный сбор до глубины ---
def crawl(url, depth=1):
    visited_files = set()
    seen_pages = set()
    all_files = []
    all_html_text = []
    all_contacts = []
    to_visit = [(url, 0)]

    while to_visit:
        cur_url, d = to_visit.pop(0)
        if cur_url in seen_pages:
            continue
        seen_pages.add(cur_url)

        files, html_text, links_html, contacts = find_resources_and_html(cur_url, visited_files)
        all_files.extend(files)
        all_html_text.extend(html_text)
        all_contacts.extend(contacts)

        if d < depth:
            for _, href in links_html:
                if urlparse(href).netloc == urlparse(url).netloc and href not in seen_pages:
                    if not any(href.lower().endswith(ext) for ext in FILE_EXTS):
                        to_visit.append((href, d + 1))
        time.sleep(0.5)
    return all_files, all_html_text, all_contacts

# --- Скачка и извлечение текста ---
def grab_all_files(files, prefix):
    texts = []
    saved = []
    for i, f in enumerate(files, 1):
        ext = Path(urlparse(f["url"]).path).suffix.lower()
        path = PDF_DIR / f"{prefix}_{i}{ext or '.bin'}"
        if download(f["url"], path):
            saved.append({"file": path.name, "url": f["url"]})
            if ext == ".pdf":
                texts.extend(pdf_to_lines(path))
            elif ext == ".docx":
                texts.extend(docx_to_lines(path))
    return texts, saved

# --- Главная функция ---
def parse_program(url):
    all_files, html_texts, contacts = crawl(url, depth=2)
    file_texts, saved_files = grab_all_files(all_files, "file")
    result = {
        "HTML_TEXT": html_texts,
        "FILE_TEXT": file_texts,
        "FILES": saved_files,
        "CONTACTS": contacts
    }
    return result

# --- Хелперы сохранения ---
def save_json(obj, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(obj, ensure_ascii=False, indent=2))

def save_contacts_csv(contacts, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f, delimiter=";")
        w.writerow(["type","value","label","url"])
        for c in contacts:
            w.writerow([c.get("type",""), c.get("value",""), c.get("label",""), c.get("url","")])

# --- Пример ---
if __name__ == "__main__":
    urls = [
        "https://abit.itmo.ru/program/master/ai",
        "https://abit.itmo.ru/program/master/ai_product"
    ]

    all_contacts = []
    for u in urls:
        data = parse_program(u)

        print(f"=== {u} ===")
        print(f"HTML_TEXT lines: {len(data['HTML_TEXT'])}")
        print(f"FILE_TEXT lines: {len(data['FILE_TEXT'])}")
        print(f"FILES: {len(data['FILES'])}")
        print(f"CONTACTS: {len(data['CONTACTS'])}")

        # сохраняем JSON по каждой программе
        fname = urlparse(u).path.strip("/").replace("/", "_") + ".json"
        out_json = BASE_DIR / fname
        save_json(data, out_json)
        print("Saved JSON:", out_json)

        all_contacts.extend(data["CONTACTS"])

    # общий CSV контактов
    contacts_csv = BASE_DIR / "contacts.csv"
    save_contacts_csv(all_contacts, contacts_csv)
    print("Contacts CSV:", contacts_csv)

    print("Files saved to:", PDF_DIR.resolve())


=== https://abit.itmo.ru/program/master/ai ===
HTML_TEXT lines: 3103
FILE_TEXT lines: 0
FILES: 12
CONTACTS: 134
Saved JSON: data/program_master_ai.json
=== https://abit.itmo.ru/program/master/ai_product ===
HTML_TEXT lines: 2180
FILE_TEXT lines: 0
FILES: 8
CONTACTS: 114
Saved JSON: data/program_master_ai_product.json
Contacts CSV: data/contacts.csv
Files saved to: /content/data/files


# Сохраняем в JSON

In [9]:
# --- Сбор корпуса по программам ---
def build_corpus(program_name, parsed_data):
    """
    Объединяет HTML-текст и текст из файлов в единый корпус для программы.
    """
    corpus = []

    # HTML-текст
    for line in parsed_data.get("HTML_TEXT", []):
        corpus.append(line)

    # Текст из файлов (если был извлечён)
    for line in parsed_data.get("FILE_TEXT", []):
        corpus.append(line)

    return {
        "program": program_name,
        "text": corpus,
        "files": parsed_data.get("FILES", [])
    }

# --- Пример: используем данные после парсинга ---
programs_data = {}

# Парсинг двух программ
data_ai = parse_program("https://abit.itmo.ru/program/master/ai")
programs_data["ai"] = build_corpus("Artificial Intelligence", data_ai)

data_ai_prod = parse_program("https://abit.itmo.ru/program/master/ai_product")
programs_data["ai_product"] = build_corpus("AI Product Management", data_ai_prod)

# Теперь programs_data содержит корпуса по обеим программам
print(f"Корпус AI: {len(programs_data['ai']['text'])} строк")
print(f"Корпус AI Product: {len(programs_data['ai_product']['text'])} строк")

# --- Сохраняем в JSON ---
import json
with open("programs_corpus.json", "w", encoding="utf-8") as f:
    json.dump(programs_data, f, ensure_ascii=False, indent=2)

print("Сохранено в programs_corpus.json")

Корпус AI: 3103 строк
Корпус AI Product: 2180 строк
Сохранено в programs_corpus.json


In [None]:
# === Склейка готовых дампов в один programs_corpus.json (без перепарсинга) ===
# Читает:  /content/drive/MyDrive/MyDrive_ITMO/BOT/program_master_ai.json
#          /content/drive/MyDrive/MyDrive_ITMO/BOT/program_master_ai_product.json
# Пишет:   ./programs_corpus.json  и копию в /content/drive/MyDrive/MyDrive_ITMO/BOT/programs_corpus.json

import json
from pathlib import Path

DRIVE_DIR = Path("/content/drive/MyDrive/MyDrive_ITMO/BOT")
SRC_AI = DRIVE_DIR / "program_master_ai.json"
SRC_PM = DRIVE_DIR / "program_master_ai_product.json"
OUT_LOCAL = Path("programs_corpus.json")
OUT_DRIVE = DRIVE_DIR / "programs_corpus.json"

def _load_json(p: Path):
    assert p.exists(), f"Нет файла: {p}"
    return json.loads(p.read_text(encoding="utf-8"))

def _dedup_keep_order(seq):
    seen = set(); out = []
    for x in seq or []:
        if isinstance(x, str):
            s = x.strip()
            if s and s not in seen:
                seen.add(s); out.append(s)
    return out

def _entry(program_name: str, src: dict):
    return {
        "program": program_name,
        "file_text": _dedup_keep_order(src.get("FILE_TEXT", [])),
        "html_text": _dedup_keep_order(src.get("HTML_TEXT", [])),
        "files": src.get("FILES", []),
        "contacts": src.get("CONTACTS", []),
    }

ai       = _load_json(SRC_AI)
ai_prod  = _load_json(SRC_PM)

corpus = {
    "ai": _entry("Artificial Intelligence", ai),
    "ai_product": _entry("AI Product Management", ai_prod),
}

OUT_LOCAL.write_text(json.dumps(corpus, ensure_ascii=False, indent=2), encoding="utf-8")
print("Saved local:", OUT_LOCAL.resolve())

OUT_DRIVE.write_text(json.dumps(corpus, ensure_ascii=False, indent=2), encoding="utf-8")
print("Saved drive:", OUT_DRIVE)

# Краткий отчёт
print("\n[report]")
for k in ("ai", "ai_product"):
    e = corpus[k]
    print(f"{k}: file_text={len(e['file_text'])} | html_text={len(e['html_text'])} | files={len(e['files'])} | contacts={len(e['contacts'])}")


# Быстрый индекс с кешем

In [None]:
!pip -q install langchain langchain-community langchain-huggingface faiss-cpu
!pip -q install langchain langchain-community langchain-huggingface faiss-cpu python-telegram-bot requests nest_asyncio


In [None]:
# === fast_index_cache_build.py (Drive-persistent + fixed cache check) ===
import os, json, time, torch, warnings
from pathlib import Path
from typing import List
from tqdm.auto import tqdm

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# тише, HF
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
warnings.filterwarnings("ignore", category=UserWarning, module="huggingface_hub.utils._auth")

# источник корпуса
SRC = Path("programs_corpus.json")
assert SRC.exists(), "Нет programs_corpus.json (собери склейкой перед этим шагом)."

# ——— кэш индексов на Google Drive (если доступен), иначе локально ———
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    CACHE_DIR = Path("/content/drive/MyDrive/MyDrive_ITMO/BOT/index_cache")
except Exception:
    CACHE_DIR = Path("index_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

IDX_FILE_ONLY = CACHE_DIR / "faiss_file_only"   # индекс только file_text
IDX_ALL       = CACHE_DIR / "faiss_all"         # индекс file_text + html_text

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[i] device: {device}")

emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device},
)

data = json.loads(SRC.read_text(encoding="utf-8"))

def make_docs(entry: dict, key: str) -> List[Document]:
    arr = entry.get(key) or []
    docs: List[Document] = []
    for s in arr:
        if isinstance(s, str):
            s = s.strip()
            if s:
                docs.append(Document(page_content=s, metadata={"program": entry.get("program"), "src": key}))
    return docs

# крупнее чанки → меньше эмбеддингов (быстрее индекс)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)

def embed_with_progress(texts: List[str], batch_size: int = 128) -> List[List[float]]:
    vectors: List[List[float]] = []
    total = len(texts)
    if total == 0:
        return vectors
    pbar = tqdm(total=total, desc="Embedding", unit="chunk")
    t0 = time.time()
    for i in range(0, total, batch_size):
        batch = texts[i:i+batch_size]
        vecs = emb.embed_documents(batch)
        vectors.extend(vecs)
        pbar.update(len(batch))
        done = i + len(batch)
        elapsed = time.time() - t0
        rate = done / max(elapsed, 1e-6)
        remain = (total - done) / max(rate, 1e-6)
        pbar.set_postfix_str(f"rate≈{rate:.1f}/s eta≈{remain:,.0f}s")
    pbar.close()
    return vectors

# ——— правильная проверка наличия кэша FAISS ———
def _faiss_cache_ok(path: Path) -> bool:
    return (path / "index.pkl").exists() and (path / "index.faiss").exists()

def build_or_load(index_path: Path, docs: List[Document], title: str) -> FAISS:
    """Если кэш есть — грузим, иначе строим и сохраняем."""
    index_path.mkdir(parents=True, exist_ok=True)
    if _faiss_cache_ok(index_path):
        print(f"[OK] load cache → {index_path}")
        return FAISS.load_local(str(index_path), emb, allow_dangerous_deserialization=True)

    print(f"\n[i] build index → {index_path}  ({title})")
    t0 = time.time()

    print("[1/3] split documents …")
    chunks = splitter.split_documents(docs)
    print(f"     chunks: {len(chunks)}")

    print("[2/3] compute embeddings …")
    texts = [d.page_content for d in chunks]
    metas = [d.metadata     for d in chunks]
    vectors = embed_with_progress(texts, batch_size=128)

    print("[3/3] build FAISS …")
    # новая сигнатура: передаём пары (text, embedding)
    text_embeddings = list(zip(texts, vectors))
    vs = FAISS.from_embeddings(text_embeddings=text_embeddings,
                               embedding=emb,
                               metadatas=metas)
    vs.save_local(str(index_path))

    dt = time.time() - t0
    print(f"[OK] built in {dt:.1f}s | chunks={len(chunks)} | saved → {index_path}")
    return vs

# ---------- сбор исходных доков ----------
docs_file: List[Document] = []
for key in ("ai", "ai_product"):
    docs_file += make_docs(data[key], "file_text")

docs_all: List[Document] = []
for key in ("ai", "ai_product"):
    docs_all += make_docs(data[key], "file_text") + make_docs(data[key], "html_text")

print(f"[i] docs (file_text): {len(docs_file)} | docs (all): {len(docs_all)}")

# ---------- построение/загрузка индексов ----------
vs_file = build_or_load(IDX_FILE_ONLY, docs_file, title="file_text only (PDF/DOCX приоритет)")
vs_all  = build_or_load(IDX_ALL,       docs_all,  title="file_text + html_text")

print(f"\n[READY] cache dir: {CACHE_DIR.resolve()}")
for p in sorted(CACHE_DIR.glob("*")):
    try:
        size = sum(f.stat().st_size for f in p.glob("**/*") if f.is_file())
        print(" -", p.name, size//1024, "KB")
    except Exception:
        print(" -", p.name)


# TG‑бот на кеше (file‑first retriever)

In [None]:
# ====== ACCESS BLOCK: вводим ТОЛЬКО Authorization key (Сбер) ======
import os, re, base64, binascii, unicodedata, requests, urllib3
from getpass import getpass
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SCOPE = "GIGACHAT_API_PERS"

# !!! ЗАМЕНИ на НОВЫЙ токен после перевыпуска у @BotFather !!!
TG_TOKEN_RAW = "7858029775:AAG2qpyCW--2ZeBto3Hzy7KLwX70RPWUemU"

def _mask(s, h=6, t=4):
    s = str(s)
    return (s[:h] + "…" + s[-t:]) if len(s) > h+t else s

raw_inp = getpass("Вставь Authorization key (base64, можно с 'Basic'): ").strip()
raw_inp = unicodedata.normalize("NFKC", raw_inp).replace("\u200b","").replace("\u2060","")

# Явно отлавливаем, если ты вдруг вставил TG-токен (формат: цифры:буквы_дефисы)
if re.fullmatch(r"\d+:[A-Za-z0-9_\-]{30,}", raw_inp):
    raise SystemExit("[ERR] Это Telegram Bot Token, а не ключ Сбера. Вставь Authorization key из кабинета (base64(client_id:client_secret)).")

m = re.search(r'(?i)basic\s+([A-Za-z0-9+/=\s]+)$', raw_inp)
auth_b64 = m.group(1) if m else raw_inp
auth_b64 = re.sub(r'[\r\n\t "\']+', '', auth_b64)
auth_b64 = re.sub(r'[^A-Za-z0-9+/=]', '', auth_b64)

# валидация: внутри должен быть "client_id:client_secret"
try:
    decoded = base64.b64decode(auth_b64, validate=True).decode("utf-8", "ignore")
    assert ":" in decoded and all(decoded.split(":",1)), "decoded not client_id:client_secret"
except Exception as e:
    raise SystemExit(f"[ERR] Authorization key повреждён: {e}. Скопируй строку из поля 'Authorization key' в кабинете Сбера.")

AUTH_BASIC = f"Basic {auth_b64}"

# 2) Кладём в ENV
tg = unicodedata.normalize("NFKC", TG_TOKEN_RAW).strip().strip("'\"`").replace("\u200b","").replace("\u2060","")
os.environ["TELEGRAM_BOT_TOKEN"] = tg
os.environ["GIGA_AUTH_BASIC"] = AUTH_BASIC

# 3) Мини-тест Telegram
r = requests.get(f"https://api.telegram.org/bot{tg}/getMe", timeout=15)
print("TG getMe:", r.status_code, ("OK" if r.ok else r.text))
print("TG token head/tail:", _mask(tg))

# 4) Мини-тест OAuth (Сбер)
resp = requests.post(
    "https://ngw.devices.sberbank.ru:9443/api/v2/oauth",
    headers={
        "Authorization": AUTH_BASIC,
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json",
        "RqUID": "00000000-0000-0000-0000-000000000001",
    },
    data={"scope": SCOPE},
    timeout=30, verify=False
)
print("OAuth preflight:", resp.status_code, resp.text[:160])
if resp.status_code != 200:
    raise SystemExit(f"[ERR] OAuth {resp.status_code}: {resp.text}")

print("AUTH ok:", "Basic", _mask(auth_b64), "| client_id:", _mask(decoded.split(':',1)[0]))
print("Доступы готовы. Ниже запускай блок с ботом.")


In [None]:
import os, re, json, csv
from pathlib import Path

# ---------- ПУТИ ----------
GDRIVE_DIR = Path("/content/drive/MyDrive/MyDrive_ITMO/BOT")
RAW_CANDIDATES = [
    GDRIVE_DIR / "contacts.csv",
    Path("/mnt/data/contacts.csv"),
    Path("contacts.csv"),
]
OUT_JSON     = GDRIVE_DIR / "contacts.json"           # сюда бот смотрит
BACKUP_JSON  = GDRIVE_DIR / "contacts_raw_backup.json"  # полный нормализованный дамп (для проверки)

GDRIVE_DIR.mkdir(parents=True, exist_ok=True)

# ---------- ПАРАМЕТРЫ ----------
TOP_K_PER_PROGRAM = 4         # сколько контактов на программу оставить
KEEP_ONLY_MANAGERISH = True   # стараться оставлять менеджеров/координаторов

# ---------- HELPERS ----------
def _norm_text(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"[\u200b\u2060]", "", s)     # zero-width
    s = re.sub(r"\s+", " ", s)
    return s

def _norm_phone(s: str) -> str:
    s = (s or "").strip()
    digits = re.sub(r"\D", "", s)
    if not digits:
        return ""
    if digits.startswith("8"):
        digits = "7" + digits[1:]
    if digits.startswith("7") and len(digits) == 11:
        return f"+7 ({digits[1:4]}) {digits[4:7]}-{digits[7:9]}-{digits[9:11]}"
    return s

# ключевые слова
AI_PAT  = re.compile(r"(искусственн|ИИ\b|AI\b|artificial|интеллект|машинн|ML\b)", re.I)
PM_PAT  = re.compile(r"(управлени[ея]\s*ИИ|ИИ[-\s]?продукт|AI[-\s]?product|product\s+management|продакт|продуктовый менедж)", re.I)
URL_AI  = re.compile(r"program/master/ai\b", re.I)
URL_PM  = re.compile(r"program/master/ai_product\b", re.I)

# хорошие/плохие роли
ROLE_GOOD = re.compile(r"(менедж|координатор|куратор|руководител[ья]|администратор программ|program\s*manager|coordinator)", re.I)
ROLE_BAD  = re.compile(r"(при[её]мн|call[-\s]?center|общий|горячая линия|help|поддержк|пресс|канцеляр|секретариат|media|smm)", re.I)

# сопоставление колонок CSV твоим названиям
COLMAPS = {
    "name":  ["name","fio","ФИО","title","имя"],
    "role":  ["role","position","должность","роль"],
    "email": ["email","почта","e-mail","mail"],
    "phone": ["phone","телефон","tel","номер"],
    "page":  ["page","url","ссылка","link","site"],
    "program":["program","программа","prog"],
}

def _pick(row: dict, keys) -> str:
    for k in keys:
        if k in row and str(row[k]).strip():
            return str(row[k])
    return ""

def _load_raw_csv():
    src = None
    for p in RAW_CANDIDATES:
        if p.exists():
            src = p
            break
    if not src:
        raise SystemExit("contacts.csv не найден")
    # читаем надёжно, даже если порядок/названия столбцов странные
    with src.open("r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        rows = [ {k.strip(): (v or "").strip() for k,v in r.items()} for r in reader ]
    print(f"[OK] raw contacts loaded: {len(rows)} rows from {src}")
    return rows

def guess_program(blob: str, url: str) -> str:
    if URL_AI.search(url): return "ai"
    if URL_PM.search(url): return "ai_product"
    ai = bool(AI_PAT.search(blob))
    pm = bool(PM_PAT.search(blob))
    if ai and not pm: return "ai"
    if pm and not ai: return "ai_product"
    return "generic"

def is_managerish(role: str) -> bool:
    return bool(ROLE_GOOD.search(role or "") or "manager" in (role or "").lower())

def score_contact(rec: dict) -> int:
    score = 0
    role = rec.get("role","")
    email = (rec.get("email") or "").lower()
    url   = (rec.get("page") or "").lower()
    prog  = rec.get("program") or "generic"

    if ROLE_GOOD.search(role): score += 3
    if "itmo.ru" in email: score += 1
    if rec.get("phone"): score += 1
    if ROLE_BAD.search(role): score -= 2
    if prog == "ai" and URL_AI.search(url): score += 3
    if prog == "ai_product" and URL_PM.search(url): score += 3
    if prog == "generic": score -= 1
    return score

# ---------- ЧТЕНИЕ И НОРМАЛИЗАЦИЯ ----------
raw_rows = _load_raw_csv()
norm_rows = []
for row in raw_rows:
    r = {k: _pick(row, ks) for k, ks in COLMAPS.items()}
    r = {k: _norm_text(v) for k, v in r.items()}
    r["phone"] = _norm_phone(r["phone"])
    blob = " ".join(r.values())
    r["program"] = r.get("program") or guess_program(blob, r.get("page",""))
    # выкидываем явно мусорные записи (ни почты, ни телефона)
    if not r.get("email") and not r.get("phone"):
        continue
    # фильтруем пресс-службы/общие линии
    if ROLE_BAD.search(r.get("role","")):
        continue
    norm_rows.append(r)

# ---------- ДЕДУП ПО email/phone/name+role С ВЫБОРОМ ЛУЧШЕГО СКОРА ----------
bykey = {}
for r in norm_rows:
    key = (r.get("email","").lower(), r.get("phone",""), r.get("name","").lower(), r.get("role","").lower())
    s = score_contact(r)
    old = bykey.get(key)
    if (not old) or (s > old["score"]):
        r["score"] = s
        bykey[key] = r
clean = list(bykey.values())

# ---------- РАЗБИВКА ПО ПРОГРАММАМ И TOP-K ----------
def topk(pool, k):
    if not pool: return []
    pool = sorted(pool, key=lambda x: x.get("score",0), reverse=True)
    if KEEP_ONLY_MANAGERISH:
        mgr = [r for r in pool if is_managerish(r.get("role",""))]
        if len(mgr) >= k:
            return mgr[:k]
        need = k - len(mgr)
        rest = [r for r in pool if r not in mgr]
        return mgr + rest[:need]
    return pool[:k]

ai_pool      = [r for r in clean if r["program"] == "ai"]
pm_pool      = [r for r in clean if r["program"] == "ai_product"]
generic_pool = [r for r in clean if r["program"] == "generic"]

final_list = topk(ai_pool, TOP_K_PER_PROGRAM) + topk(pm_pool, TOP_K_PER_PROGRAM)
if len(final_list) < TOP_K_PER_PROGRAM:  # добираем generic если пусто
    need = TOP_K_PER_PROGRAM - len(final_list)
    final_list += topk(generic_pool, need)

# ---------- РУЧНЫЕ ФИКСЫ (опционально) ----------
MANUAL_OVERRIDES = [
      {
       "name": "Елизавета Василенко",
       "role": "Менеджер программы",
       "email": "aitalents@itmo.ru",
       "phone": "+7 (993) 639-86-77",
       "page": "https://abit.itmo.ru/program/master/ai",
       "program": "ai"
     },
      {
       "name": "Регина Ильдаровна Абдрашитова",
       "role": "Менеджер программы",
       "email": "aiproduct@itmo.ru",
       "phone": "+7 (999) 526-79-88",
       "page": "https://abit.itmo.ru/program/master/ai_product",
       "program": "ai_product"
      },
]
# применяем ручные фиксы (замещают по email, если совпадение найдено)
email_index = { (r.get("email") or "").lower() for r in final_list }
for r in MANUAL_OVERRIDES:
    e = (r.get("email") or "").lower()
    r = {**r}  # копия
    r["name"]  = _norm_text(r.get("name",""))
    r["role"]  = _norm_text(r.get("role",""))
    r["phone"] = _norm_phone(r.get("phone",""))
    r["page"]  = _norm_text(r.get("page",""))
    r["program"] = (r.get("program") or "generic").strip()
    if e and e in email_index:
        final_list = [r if (x.get("email","").lower()==e) else x for x in final_list]
    else:
        final_list.append(r)

# ---------- SAVE ----------
# Бэкап всего нормализованного набора (для ревью)
BACKUP_JSON.write_text(json.dumps(clean, ensure_ascii=False, indent=2), encoding="utf-8")
# Итог, который читает бот
OUT_JSON.write_text(json.dumps(final_list, ensure_ascii=False, indent=2), encoding="utf-8")

# ---------- LOG ----------
print(f"[OK] normalized: {len(norm_rows)} → dedup: {len(clean)} → final: {len(final_list)}")
for r in final_list:
    print(" -", (r.get("program") or "?"), "|", r.get("name","?"), "|", r.get("role","?"),
          "|", r.get("email",""), "|", r.get("phone",""))

print("\nSaved for bot  →", OUT_JSON)
print("Backup (all normalized) →", BACKUP_JSON)


In [None]:
# ===== ITMO RAG bot — full single cell (tokens already in ENV) =====
!pip -q install langchain langchain-community langchain-huggingface faiss-cpu python-telegram-bot requests nest_asyncio

import os, re, json, time, warnings, requests, nest_asyncio, asyncio, urllib3
from pathlib import Path
from typing import Optional, List, Dict

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.llms.base import LLM

from telegram import Update
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, MessageHandler, filters

# ---- quiet noisy libs
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="huggingface_hub.utils._auth")
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# ---- tokens from ENV
TG_TOKEN   = os.environ["TELEGRAM_BOT_TOKEN"].strip()
AUTH_BASIC = os.environ["GIGA_AUTH_BASIC"].strip()
assert TG_TOKEN and AUTH_BASIC.startswith("Basic "), "ENV tokens missing"

# ---- FAISS indexes (built earlier)
def _cache_ok(p: Path) -> bool:
    return (p / "index.pkl").exists() and (p / "index.faiss").exists()

CACHE_DIR = Path("index_cache")
if not (_cache_ok(CACHE_DIR / "faiss_file_only") and _cache_ok(CACHE_DIR / "faiss_all")):
    alt = Path("/content/drive/MyDrive/MyDrive_ITMO/BOT/index_cache")
    if _cache_ok(alt / "faiss_file_only") and _cache_ok(alt / "faiss_all"):
        CACHE_DIR = alt
    else:
        raise SystemExit("FAISS cache not found. Build indexes first.")

IDX_FILE_ONLY = CACHE_DIR / "faiss_file_only"
IDX_ALL       = CACHE_DIR / "faiss_all"

# ---- embeddings (same as for build)
try:
    import torch
    device = "cuda" if torch.cuda.is_available() else "cpu"
except Exception:
    device = "cpu"

emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device},
)

# ---- load indexes
vs_file = FAISS.load_local(str(IDX_FILE_ONLY), emb, allow_dangerous_deserialization=True)
vs_all  = FAISS.load_local(str(IDX_ALL),       emb, allow_dangerous_deserialization=True)
print(f"[OK] indexes loaded from {CACHE_DIR.resolve()} | device={device}")

# =========================================================
#                 Contacts: load contacts.json
# =========================================================
CONTACTS_PATHS = [
    Path("/content/drive/MyDrive/MyDrive_ITMO/BOT/contacts.json"),
    Path("/mnt/data/contacts.json"),
    Path("contacts.json"),
]
CONTACTS: List[Dict] = []
for p in CONTACTS_PATHS:
    if p.exists():
        try:
            CONTACTS = json.loads(p.read_text(encoding="utf-8"))
            print(f"[OK] contacts loaded: {len(CONTACTS)} from {p}")
            break
        except Exception as e:
            print("[WARN] failed to load contacts:", e)
if not CONTACTS:
    print("[WARN] contacts.json not found — контакты отвечать не смогу")

# ---- contact intent + routing
RE_CONTACT_INTENT = re.compile(r"(контакт|телефон|почт|email|e-mail|e mail|менедж|координатор|manager)", re.I)
RE_PROD = re.compile(r"(управлени|продукт|продакт|ai[-\s]?product|product\s*management)", re.I)
RE_AI   = re.compile(r"(искусствен|интеллект|нейросет|машинн|AI\b|ML\b)", re.I)

def detect_program(text: str) -> List[str]:
    text = text or ""
    prog = []
    if RE_PROD.search(text): prog.append("ai_product")
    if RE_AI.search(text):   prog.append("ai")
    # если не смогли однозначно — покажем обе программы
    return list(dict.fromkeys(prog)) or ["ai","ai_product"]

def try_contacts_answer(user_text: str) -> Optional[str]:
    """Если похоже на запрос контактов — вернуть готовый ответ (str), иначе None."""
    if not CONTACTS or not RE_CONTACT_INTENT.search(user_text or ""):
        return None

    progs = set(detect_program(user_text))
    pool = [c for c in CONTACTS if (c.get("program") in progs)]
    if not pool:
        pool = CONTACTS  # на всякий случай

    # приоритизация менеджеров/координаторов и полноты данных
    def _score(c):
        role = c.get("role") or ""
        s = 0
        if re.search(r"(менедж|координатор|куратор|руководител|program\s*manager|coordinat)", role, re.I): s += 5
        if c.get("phone"): s += 2
        if c.get("email"): s += 1
        if c.get("program") in progs: s += 1
        return -s

    pool = sorted(pool, key=_score)[:6]
    if not pool:
        return "Пока не вижу валидных контактов в справочнике. Убедись, что contacts.json сформирован и подхватывается."

    head = "Контакты по программе{}:\n".format(
        "м " + ", ".join({"ИИ‑продукт" if p=="ai_product" else "Искусственный интеллект" for p in progs})
    )
    lines = []
    for c in pool:
        n  = c.get("name")  or "—"
        r  = c.get("role")  or "—"
        ph = c.get("phone") or "—"
        em = c.get("email") or "—"
        pg = "ИИ‑продукт" if c.get("program")=="ai_product" else ("Искусственный интеллект" if c.get("program")=="ai" else "—")
        lines.append(f"• [{pg}] {n} — {r}\n   тел.: {ph} | email: {em}")
    return head + "\n".join(lines)

# =========================================================
#                  GigaChat LLM wrapper
# =========================================================
_ACCESS_TOKEN, _TS = None, 0.0
def get_access_token(force: bool = False) -> str:
    """OAuth with in-memory caching (30 min)."""
    global _ACCESS_TOKEN, _TS
    if _ACCESS_TOKEN and not force and time.time() - _TS < 1800:
        return _ACCESS_TOKEN
    r = requests.post(
        "https://ngw.devices.sberbank.ru:9443/api/v2/oauth",
        headers={
            "Authorization": AUTH_BASIC,
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/json",
            "RqUID": "00000000-0000-0000-0000-000000000001",
        },
        data={"scope":"GIGACHAT_API_PERS"},
        timeout=30, verify=False
    )
    if r.status_code != 200:
        raise RuntimeError(f"OAuth {r.status_code}: {r.text}")
    _ACCESS_TOKEN = r.json()["access_token"]; _TS = time.time()
    return _ACCESS_TOKEN

class GigaChatLLM(LLM):
    @property
    def _llm_type(self): return "gigachat"
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        token = get_access_token()
        headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
        payload = {
            "model": "GigaChat:latest",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.40
        }
        r = requests.post("https://gigachat.devices.sberbank.ru/api/v1/chat/completions",
                          headers=headers, json=payload, timeout=60, verify=False)
        if r.status_code != 200:
            token = get_access_token(force=True)
            headers["Authorization"] = f"Bearer {token}"
            r = requests.post("https://gigachat.devices.sberbank.ru/api/v1/chat/completions",
                              headers=headers, json=payload, timeout=60, verify=False)
            if r.status_code != 200:
                raise RuntimeError(f"GigaChat {r.status_code}: {r.text}")
        data = r.json()
        return data.get("choices", [{}])[0].get("message", {}).get("content", "").strip()

llm = GigaChatLLM()

# =========================================================
#                Memory: summary + packed history
# =========================================================
HISTORY_CHARS = 3500
SUMMARY_EVERY = 6

def pack_history(history: List[Dict], summary: Optional[str], limit: int = HISTORY_CHARS) -> str:
    lines = []
    if summary:
        lines.append("[САММАРИ]\n" + summary.strip())
    for m in history[-12:]:
        role = "Пользователь" if m["role"] == "user" else "Ассистент"
        lines.append(f"{role}: {m['content']}")
    text = "\n".join(lines).strip()
    if len(text) > limit:
        text = text[-limit:]
    return text

def summarize_history(history: List[Dict], prev: Optional[str]) -> str:
    chunk = "\n".join([("П:" if m["role"]=="user" else "А:")+m["content"] for m in history[-12:]]) or "—"
    prompt = (
        "Сверни диалог в краткий конспект для консультанта по магистратурам ИТМО "
        "(5–7 тезисов: критерии выбора, ответы, уточнения; без воды).\n\n"
        f"[ПРЕДЫДУЩЕЕ САММАРИ]\n{prev or '—'}\n\n[НОВОЕ]\n{chunk}\n\n[ИТОГ]:"
    )
    return (llm(prompt) or "").strip()

# =========================================================
#          Retrieval: MMR + backoff rephrase via LLM
# =========================================================
def retrieve_multi(queries: List[str], k: int = 12, fetch_k: int = 48, _backoff_done: bool = False):
    """MMR из обоих индексов + дедуп. При пустом результате — одна бэкофф‑перефраза."""
    def _mmr(store, q):
        try:
            retr = store.as_retriever(search_type="mmr",
                                      search_kwargs={"k": k, "fetch_k": fetch_k, "lambda_mult": 0.7})
            return retr.get_relevant_documents(q)
        except Exception:
            return store.as_retriever(search_kwargs={"k": k}).get_relevant_documents(q)

    queries = [q for q in (queries or []) if q]
    seen, out = set(), []
    for q in queries:
        for store in (vs_file, vs_all):
            for d in _mmr(store, q):
                if d.page_content not in seen:
                    seen.add(d.page_content); out.append(d)
            if len(out) >= k: break
        if len(out) >= k: break

    if out or _backoff_done:
        return out[:k]

    # backoff: одна перефраза запросов
    reform = (llm("Переформулируй поисковый запрос по официальным документам магистратур ИТМО из этого текста одной строкой: "
                  + " / ".join(queries)) or "").strip()
    if reform:
        return retrieve_multi([reform], k=k, fetch_k=fetch_k, _backoff_done=True)[:k]
    return []

# =========================================================
#        Planner: LLM строит queries для документов
# =========================================================
def _json_extract(s: str) -> dict:
    m = re.search(r"\{.*\}", s, flags=re.S)
    try: return json.loads(m.group(0)) if m else {}
    except json.JSONDecodeError: return {}

PLAN_PROMPT = """
Ты помогаешь консультировать по магистратурам ИТМО: «Искусственный интеллект» и «Управление ИИ-продуктом».
Тебе дан текст пользователя и краткая история диалога.

Задача:
1) Определи, можно ли связать вопрос с этими програмами и их документами: related = true|false.
2) Если related=true — придумай 3–6 поисковых формулировок (по-русски, терминология учебных планов, дисциплин,
  проектов, итоговой аттестации, поступления).
3) Если related=false — queries=[].

Верни СТРОГО JSON:
{ "related": true, "queries": ["...","..."] }
"""

def build_plan(user_text: str, history: List[Dict]) -> dict:
    hist = "\n".join([("П" if m["role"]=="user" else "А")+": "+m["content"] for m in history[-6:]]) or "—"
    prompt = PLAN_PROMPT + f"\n[ИСТОРИЯ]\n{hist}\n[ПОЛЬЗОВАТЕЛЬ]\n{user_text}\nJSON:"
    raw = (llm(prompt) or "").strip()
    data = _json_extract(raw) or {}
    related = bool(data.get("related", True))
    queries = [q.strip() for q in (data.get("queries") or []) if q and q.strip()]
    return {"related": related, "queries": queries}

# =========================================================
#             Evidence pack + light fact guard
# =========================================================
IMPORTANT_KWS = ["онлайн", "очно", "очно-заочно", "дистанц", "стоимость", "цена", "язык", "англ", "русск", "семестр",
                 "ECTS", "з.е.", "зачётн", "срок", "год", "2 года", "4 семестр", "очная"]

def evidence_pack(docs, n=3):
    snips = []
    seen = set()
    for d in docs:
        t = (d.page_content or "").strip()
        if 50 <= len(t) <= 220 and t not in seen:
            seen.add(t)
            src = d.metadata.get("src","?")
            prog = d.metadata.get("program","?")
            snips.append(f"• [{prog} / {src}] {t}")
            if len(snips) >= n:
                break
    return "\n".join(snips)

def fact_guard(answer: str, context_text: str) -> List[str]:
    miss = []
    ans_l = (answer or "").lower()
    ctx_l = (context_text or "").lower()
    for kw in IMPORTANT_KWS:
        if kw in ans_l and kw not in ctx_l:
            miss.append(kw)
    return miss

# =========================================================
#                     Answer generator
# =========================================================
def answer_always(user_text: str, history: List[Dict], summary: Optional[str], mode: str = "brief", show_evidence: bool = False) -> str:
    user_text = (user_text or "").strip()
    plan = build_plan(user_text, history)
    queries = (plan["queries"] or []) + [user_text]
    docs = retrieve_multi(queries, k=12)

    hist_block = pack_history(history, summary, HISTORY_CHARS)

    if docs:
        context = "\n\n---\n\n".join(d.page_content for d in docs)
        style = "Коротко (2–4 предложения)" if mode == "brief" else "Чуть подробнее (4–7 предложений)"
        prompt = (
            "Ты консультант по двум магистратурам ИТМО: «Искусственный интеллект» и «Управление ИИ‑продуктом».\n"
            f"Говори живо. {style}. Факты бери ТОЛЬКО из [КОНТЕКСТ].\n"
            "Если чего-то нет в контексте — честно скажи и предложи, как сузить вопрос.\n\n"
            f"[ИСТОРИЯ]\n{hist_block or '—'}\n\n"
            f"[КОНТЕКСТ]\n{context}\n\n"
            f"[ВОПРОС]\n{user_text}\n\n"
            "Ответ:"
        )
        out = (llm(prompt) or "").strip() or "В документах такой информации нет."

        miss = fact_guard(out, context)
        if miss:
            out += "\n\n⚠️ По этим пунктам в найденных документах нет прямого упоминания: " + ", ".join(sorted(set(miss))) + "."
            out += " Уточни формулировку (дисциплины, формат, сроки, язык, стоимость) — проверю ещё раз."

        if show_evidence:
            out += "\n\n🔎 Основано на:\n" + (evidence_pack(docs, n=4) or "—")
        return out

    if plan["related"]:
        return ("Похоже, прямого совпадения в документах нет по такой формулировке. "
                "Сузь запрос до дисциплин, учебного плана, проектной/итоговой аттестации, поступления или различий программ — я проверю.")
    else:
        return ("Здесь консультирую только по магистратурам ИТМО «Искусственный интеллект» и «Управление ИИ‑продуктом». "
                "Спроси про дисциплины, учебный план, проекты/защиту, поступление или различия программ — отвечу по документам.")

# =========================================================
#                   Telegram handlers
# =========================================================
nest_asyncio.apply()

async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    context.user_data["history"] = []
    context.user_data["summary"] = None
    context.user_data["mode"] = "brief"
    context.user_data["evidence"] = False
    await update.message.reply_text(
        "Привет! Я консультирую по магистратурам ИТМО: «Искусственный интеллект» и «Управление ИИ‑продуктом».\n"
        "Задавай вопросы: дисциплины, учебный план, проекты, поступление, отличия.\n"
        "Команды: /evidence — цитаты из документов, /brief /detailed, /reset, /help."
    )

async def on_msg(update: Update, context: ContextTypes.DEFAULT_TYPE):
    text = (update.message.text or "").strip()
    history = context.user_data.setdefault("history", [])
    summary = context.user_data.get("summary")
    mode = context.user_data.get("mode", "brief")
    show_evidence = bool(context.user_data.get("evidence", False))

    # ---- быстрый маршрут по contacts.json
    contact_reply = try_contacts_answer(text)
    if contact_reply:
        history += [{"role":"user","content":text},{"role":"assistant","content":contact_reply}]
        context.user_data["history"] = history
        await update.message.reply_text(contact_reply)
        # делаем саммари по расписанию
        if len(history) // 2 % SUMMARY_EVERY == 0 and len(history) >= 2*SUMMARY_EVERY:
            context.user_data["summary"] = summarize_history(history, summary)
        return

    # ---- обычный RAG‑ответ
    reply = answer_always(text, history, summary, mode=mode, show_evidence=show_evidence)
    history += [{"role":"user","content":text},{"role":"assistant","content":reply}]
    context.user_data["history"] = history
    await update.message.reply_text(reply)

    if len(history) // 2 % SUMMARY_EVERY == 0 and len(history) >= 2*SUMMARY_EVERY:
        context.user_data["summary"] = summarize_history(history, summary)

async def cmd_evidence(update: Update, context: ContextTypes.DEFAULT_TYPE):
    context.user_data["evidence"] = not context.user_data.get("evidence", False)
    await update.message.reply_text("Evidence: " + ("ON" if context.user_data["evidence"] else "OFF"))

async def cmd_brief(update: Update, context: ContextTypes.DEFAULT_TYPE):
    context.user_data["mode"] = "brief"
    await update.message.reply_text("Режим: кратко.")

async def cmd_detailed(update: Update, context: ContextTypes.DEFAULT_TYPE):
    context.user_data["mode"] = "detailed"
    await update.message.reply_text("Режим: подробнее.")

async def cmd_reset(update: Update, context: ContextTypes.DEFAULT_TYPE):
    context.user_data["history"] = []
    context.user_data["summary"] = None
    await update.message.reply_text("История очищена.")

async def cmd_help(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await update.message.reply_text(
        "/evidence — вкл/выкл цитаты из документов\n"
        "/brief — короткие ответы\n"
        "/detailed — ответы подробнее\n"
        "/reset — очистить историю\n"
        "/help — помощь"
    )

async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> None:
    print("Handler error:", repr(context.error))

async def main():
    app = ApplicationBuilder().token(TG_TOKEN).build()
    app.add_handler(CommandHandler("start", start))
    app.add_handler(CommandHandler("evidence", cmd_evidence))
    app.add_handler(CommandHandler("brief", cmd_brief))
    app.add_handler(CommandHandler("detailed", cmd_detailed))
    app.add_handler(CommandHandler("reset", cmd_reset))
    app.add_handler(CommandHandler("help", cmd_help))
    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, on_msg))
    app.add_error_handler(error_handler)
    print("Бот запущен. Ждём сообщения…")
    await app.run_polling()

await main()
