In [23]:
!pip -q install sentence-transformers faiss-cpu scikit-learn rapidfuzz unidecode

!pip -q install spacy && python -m spacy download ru_core_news_md

Collecting ru-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.8.0/ru_core_news_md-3.8.0-py3-none-any.whl (41.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Бойлерплейт: скиллы, конфиг ранжирования

In [24]:
from dataclasses import dataclass
from typing import Dict, Any, List, Tuple
import re, json, os
from pathlib import Path
from unidecode import unidecode
from rapidfuzz import fuzz, process

# Identifiers of every skill the assistant can route a request to.
SKILLS = [
    "music.play",
    "music.stop",
    "time.now",
    "weather.get",
    "alarm.set",
    "reminder.add",
    "news.get",
    "math.calculate",
    "jokes.tell",
    "timer.start",
    "system.help",
]

# Per-skill prior weight; rank_skill multiplies it by the "gamma" coefficient.
SKILL_BASE_WEIGHT = {
    "music.play": 0.6,
    "music.stop": 0.5,
    "time.now": 0.4,
    "weather.get": 0.6,
    "alarm.set": 0.5,
    "reminder.add": 0.5,
    "news.get": 0.4,
    "math.calculate": 0.4,
    "jokes.tell": 0.3,
    "timer.start": 0.5,
    "system.help": 0.1,
}

# Coefficients of the linear ranking score computed in rank_skill; tune as needed.
RANK_COEFF = {"alpha": 0.55, "beta": 0.25, "gamma": 0.15, "delta": 0.05}

# A best score below this value is treated as a low-confidence match.
THRESHOLD = 0.45


# Маскирование сущностей (простая версия)

In [25]:
# Lower-case city names recognised by the exact-word lookup in mask_entities.
CITY_LIST = ["москва", "питер", "санкт-петербург", "казань", "новосибирск", "екатеринбург", "париж", "берлин", "лондон"]

# Relative time words. The \b anchors stop the pattern from firing inside longer
# words (e.g. "сегодняшний"); "дн[её]м" accepts the spelling without "ё" as well.
# Exactly one capturing group is kept, as in the previous version of the pattern.
TIME_WORDS = r"\b(утром|вечером|дн[её]м|ночью|сегодня|завтра|послезавтра)\b"

# Duration units that may follow a number. Includes the accusative "-у" forms
# ("секунду", "минуту"); the trailing \b prevents matching only a prefix of a
# longer word and leaving its tail behind in the text.
UNITS = r"(секунд(?:а|ы|у|)|минут(?:а|ы|у|)|час(?:а|ов|)|день|дня|дней)\b"

def normalize(text:str)->str:
    """Return ``text`` with surrounding whitespace removed and lower-cased.

    No transliteration is applied here: every character other than the
    trimmed outer whitespace is kept and only its case is changed.
    """
    return text.strip().lower()

def mask_entities(text:str)->Tuple[str, Dict[str, Any]]:
    """Replace recognised entities in ``text`` with placeholders and collect their values.

    The text is passed through ``normalize`` and then rewritten in this order:
    number + unit durations (``{duration}``), HH:MM times (``{time}``),
    DD.MM.YYYY dates (``{date}``), cities from ``CITY_LIST`` (``{city}``),
    words matched by ``TIME_WORDS`` (``{time}``) and, last, everything that
    follows the verbs «включи» / «воспроизведи» (``{song}``).

    Because the music rule runs last and is greedy, placeholders inserted
    after the verb by earlier rules disappear from the masked text; their
    values are still present in the returned slots.

    Args:
        text: Raw user utterance.

    Returns:
        A ``(masked, slots)`` tuple. ``masked`` is the rewritten text with
        runs of whitespace collapsed. ``slots`` maps a slot name
        (``duration``, ``time``, ``date``, ``city``, ``timeword``, ``song``)
        to the list of matched values in the order they were found; a name
        is only present when something matched.
    """
    slots: Dict[str, Any] = {}
    t = " " + normalize(text) + " "

    def _mask_song(m):
        # Record what was requested instead of discarding it. Placeholders
        # produced by the earlier rules are not part of the title, so drop them.
        title = re.sub(r"\{\w+\}", " ", m.group(2))
        title = re.sub(r"\s+", " ", title).strip()
        if title:
            slots.setdefault("song", []).append(title)
        return m.group(1) + " {song}"

    # numbers followed by a duration unit
    t = re.sub(r"(\d+)\s*(" + UNITS + r")", lambda m: _store(slots, "duration", m.group(0), "{duration}"), t)

    # time in HH:MM form
    t = re.sub(r"\b([01]?\d|2[0-3]):[0-5]\d\b", lambda m: _store(slots, "time", m.group(0), "{time}"), t)

    # date such as 12.11.2025
    t = re.sub(r"\b([0-3]?\d\.[01]?\d\.\d{4})\b", lambda m: _store(slots, "date", m.group(0), "{date}"), t)

    # cities (plain whole-word lookup)
    for c in CITY_LIST:
        t = re.sub(rf"\b{re.escape(c)}\b", lambda m: _store(slots, "city", m.group(0), "{city}"), t)

    # time keywords: stored under "timeword" but share the {time} placeholder
    t = re.sub(TIME_WORDS, lambda m: _store(slots, "timeword", m.group(0), "{time}"), t)

    # music, very rough: whatever follows «включи» / «воспроизведи» is the track
    t = re.sub(r"(включи|воспроизведи)\s+(.+)", _mask_song, t)

    t = re.sub(r"\s+", " ", t).strip()
    return t, slots

def _store(slots, key, value, placeholder):
    """Record ``value`` under ``slots[key]`` and return the replacement text.

    The value is stripped of surrounding whitespace and appended to the list
    stored at ``key`` (the list is created on first use). The returned string
    is ``placeholder`` padded with one space on each side.
    """
    slots.setdefault(key, []).append(value.strip())
    padded = " " + placeholder + " "
    return padded


# Пример LBD и подготовка эмбеддингов + FAISS

In [26]:
# JSON Lines file with the labelled examples; each record is read for its
# "text" and "skill" fields by the cells below.
LBD_PATH = Path("lbd.jsonl")

from sentence_transformers import SentenceTransformer
import numpy as np, faiss, json

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Load one JSON object per line. Blank lines (typically a trailing newline at
# the end of the file) are skipped instead of crashing json.loads.
LBD = []
with open(LBD_PATH, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        LBD.append(json.loads(line))

# Fail early with a clear message: an empty example set would otherwise only
# surface below as an obscure shape error when the index is created.
if not LBD:
    raise ValueError(f"No examples found in {LBD_PATH}")

lbd_texts = [row["text"] for row in LBD]
lbd_emb = model.encode(lbd_texts, normalize_embeddings=True, convert_to_numpy=True)

# Inner product over L2-normalised vectors is the cosine similarity.
index = faiss.IndexFlatIP(lbd_emb.shape[1])
index.add(lbd_emb)


# Простой классификатор интентов

In [27]:
# Synthetic training on the very same LBD examples; the resulting classifier
# feeds the "beta" term of the ranking score.
from sklearn.linear_model import LogisticRegression

X = lbd_emb
y = np.array([row["skill"] for row in LBD])

# Dense integer ids for the distinct skill labels, assigned in sorted order,
# together with the reverse lookup.
id2label = dict(enumerate(sorted(set(y))))
label2id = {label: idx for idx, label in id2label.items()}

y_ids = np.array([label2id[skill] for skill in y])

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y_ids)


# Инференс: поиск, ранжирование, слоты

In [28]:
def rank_skill(query:str, context:Dict[str,Any]=None)->Dict[str,Any]:
    """Choose the most likely skill for ``query``.

    The query is masked with ``mask_entities``, embedded, and compared with
    the LBD examples through the FAISS index (top 5 at most). Every retrieved
    example is a candidate and is scored as::

        alpha * cosine similarity between the query and the example
      + beta  * classifier probability of the candidate's skill
      + gamma * SKILL_BASE_WEIGHT of the candidate's skill (0.3 if absent)
      + delta * context boost

    The classifier probability and the context boost are evaluated per
    candidate, so both terms can actually change which candidate wins. The
    context boost is 0.05 when the candidate's domain (the part of the skill
    name before the first ".") equals the domain of ``context["last_skill"]``
    and 0.0 otherwise.

    Args:
        query: Raw user utterance.
        context: Optional dialogue state; only the ``"last_skill"`` key is read.

    Returns:
        A dict with the keys ``query``, ``masked``, ``skill``, ``confidence``
        (best score rounded to 3 decimals), ``slots``, ``matched_example`` and
        ``clf_top`` (the classifier's top label, or ``None`` when the
        classifier could not be evaluated). When no candidate reaches
        ``THRESHOLD`` the skill is ``"system.help"`` and ``matched_example``
        is ``None``.
    """
    import warnings

    masked, slots = mask_entities(query)
    q_emb = model.encode([masked], normalize_embeddings=True, convert_to_numpy=True)
    sims, idxs = index.search(q_emb, k=min(5, len(LBD)))  # top-K candidates
    sims, idxs = sims[0], idxs[0]

    # Classifier probability of every skill. This stays best-effort: on any
    # failure the beta term becomes 0, but the problem is reported, not hidden.
    proba_by_skill = {}
    clf_pred = None
    try:
        proba = clf.predict_proba(q_emb)[0]
        # Columns of predict_proba follow clf.classes_, which holds label ids.
        proba_by_skill = {id2label[int(c)]: float(p) for c, p in zip(clf.classes_, proba)}
        clf_pred = max(proba_by_skill, key=proba_by_skill.get)
    except Exception as exc:
        warnings.warn(f"intent classifier unavailable, beta term set to 0: {exc!r}")
        proba_by_skill, clf_pred = {}, None

    # Domain of the previously used skill, if the caller supplied one.
    last_domain = None
    if context and (last := context.get("last_skill")):
        last_domain = last.split(".")[0]

    # Score every candidate and keep the best one.
    best = None
    for sim, j in zip(sims, idxs):
        if j < 0:
            # FAISS pads missing results with -1; LBD[-1] would be a wrong hit.
            continue
        row = LBD[int(j)]
        cand_skill = row["skill"]
        same_domain = last_domain is not None and cand_skill.split(".")[0] == last_domain
        ctx_boost = 0.05 if same_domain else 0.0
        score = (RANK_COEFF["alpha"] * float(sim)
                 + RANK_COEFF["beta"] * proba_by_skill.get(cand_skill, 0.0)
                 + RANK_COEFF["gamma"] * SKILL_BASE_WEIGHT.get(cand_skill, 0.3)
                 + RANK_COEFF["delta"] * ctx_boost)
        if (best is None) or (score > best["score"]):
            best = dict(skill=cand_skill, score=score, example=row["text"])

    # Below the threshold (or with no candidates at all) fall back to help.
    if not best or best["score"] < THRESHOLD:
        best = dict(skill="system.help", score=best["score"] if best else 0.0, example=None)

    return {
        "query": query,
        "masked": masked,
        "skill": best["skill"],
        "confidence": round(float(best["score"]), 3),
        "slots": slots,
        "matched_example": best["example"],
        "clf_top": clf_pred
    }



# Тестируем

In [29]:
def ask(q):
    """Run ``rank_skill`` on ``q``, print a short report and return the raw result."""
    res = rank_skill(q)
    report = (
        ("Query:   ", q),
        ("Masked:  ", res["masked"]),
        ("Skill:   ", res["skill"], f"(conf={res['confidence']})"),
        ("Example: ", res["matched_example"]),
        ("Slots:   ", res["slots"]),
    )
    for fields in report:
        print(*fields)
    return res

# Example query
_ = ask("паставь дору")



Query:    паставь дору
Masked:   паставь дору
Skill:    music.play (conf=0.487)
Example:  воспроизведи {artist}
Slots:    {}
