In [11]:
import re
import json
import pickle
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
import random

# -----------------------------------
# 1. Өгөгдлийг унших
# -----------------------------------
# Accumulators that the preprocessing step below fills in.
words = []       # every token seen in any pattern (de-duplicated later)
classes = []     # intent tags
documents = []   # (token list, tag) pairs, one per pattern
# NOTE(review): not referenced by any code visible in this notebook excerpt;
# kept so that anything relying on the name keeps working.
ignore_words = ['?', '!', '.', ',']

# Read the intents definition. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('intents.json', encoding='utf-8') as intents_file:
    data_file = intents_file.read()
intents = json.loads(data_file)

# -----------------------------------
# 2. Монгол хэлэнд зориулсан токенчлол функц
# -----------------------------------
def simple_mn_tokenize(sentence):
    """Lower-case ``sentence`` and return its word and number tokens as a list.

    A token is a maximal run of word characters, so punctuation and
    whitespace never appear in the result.
    """
    lowered = sentence.lower()
    token_pattern = re.compile(r'\b\w+\b')
    return token_pattern.findall(lowered)

# -----------------------------------
# 3. Өгөгдлийг боловсруулах
# -----------------------------------
# Tokenize every pattern, growing the vocabulary and recording one
# (tokens, tag) training document per pattern.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        w = simple_mn_tokenize(pattern)
        words.extend(w)
        documents.append((w, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# De-duplicate and sort so the feature and label order is stable between runs.
words = sorted(set(words))
classes = sorted(set(classes))

print(len(documents), "documents (patterns)")
print(len(classes), "classes (intents)", classes)
print(len(words), "unique words", words[:10])

# Persist the vocabulary and label list. The context managers make sure the
# files are flushed and closed before anything else tries to read them.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

# -----------------------------------
# 4. Сургалтын өгөгдөл үүсгэх (Bag of Words)
# -----------------------------------
# All-zero label template; every sample copies it and switches on its class slot.
output_empty = [0] * len(classes)
training = []

for pattern_words, tag in documents:
    # Binary bag of words: 1 where the vocabulary word occurs in this pattern.
    bag = [1 if vocab_word in pattern_words else 0 for vocab_word in words]

    # One-hot encode the intent tag.
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# Shuffle the samples, then split them into feature and label matrices.
random.shuffle(training)
training = np.array(training, dtype=object)
feature_column = training[:, 0]
label_column = training[:, 1]
train_x = np.array(list(feature_column))
train_y = np.array(list(label_column))

print("\nTraining data created")

# -----------------------------------
# 5. Загварыг үүсгэх
# -----------------------------------
# Feed-forward classifier: bag-of-words vector in, softmax over intent classes out.
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

# `decay` is deliberately not passed. Current Keras optimizers no longer
# support it: Keras 3 ignores it with a warning, and tf.keras 2.11+ rejects it.
# So the previous `decay=1e-6` had no effect on training.
# NOTE(review): if learning-rate decay is actually wanted, pass a
# LearningRateSchedule as `learning_rate` instead -- confirm intent.
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# Train on the full pattern set, then persist the model.
hist = model.fit(train_x, train_y, epochs=200, batch_size=5, verbose=1)
model.save('chatbot_model.h5')

print("\n✅ Загвар амжилттай сургагдаж, chatbot_model.h5 нэрээр хадгалагдлаа.")


66 documents (patterns)
23 classes (intents) ['1201_хуралдай', '1203_ван_хаантай_дайсан_болсон', '1204_наймантай_тулалдсан', '1206_их_хуралдай', '13_жигүүрт', 'алтан_ургийн_тангараг', 'балжун_арлын_хуралдай', 'боорчи_зэв', 'бэлгүтэй', 'бөртэ_хулгайлагдсан', 'ван_хаан', 'есүхэй_баатар', 'жамуха', 'зургаан_түмэн', 'мнт_хэн_бичсэн', 'мнт_хэчнээн_зүйлтэй', 'тайчуудын_хулгай', 'тэмүүжин_төрсөн_явдал', 'халхын_голын_тулаан', 'хуйлдар_тулалдаан', 'чигис_хаан_цол', 'шихи_хутуг', 'өөлэн_эх']
156 unique words ['1196', '1201', '1202', '1206', '13', '95', 'аав', 'аавыг', 'аврах', 'авсан']

Training data created
Epoch 1/200
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0311 - loss: 3.2202      
Epoch 2/200
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1216 - loss: 3.1172 
Epoch 3/200
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1903 - loss: 3.0793 
Epoch 4/200
[1m14/14[0m [




✅ Загвар амжилттай сургагдаж, chatbot_model.h5 нэрээр хадгалагдлаа.


In [1]:
# -*- coding: utf-8 -*-
import json, re, math
from collections import defaultdict
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display, HTML
import ipywidgets as widgets
from datetime import datetime

# -----------------------------
# 1. Монгол хэлний токенчлол
# -----------------------------
def simple_mn_tokenize(text):
    """Return the lower-cased word and number tokens found in ``text``.

    A token is a maximal run of word characters; punctuation and whitespace
    are discarded.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

# -----------------------------
# 2. Өгөгдөл унших
# -----------------------------
with open("intents.json", encoding="utf-8") as f:
    data = json.load(f)

# Flatten the intents into one searchable document per pattern.
documents = []
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        documents.append({"text": pattern, "tag": intent["tag"]})

corpus = [d["text"] for d in documents]
tokenized_corpus = [simple_mn_tokenize(d) for d in corpus]

# -----------------------------
# 3. Inverted index
# -----------------------------
# token -> list of document indices containing it (each document listed once).
inverted_index = defaultdict(list)
for idx, tokens in enumerate(tokenized_corpus):
    for token in set(tokens):
        inverted_index[token].append(idx)

# -----------------------------
# 4. TF-IDF
# -----------------------------
# token_pattern=None: a custom tokenizer is supplied, so the default pattern is
# never used. Leaving it set only makes scikit-learn warn that it is unused.
tfidf_vectorizer = TfidfVectorizer(tokenizer=simple_mn_tokenize, token_pattern=None)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# -----------------------------
# 5. BM25
# -----------------------------
bm25 = BM25Okapi(tokenized_corpus)

# -----------------------------
# 6. Search functions
# -----------------------------
def boolean_search(query):
    """Return indices of the documents that contain every token of ``query``.

    This is a boolean AND over the inverted index. A query with no tokens
    returns an empty list, and so does a query with a token that appears in no
    document. Indices come back in ascending order so that callers computing
    rank-based metrics see a deterministic ordering.
    """
    tokens = simple_mn_tokenize(query)
    if not tokens:
        return []
    # .get() instead of indexing: the defaultdict must not gain empty entries
    # for tokens it has never seen.
    postings = [set(inverted_index.get(token, ())) for token in tokens]
    return sorted(set.intersection(*postings))

def tfidf_search(query, top_k=10):
    """Rank documents against ``query`` using the fitted TF-IDF matrix.

    Each document's score is the product of its TF-IDF row with the
    transformed query vector. Returns ``(indices, scores)`` for the ``top_k``
    highest-scoring documents, best first.
    """
    query_vector = tfidf_vectorizer.transform([query])
    similarities = (tfidf_matrix @ query_vector.T).toarray().flatten()
    best_first = np.argsort(similarities)[::-1]
    top_indices = best_first[:top_k]
    return top_indices, similarities[top_indices]

def bm25_search(query, top_k=10):
    """Rank documents against ``query`` with the BM25 index.

    Returns ``(indices, scores)`` for the ``top_k`` highest-scoring documents,
    best first.
    """
    query_tokens = simple_mn_tokenize(query)
    all_scores = bm25.get_scores(query_tokens)
    best_first = np.argsort(all_scores)[::-1]
    top_indices = best_first[:top_k]
    return top_indices, all_scores[top_indices]

# -----------------------------
# 7. Evaluation metrics
# -----------------------------
def precision_at_k(ranked, relevant, k=10):
    """Return the fraction of the top ``k`` ranked items that are relevant.

    Args:
        ranked: sequence of document ids, best first.
        relevant: collection of relevant document ids.
        k: cutoff; the denominator is always ``k``, even if ``ranked`` is shorter.

    Returns:
        ``hits / k`` as a float, or ``0.0`` when ``k`` is not positive
        (previously ``k=0`` raised ``ZeroDivisionError``).
    """
    if k <= 0:
        return 0.0
    # Set membership keeps the scan linear in k, whatever the size of `relevant`.
    relevant_ids = set(relevant)
    hits = sum(1 for doc_id in ranked[:k] if doc_id in relevant_ids)
    return hits / k

def average_precision(ranked, relevant):
    """Return the average precision of ``ranked`` with respect to ``relevant``.

    Precision is sampled at every rank where a relevant item appears, and the
    samples are summed and divided by the number of relevant items (or by 1
    when there are none).
    """
    found = 0
    precision_total = 0
    rank = 0
    for doc_id in ranked:
        rank += 1
        if doc_id not in relevant:
            continue
        found += 1
        precision_total += found / rank
    denominator = max(len(relevant), 1)
    return precision_total / denominator

def ndcg_at_k(ranked, relevant, k=10):
    """Return the binary-relevance NDCG of the top ``k`` ranked items.

    The gain at 0-based position ``p`` is ``1 / log2(p + 2)`` for relevant
    items. The ideal DCG assumes ``min(len(relevant), k)`` relevant items in
    the leading positions. Returns 0 when the ideal DCG is zero.
    """
    gain = 0
    position = 0
    for doc_id in ranked[:k]:
        if doc_id in relevant:
            gain += 1 / math.log2(position + 2)
        position += 1

    ideal_hits = min(len(relevant), k)
    ideal_gain = sum([1 / math.log2(slot + 2) for slot in range(ideal_hits)])
    if ideal_gain > 0:
        return gain / ideal_gain
    return 0

# -----------------------------
# 8. Jupyter интерфейс
# -----------------------------
# Output area that collects every rendered chat message.
chat_history = widgets.Output()

# Single-line text box the user types questions into.
text_input = widgets.Text(
    description='Та:',
    placeholder='Монголын Нууц Товчооны тухай асуулт...',
    value='',
    disabled=False,
)

# Query-г чат түүхэд нэмэх
def append_user_message(msg):
    """Render the user's message, right-aligned with an HH:MM timestamp, in the chat history.

    The message is HTML-escaped before being interpolated, so text typed by the
    user (for example ``<b>`` or ``<script>``) is shown literally instead of
    being interpreted as markup.
    """
    # Local import: the notebook's import cell does not bring in `html`.
    from html import escape

    now = datetime.now().strftime("%H:%M")
    safe_msg = escape(str(msg))
    markup = f"<div style='text-align:right; margin:5px;'><b>Та:</b> {safe_msg} <span style='font-size:0.7em;color:#888;'>[{now}]</span></div>"
    with chat_history:
        display(HTML(markup))

# Search үр дүнг чатад нэмэх (top_n=2)
def append_results(title, idxs, scores, top_n=2):
    """Render the first ``top_n`` search hits under ``title`` in the chat history.

    Args:
        title: heading shown above the hits (inserted as-is).
        idxs: document indices, best first.
        scores: scores aligned position-by-position with ``idxs``.
        top_n: maximum number of hits to show.

    Document text is HTML-escaped so that patterns containing characters such
    as ``<`` or ``&`` are displayed literally.
    """
    # Local import: the notebook's import cell does not bring in `html`.
    from html import escape

    parts = [f"<b>{title}:</b><br>"]
    for rank, idx in enumerate(idxs[:top_n]):
        doc_text = escape(documents[idx]['text'])
        parts.append(f"{rank+1}. ({scores[rank]:.3f}) {doc_text}<br>")
    with chat_history:
        display(HTML("".join(parts)))

# Evaluation харуулах (top_k=2)
def append_eval(title, ranked, relevant, top_k=2):
    """Compute P@k, AP and NDCG@k for ``ranked`` and render them in the chat history.

    ``top_k`` is the cutoff for precision and NDCG. Average precision is
    computed over the whole ``ranked`` sequence.
    """
    precision = precision_at_k(ranked, relevant, k=top_k)
    avg_precision = average_precision(ranked, relevant)
    ndcg_score = ndcg_at_k(ranked, relevant, k=top_k)
    summary = (
        f"<b>{title} үнэлгээ (top {top_k}):</b> "
        f"P@{top_k}={precision:.2f}, AP={avg_precision:.2f}, NDCG={ndcg_score:.2f}<br>"
    )
    with chat_history:
        display(HTML(summary))

# Submit event
def on_submit(sender):
    """Handle a submitted query: echo it, run all three searches, and show results and metrics.

    Surrounding whitespace is stripped first, so a blank or whitespace-only
    submission is ignored instead of being searched for. The input box is
    always cleared.

    Relevance for the metrics is approximated as follows: every document whose
    text contains the whole query as a substring (case-insensitive) marks its
    tag as relevant, and all documents carrying a relevant tag count as relevant.
    """
    query = text_input.value.strip()
    text_input.value = ''
    if not query:
        return
    append_user_message(query)

    # Relevant docs (simple tag-based example).
    needle = query.lower()
    relevant_tags = {d["tag"] for d in documents if needle in d["text"].lower()}
    relevant_docs = [i for i, d in enumerate(documents) if d["tag"] in relevant_tags]

    # Boolean: matches have no score, so each hit is shown with a score of 1.
    b_idx = boolean_search(query)
    append_results("Boolean Search Top 2", b_idx, [1]*len(b_idx))
    append_eval("Boolean", b_idx, relevant_docs, top_k=2)

    # TF-IDF
    tf_idx, tf_scores = tfidf_search(query)
    append_results("TF-IDF Top 2", tf_idx, tf_scores, top_n=2)
    append_eval("TF-IDF", tf_idx, relevant_docs, top_k=2)

    # BM25
    bm_idx, bm_scores = bm25_search(query)
    append_results("BM25 Top 2", bm_idx, bm_scores, top_n=2)
    append_eval("BM25", bm_idx, relevant_docs, top_k=2)
# Run on_submit whenever Enter is pressed in the text box.
# NOTE(review): Text.on_submit is reported as deprecated in recent ipywidgets
# releases -- confirm against the installed version.
text_input.on_submit(on_submit)

# Layout: title, scrollable history panel, then the input box.
title_widget = widgets.HTML("<h3>Монголын Нууц Товчоо IR Chatbot</h3>")
history_layout = widgets.Layout(
    height='400px',
    overflow='auto',
    border='1px solid #ccc',
    padding='10px',
)
history_panel = widgets.VBox([chat_history], layout=history_layout)
frame_layout = widgets.Layout(
    width='600px',
    border='2px solid #007bff',
    padding='10px',
    border_radius='10px',
)
chat_box = widgets.VBox([title_widget, history_panel, text_input], layout=frame_layout)

display(chat_box)

# Initial greeting shown in the history panel.
greeting = HTML("<h4 style='color:#007bff;'>Сайн байна уу! Асуулт бичээд Enter дарна уу.</h4>")
with chat_history:
    display(greeting)


FileNotFoundError: [Errno 2] No such file or directory: 'intents.json'