In [None]:
# AI Constitution Assistant in Jupyter Notebook
# Kazakh, Russian, English support | KazLLM + Ollama + LangChain

# 📦 STEP 1: Imports and Environment
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.chat_models import ChatOllama
from langchain.chains import RetrievalQA
from langdetect import detect
import torch
import os


In [None]:
# ✅ STEP 2: Language Detection
def detect_language(text):
    """Map the detected language of ``text`` to a supported language name.

    Args:
        text: The user's question.

    Returns:
        'kazakh', 'russian' or 'english' when langdetect reports 'kk', 'ru'
        or 'en' respectively. 'unknown' for any other code, for blank or
        non-string input, and when langdetect cannot detect any language.
    """
    # langdetect raises on input with no usable features (empty string,
    # digits or punctuation only). Report those as unsupported rather than
    # letting the exception escape to the caller.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'

    # Function-scope import: langdetect is already a dependency of this
    # notebook, only the exception class is new here.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        lang_code = detect(text)
    except LangDetectException:
        return 'unknown'

    return {
        'kk': 'kazakh',
        'ru': 'russian',
        'en': 'english'
    }.get(lang_code, 'unknown')

# ✅ STEP 3: Load KazLLM from HuggingFace (once)
def load_kazllm(model_id="kaznlp/kazllm-7b"):
    """Load the KazLLM tokenizer and model and wrap them in a generation pipeline.

    Args:
        model_id: HuggingFace model identifier to load. Defaults to the
            KazLLM checkpoint this notebook was written for.

    Returns:
        A transformers "text-generation" pipeline built from the loaded
        model and tokenizer.
    """
    # Half precision only when a CUDA device is available; full precision otherwise.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # device_map is given here, where the weights are actually loaded.
    # NOTE(review): pipeline() documents device_map as a shortcut forwarded to
    # from_pretrained, so passing it next to an already-built model (as before)
    # appears to have no effect and would leave fp16 weights unplaced.
    # device_map="auto" relies on the `accelerate` package being installed.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

# Build the KazLLM pipeline when this cell runs; get_answer reuses this object.
kazllm_pipe = load_kazllm()

# ✅ STEP 4: Set file paths
# Maps each supported language name to its constitution text file.
files = dict(
    kazakh='constitution_kz.txt',
    russian='constitution_ru.txt',
    english='constitution_en.txt',
)

In [None]:
# ✅ STEP 5: Function to load and split documents
def get_chunks(file_path):
    """Load a UTF-8 text file and split it into overlapping chunks.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The result of splitting the loaded documents with a
        RecursiveCharacterTextSplitter (chunk_size=500, chunk_overlap=50).
    """
    documents = TextLoader(file_path, encoding='utf-8').load()
    return RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    ).split_documents(documents)

# ✅ STEP 6: Build (and cache) one Chroma retriever per language
# Maps language name -> retriever so each document is embedded once per kernel session.
retriever_cache = {}

def get_retriever(lang):
    """Return the retriever for ``lang``, building and caching it on first use.

    Args:
        lang: A key of ``files`` ('kazakh', 'russian' or 'english').

    Returns:
        The retriever created from a Chroma index over that language's chunks.

    Raises:
        KeyError: If ``lang`` has no entry in ``files`` and is not cached.
    """
    if lang not in retriever_cache:
        chunks = get_chunks(files[lang])
        embeddings = OllamaEmbeddings(model="nomic-embed-text")
        # Give each language its own collection. Without an explicit name all
        # languages go to Chroma's default collection, and in-memory Chroma
        # clients can share collections within one process, which would mix
        # the three translations in a single index.
        db = Chroma.from_documents(
            chunks,
            embedding=embeddings,
            collection_name=f"constitution_{lang}",
        )
        retriever_cache[lang] = db.as_retriever()
    return retriever_cache[lang]

# ✅ STEP 7: Answer a question
def get_answer(question):
    """Answer a question about the constitution in the question's own language.

    Kazakh questions are answered by the KazLLM pipeline using retrieved
    chunks as context; Russian and English questions go through a RetrievalQA
    chain backed by the Ollama "mistral" chat model.

    Args:
        question: The user's question text.

    Returns:
        The answer string, or a fixed "unsupported language" message when the
        detected language is not Kazakh, Russian or English.
    """
    lang = detect_language(question)
    if lang == 'unknown':
        return "⛔ Unsupported language. Please use Kazakh, Russian or English."
    print(f"🌐 Language detected: {lang}")
    retriever = get_retriever(lang)

    if lang != 'kazakh':
        # RetrievalQA performs its own retrieval, so no manual lookup is done
        # on this path (it was previously computed and then discarded).
        llm = ChatOllama(model="mistral")
        qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
        return qa.run(question)

    context_docs = retriever.get_relevant_documents(question)
    context_text = "\n".join(doc.page_content for doc in context_docs)
    prompt = f"{context_text}\nСұрақ: {question}\nЖауап:"
    # return_full_text=False makes the pipeline return only the continuation,
    # so the answer no longer has to be cut out by splitting on "Жауап:",
    # which picks the wrong segment when that marker appears in the generation.
    result = kazllm_pipe(
        prompt,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )
    return result[0]['generated_text'].strip()

# ✅ STEP 8: Ask a question
example_question = "Қазақстан Республикасының Президентінің өкілеттіктері қандай?"
response = get_answer(example_question)
# One call prints the header and the answer, each followed by a newline.
print("\n📌 Жауап / Ответ / Answer:\n", response, sep="\n")
