# In [None]:  (Jupyter cell marker left over from the notebook export)
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import torch
from transformers import pipeline
import requests
import json
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

class SimpleRAG:
    """Minimal retrieval-augmented generation (RAG) pipeline.

    Documents are embedded with a SentenceTransformer model and stored in a
    FAISS inner-product index over L2-normalised vectors, so the scores are
    cosine similarities.  Answers are produced by a Hugging Face
    ``text-generation`` pipeline; if that model cannot be loaded, the class
    falls back to returning an excerpt of the best matching document.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2",
                 llm_model="mistralai/Mistral-7B-Instruct-v0.2"):
        """Load the embedding model and, if possible, the generative model.

        What to tune here:
        - model_name: another embedding model
          (e.g. "sentence-transformers/all-mpnet-base-v2").
        - llm_model: another generative model
          (e.g. "google/flan-t5-large", "facebook/bart-large").
        - device_map below: "cuda" if a GPU is available, or "cpu" to force
          CPU execution.
        """
        # Embedding model.  Without a GPU (or with little memory) it can be
        # forced onto the CPU: SentenceTransformer(model_name, device='cpu').
        self.embedding_model = SentenceTransformer(model_name)

        # Generative model.  Smaller models can be used instead, or generation
        # can be dropped entirely; the attribute is always defined so that a
        # failed load never leads to an AttributeError later on.
        self.generator = None
        try:
            self.generator = pipeline(
                "text-generation",
                model=llm_model,
                torch_dtype=torch.float16,  # switch to torch.float32 without a GPU
                device_map="auto",          # switch to "cpu" on GPU problems
                max_length=512
            )
            self.use_generation = True
        except Exception as e:
            # Deliberate best-effort: retrieval still works without the LLM.
            print(f"‚ö†Ô∏è –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å –≥–µ–Ω–µ—Ä–∞—Ç–∏–≤–Ω—É—é –º–æ–¥–µ–ª—å: {e}")
            print("‚ö†Ô∏è –ë—É–¥—É –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å—Å—è —Ç–æ–ª—å–∫–æ –ø–æ–∏—Å–∫ –ø–æ –¥–æ–∫—É–º–µ–Ω—Ç–∞–º")
            self.use_generation = False

        # FAISS search index (built by build_index) and the indexed texts.
        self.index = None
        self.documents = []

    def load_documents(self, documents: List[str]):
        """Store the documents that will be indexed.

        The texts are stored as given: ``preprocess_text`` and ``chunk_text``
        are available on this class but are not applied automatically.

        Any previously built index is discarded, because FAISS result ids are
        positions in ``self.documents`` and would point at the wrong texts
        once the document list is replaced.  Call ``build_index`` again after
        loading.
        """
        # Copy so that later mutation of the caller's list cannot desynchronise
        # the stored texts from the index positions.
        self.documents = list(documents)
        self.index = None
        print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")

    def preprocess_text(self, text: str) -> str:
        """Collapse every run of whitespace into one space and trim the ends.

        Further cleaning (HTML stripping, lower-casing, stop-word removal)
        can be added here if the data needs it.
        """
        import re
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split ``text`` into overlapping chunks of whitespace-separated words.

        Args:
            text: the text to split.
            chunk_size: maximum number of words per chunk; must be positive.
            overlap: number of words shared by consecutive chunks; must
                satisfy ``0 <= overlap < chunk_size``.

        Returns:
            The list of chunks (empty when ``text`` contains no words).

        Raises:
            ValueError: if ``chunk_size``/``overlap`` are out of range.  A
                non-positive stride would otherwise either crash inside
                ``range`` or silently produce no chunks at all.
        """
        if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
            raise ValueError(
                "chunk_size must be positive and 0 <= overlap < chunk_size "
                f"(got chunk_size={chunk_size}, overlap={overlap})"
            )

        words = text.split()
        chunks = []

        for i in range(0, len(words), chunk_size - overlap):
            chunks.append(' '.join(words[i:i + chunk_size]))
            # Stop once a chunk has reached the end of the text, so the tail
            # is not emitted again as a shorter, fully overlapping chunk.
            if i + chunk_size >= len(words):
                break

        return chunks

    def build_index(self):
        """Embed all loaded documents and build the FAISS search index.

        Collections below 10000 documents use an exact ``IndexFlatIP``;
        larger ones use an ``IndexIVFFlat`` with 100 clusters.

        Raises:
            ValueError: if no documents have been loaded.
        """
        if not self.documents:
            raise ValueError("–°–Ω–∞—á–∞–ª–∞ –∑–∞–≥—Ä—É–∑–∏—Ç–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã")

        print("–°–æ–∑–¥–∞–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤...")
        # FAISS works on contiguous float32 matrices.
        embeddings = np.ascontiguousarray(
            self.embedding_model.encode(self.documents), dtype=np.float32
        )

        # Normalise BEFORE training/adding: inner product on unit vectors is
        # cosine similarity, and the IVF centroids must be learned on the same
        # vectors that are later added and searched.
        faiss.normalize_L2(embeddings)
        dimension = embeddings.shape[1]

        if len(self.documents) < 10000:
            # Small collection: exact search.
            index = faiss.IndexFlatIP(dimension)
        else:
            # Large collection: approximate (clustered) search.
            quantizer = faiss.IndexFlatIP(dimension)
            nlist = 100  # number of clusters
            # The metric must be given explicitly; IndexIVFFlat defaults to L2.
            index = faiss.IndexIVFFlat(
                quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT
            )
            index.train(embeddings)

        index.add(embeddings)
        # Publish only a completely built index.
        self.index = index

        print(f"–ü–æ—Å—Ç—Ä–æ–µ–Ω –∏–Ω–¥–µ–∫—Å —Å {self.index.ntotal} –¥–æ–∫—É–º–µ–Ω—Ç–∞–º–∏")

    def retrieve(self, query: str, k: int = 3, score_threshold: float = 0.0) -> List[Tuple[str, float]]:
        """Return the documents most similar to ``query``.

        Args:
            query: the search text.
            k: maximum number of documents to return.
            score_threshold: minimum similarity score a hit must reach.

        Returns:
            ``(document, score)`` tuples in the order FAISS returned them.

        Raises:
            ValueError: if the index has not been built yet.
        """
        if self.index is None:
            raise ValueError("–°–Ω–∞—á–∞–ª–∞ –ø–æ—Å—Ç—Ä–æ–π—Ç–µ –∏–Ω–¥–µ–∫—Å")

        if k <= 0:
            return []

        query_embedding = np.ascontiguousarray(
            self.embedding_model.encode([query]), dtype=np.float32
        )
        faiss.normalize_L2(query_embedding)

        scores, indices = self.index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads missing neighbours with id -1; without the lower
            # bound that id would silently select the LAST document.
            if 0 <= idx < len(self.documents) and score >= score_threshold:
                results.append((self.documents[int(idx)], float(score)))

        return results

    def generate_answer(self, query: str, context: List[str]) -> str:
        """Generate an answer to ``query`` from the retrieved ``context``.

        When generation is unavailable or fails, an excerpt of the first
        context document is returned instead (or a fixed message when the
        context is empty).  The prompt template and the sampling parameters
        below can be adapted to the task.
        """
        if not self.use_generation:
            # No generative model: fall back to the best document.
            if context:
                return f"–ù–∞ –æ—Å–Ω–æ–≤–µ –Ω–∞–π–¥–µ–Ω–Ω–æ–π –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏: {context[0][:200]}..."
            return "–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –Ω–µ –Ω–∞–π–¥–µ–Ω–∞"

        context_text = "\n".join(f"- {doc}" for doc in context)

        # Prompt template: instruction, context bullets, question, answer cue.
        prompt = f"""–ù–∞ –æ—Å–Ω–æ–≤–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω–æ–π –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –æ—Ç–≤–µ—Ç—å –Ω–∞ –≤–æ–ø—Ä–æ—Å.
–ï—Å–ª–∏ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –Ω–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ, —Å–∫–∞–∂–∏ –æ–± —ç—Ç–æ–º.

–ö–æ–Ω—Ç–µ–∫—Å—Ç:
{context_text}

–í–æ–ø—Ä–æ—Å: {query}

–û—Ç–≤–µ—Ç: """

        try:
            response = self.generator(
                prompt,
                max_new_tokens=256,      # more/fewer generated tokens
                do_sample=True,          # False for deterministic output
                temperature=0.7,         # 0.1-1.0 (lower = more deterministic)
                top_p=0.9,               # 0.5-1.0 (nucleus sampling)
                pad_token_id=self.generator.tokenizer.eos_token_id
            )

            generated_text = response[0]['generated_text']
            # The pipeline output normally starts with the prompt itself.
            # Cutting that exact prefix keeps the whole completion even when
            # the completion contains the answer marker again.
            if generated_text.startswith(prompt):
                return generated_text[len(prompt):].strip()
            # Otherwise fall back to the text after the last answer marker.
            if "–û—Ç–≤–µ—Ç: " in generated_text:
                return generated_text.split("–û—Ç–≤–µ—Ç: ")[-1].strip()
            return generated_text.strip()

        except Exception as e:
            # Best-effort: a generation failure degrades to a document excerpt.
            print(f"‚ö†Ô∏è –û—à–∏–±–∫–∞ –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏: {e}")
            if context:
                return f"–ù–∞ –æ—Å–Ω–æ–≤–µ –Ω–∞–π–¥–µ–Ω–Ω–æ–π –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏: {context[0][:300]}..."
            return "–ù–µ —É–¥–∞–ª–æ—Å—å —Å–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞—Ç—å –æ—Ç–≤–µ—Ç"

    def ask(self, query: str, k: int = 3, include_sources: bool = True,
            score_threshold: float = 0.0) -> Dict:
        """Run the full RAG pipeline: retrieval followed by generation.

        Args:
            query: the user question.
            k: number of documents to retrieve.
            include_sources: whether to add the retrieved ``(document, score)``
                tuples to the result under ``"sources"``.
            score_threshold: minimum similarity passed on to ``retrieve``.

        Returns:
            A dict with ``"answer"`` and ``"query"``, plus ``"sources"`` when
            requested.  When nothing is retrieved, ``"sources"`` is an empty
            list and ``"answer"`` is a fixed "nothing found" message.
        """
        retrieved_docs = self.retrieve(query, k, score_threshold)

        if not retrieved_docs:
            return {
                "answer": "–ù–µ –Ω–∞–π–¥–µ–Ω–æ —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–æ–π –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏",
                "sources": [],
                "query": query
            }

        # Only the texts go into the prompt; the scores stay in the sources.
        context_texts = [doc for doc, _ in retrieved_docs]
        answer = self.generate_answer(query, context_texts)

        result = {
            "answer": answer,
            "query": query
        }

        if include_sources:
            result["sources"] = retrieved_docs

        return result

# ⚠️ FUNCTION FOR LOADING DIFFERENT TYPES OF DATA
def load_data_from_source(source_type: str, source_path: str) -> List[str]:
    """Load documents from a file and return them as a list of strings.

    Supported ``source_type`` values:
    - "txt":  UTF-8 text file; documents are separated by blank lines and
      empty documents are dropped.
    - "csv":  the ``text`` column of the file, missing values dropped.
    - "json": a top-level JSON list; every item is converted with ``str``.

    Loading is best-effort: any error while reading is printed and an empty
    list is returned.  An empty list is also returned for an unsupported
    ``source_type``, a CSV without a ``text`` column, or a JSON document that
    is not a list.  Other formats (pdf, docx, databases, ...) can be added
    as further branches.
    """
    documents: List[str] = []

    if source_type == "txt":
        try:
            with open(source_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # A blank line separates two documents.
            documents = [doc.strip() for doc in content.split('\n\n') if doc.strip()]
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ —Ñ–∞–π–ª–∞ {source_path}: {e}")

    elif source_type == "csv":
        try:
            df = pd.read_csv(source_path)
            if 'text' in df.columns:
                # pandas infers numeric dtypes for numeric-looking columns;
                # cast so the declared List[str] contract holds and callers
                # can slice/concatenate every document.
                documents = df['text'].dropna().astype(str).tolist()
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ CSV {source_path}: {e}")

    elif source_type == "json":
        try:
            with open(source_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Only a top-level list is understood; adapt for other layouts.
            if isinstance(data, list):
                documents = [str(item) for item in data]
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ JSON {source_path}: {e}")

    else:
        print(f"‚ö†Ô∏è –ù–µ–ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ–º—ã–π —Ç–∏–ø –∏—Å—Ç–æ—á–Ω–∏–∫–∞: {source_type}")

    return documents

# ⚠️ CONFIGURATION - CHANGE PARAMETERS HERE
class RAGConfig:
    """Tunable settings of the RAG system, grouped in one place."""

    # --- Models -----------------------------------------------------------
    # Embedding model id (alternative: "all-mpnet-base-v2").
    EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Generative model id (alternative: "facebook/bart-large").
    LLM_MODEL: str = "mistralai/Mistral-7B-Instruct-v0.2"

    # --- Retrieval --------------------------------------------------------
    # Number of documents returned per query.
    TOP_K: int = 3
    # Minimum relevance score for a hit.
    SCORE_THRESHOLD: float = 0.0

    # --- Generation -------------------------------------------------------
    MAX_NEW_TOKENS: int = 256
    TEMPERATURE: float = 0.7
    DO_SAMPLE: bool = True

    # --- Data preparation -------------------------------------------------
    # Chunk size used for chunking.
    CHUNK_SIZE: int = 500
    # Overlap between consecutive chunks.
    CHUNK_OVERLAP: int = 50

def main():
    """Run the demo end to end; adapt the data and the queries to your task.

    Steps: build the RAG system from ``RAGConfig``, load and index the sample
    documents, print an answer (with sources) for every test query, then
    write the predictions file via ``generate_submission``.
    """
    # Step 1: system configured from RAGConfig.
    rag = SimpleRAG(
        llm_model=RAGConfig.LLM_MODEL,
        model_name=RAGConfig.EMBEDDING_MODEL,
    )

    # Step 2: data.  Option A (used here) is an in-memory list of texts;
    # add your own documents to it.
    # Option B: load from a file with load_data_from_source, e.g.
    #   load_data_from_source("txt", "data/my_documents.txt")
    #   load_data_from_source("csv", "data/documents.csv")
    #   load_data_from_source("json", "data/documents.json")
    # Option C: load from a database or an API (loader still to be written).
    documents = [
        "–ò—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω—ã–π –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç - —ç—Ç–æ –æ–±–ª–∞—Å—Ç—å –∫–æ–º–ø—å—é—Ç–µ—Ä–Ω—ã—Ö –Ω–∞—É–∫...",
        "–ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ —è–≤–ª—è–µ—Ç—Å—è –ø–æ–¥—Ä–∞–∑–¥–µ–ª–æ–º –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç–∞...",
    ]
    rag.load_documents(documents)
    rag.build_index()

    # Step 3: smoke test; add your own queries to the list.
    queries = [
        "–ß—Ç–æ —Ç–∞–∫–æ–µ –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω—ã–π –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç?",
        "–ö–∞–∫–∏–µ –µ—Å—Ç—å –≤–∏–¥—ã –º–∞—à–∏–Ω–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è?",
    ]

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print(heavy_rule)
    print("–¢–ï–°–¢–ò–†–û–í–ê–ù–ò–ï RAG –°–ò–°–¢–ï–ú–´")
    print(heavy_rule)

    for query in queries:
        print(f"\nüìù –í–æ–ø—Ä–æ—Å: {query}")
        result = rag.ask(query, k=RAGConfig.TOP_K)
        print(f"ü§ñ –û—Ç–≤–µ—Ç: {result['answer']}")

        if 'sources' in result:
            print("üìö –ò—Å—Ç–æ—á–Ω–∏–∫–∏:")
            for i, (source, score) in enumerate(result['sources']):
                print(f"   {i+1}. [—Å—Ö–æ–¥—Å—Ç–≤–æ: {score:.3f}] {source[:100]}...")

        print(light_rule)

    # Step 4: predictions file; adjust to the required submission format.
    generate_submission(queries, rag, output_file="my_rag_predictions.json")

def generate_submission(test_queries: List[str], rag_system: SimpleRAG,
                       output_file: str = "rag_predictions.json"):
    """Answer every test query and save the predictions as a JSON file.

    Each prediction holds a 1-based ``id``, the ``query``, the generated
    ``answer``, the retrieved ``sources`` (text cut to 200 characters plus
    "...", score, position in the result list) and a ``timestamp``.  Adjust
    the structure, the fields and the formatting to the required format.

    Returns:
        The list of prediction dicts that was written to ``output_file``.
    """
    predictions = []

    for i, query in enumerate(test_queries):
        print(f"–û–±—Ä–∞–±–æ—Ç–∫–∞ –∑–∞–ø—Ä–æ—Å–∞ {i+1}/{len(test_queries)}: {query}")

        result = rag_system.ask(query, k=RAGConfig.TOP_K)

        # One entry per retrieved document; "source_id" is only the rank in
        # the result list (replace with a real source id if one exists).
        source_entries = []
        for j, (doc, score) in enumerate(result.get("sources", [])):
            entry = {
                "text": doc[:200] + "...",
                "score": float(score),
                "source_id": j,
            }
            source_entries.append(entry)

        predictions.append({
            "id": i + 1,
            "query": query,
            "answer": result["answer"],
            "sources": source_entries,
            "timestamp": pd.Timestamp.now().isoformat(),  # optional field
        })

    # Keep non-ASCII text readable in the file instead of \u escapes.
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(predictions, out, ensure_ascii=False, indent=2)

    print(f"‚úÖ –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ —Ñ–∞–π–ª: {output_file}")
    print(f"üìä –û–±—Ä–∞–±–æ—Ç–∞–Ω–æ –∑–∞–ø—Ä–æ—Å–æ–≤: {len(predictions)}")

    return predictions

if __name__ == "__main__":
    # Script entry point: enable/disable the different parts here.

    # Run the full demonstration.
    main()

    # Or run only the tests.
    # NOTE(review): test_my_rag is not defined in the visible source.
    # test_my_rag()

    # Or run only the submission generation.
    # NOTE(review): generate_my_submission is not defined in the visible source.
    # generate_my_submission()