In [20]:
import json
import os
import uuid
from typing import List, Dict, Tuple, Optional, Any
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue, Range
import rank_bm25
from rank_bm25 import BM25Okapi
from mistralai import Mistral

import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

In [36]:
# Configuration
# Name of the sentence-transformers model passed to SentenceTransformer().
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Qdrant collection that stores the dense embedding vectors.
QDRANT_COLLECTION_EMBEDDINGS = "telegram_embeddings"
# Qdrant collection that stores the payloads backing the BM25 index
# (its points carry only a dummy 1-dimensional vector).
QDRANT_COLLECTION_BM25 = "telegram_bm25"
# Vector size of the embeddings collection; presumably must equal the output
# dimension of EMBEDDING_MODEL - verify when changing the model.
VECTOR_SIZE = 384

class TelegramRAGSystem:
    """Hybrid retrieval over Telegram messages: dense embeddings plus BM25.

    Documents are stored twice in local (on-disk) Qdrant instances: once with
    their embedding vector and once with a dummy vector, the latter serving
    only as persistent payload storage for the in-memory BM25 index. Search
    results of both retrievers are merged with Reciprocal Rank Fusion.

    Relies on the module-level constants EMBEDDING_MODEL,
    QDRANT_COLLECTION_EMBEDDINGS, QDRANT_COLLECTION_BM25 and VECTOR_SIZE.
    """

    # Page size used when scrolling a Qdrant collection; _scroll_all keeps
    # requesting pages until the collection is exhausted.
    _SCROLL_PAGE_SIZE = 10000

    # Words made of Latin/Cyrillic letters and digits, optionally joined by
    # hyphens or apostrophes. Non-ASCII characters are written as \u escapes
    # so the pattern survives any re-encoding of this source file.
    # Compiled once instead of on every tokenization call.
    word_pattern = re.compile(
        r"[a-zA-Z\u0430-\u044f\u0410-\u042f\u0451\u04010-9]+"
        r"(?:[-'\u2019][a-zA-Z\u0430-\u044f\u0410-\u042f\u0451\u04010-9]+)*"
    )

    def __init__(self, data_dir: str = "./rag_data", mistral_api_key: Optional[str] = None):
        """Open both Qdrant stores, create collections and load the BM25 index.

        Args:
            data_dir: directory holding the two local Qdrant databases.
            mistral_api_key: optional key; when given, a Mistral client is
                created and exposed as ``self.mistral_client``.
        """
        self.data_dir = data_dir
        self.embedding_model = SentenceTransformer(EMBEDDING_MODEL)
        # Created once; must exist before _load_bm25_data tokenizes anything.
        self.stemmer = SnowballStemmer('russian')

        # Qdrant initialisation (local, file-backed mode)
        self.qdrant_embeddings = QdrantClient(path=os.path.join(data_dir, "embeddings_db"))
        self.qdrant_bm25 = QdrantClient(path=os.path.join(data_dir, "bm25_db"))

        # BM25 state (kept in memory, rebuilt from Qdrant on start-up)
        self.bm25 = None
        self.bm25_documents = []  # tokenized documents, parallel to bm25_doc_ids
        self.bm25_doc_ids = []  # original document IDs
        self.bm25_uuid_map = {}  # Qdrant point UUID -> original ID

        # Mistral client
        self.mistral_client = Mistral(api_key=mistral_api_key) if mistral_api_key else None

        self._initialize_collections()
        self._load_bm25_data()

    def _generate_uuid(self, original_id: str) -> str:
        """Return a deterministic UUID string derived from the original ID.

        The ID is converted with ``str()`` first, so integer IDs are accepted
        and map to the same UUID as their string form.
        """
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, str(original_id)))

    def _initialize_collections(self):
        """Create the two Qdrant collections if they do not exist yet."""
        # Collection for embeddings
        if not self.qdrant_embeddings.collection_exists(QDRANT_COLLECTION_EMBEDDINGS):
            self.qdrant_embeddings.create_collection(
                collection_name=QDRANT_COLLECTION_EMBEDDINGS,
                vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
            )

        # Collection for BM25 payloads (vector is a 1-d placeholder)
        if not self.qdrant_bm25.collection_exists(QDRANT_COLLECTION_BM25):
            self.qdrant_bm25.create_collection(
                collection_name=QDRANT_COLLECTION_BM25,
                vectors_config=VectorParams(size=1, distance=Distance.COSINE)
            )

    def _scroll_all(self, client, collection_name: str, scroll_filter=None) -> list:
        """Return every point of a collection matching the filter, page by page."""
        points = []
        offset = None
        while True:
            page, offset = client.scroll(
                collection_name=collection_name,
                scroll_filter=scroll_filter,
                limit=self._SCROLL_PAGE_SIZE,
                offset=offset,
            )
            points.extend(page)
            # A None offset marks the last page; the empty-page check only
            # guards against looping forever on an unexpected response.
            if offset is None or not page:
                return points

    def _load_bm25_data(self):
        """Rebuild the in-memory BM25 state from the BM25 Qdrant collection.

        All previous in-memory state is replaced, so an empty collection
        (e.g. after deleting everything) correctly yields no index.
        """
        documents = []
        doc_ids = []
        uuid_map = {}

        for point in self._scroll_all(self.qdrant_bm25, QDRANT_COLLECTION_BM25):
            payload = point.payload
            if payload and 'text' in payload and 'original_id' in payload:
                documents.append(self._tokenize_text(payload['text']))
                doc_ids.append(payload['original_id'])
                uuid_map[point.id] = payload['original_id']

        self.bm25_documents = documents
        self.bm25_doc_ids = doc_ids
        self.bm25_uuid_map = uuid_map
        # BM25Okapi cannot be built from an empty corpus.
        self.bm25 = BM25Okapi(documents) if documents else None

    def _tokenize_text(self, text: str) -> List[str]:
        """Tokenize text for BM25.

        The text is lower-cased, split into words with ``word_pattern``,
        each word is stemmed, and stems of two characters or fewer are
        dropped.

        Args:
            text: input text to tokenize.
        Returns:
            List[str]: list of tokens (word stems).
        """
        words = self.word_pattern.findall(text.lower())
        stem = self.stemmer.stem
        return [token for token in map(stem, words) if len(token) > 2]

    def add_documents(self, documents: List[Dict[str, Any]]):
        """Add (or overwrite) documents in both stores and refresh BM25.

        Args:
            documents: dicts with the keys 'id', 'text', 'user_id' and
                'timestamp'. A document whose 'id' is already known replaces
                the stored one instead of being indexed twice.
        """
        if not documents:
            return

        # One batched encode call instead of one model call per document.
        embeddings = self.embedding_model.encode([doc['text'] for doc in documents])

        points_embeddings = []
        points_bm25 = []
        pending = []  # (original_id, point uuid, tokens), applied after the upserts

        for doc, embedding in zip(documents, embeddings):
            doc_uuid = self._generate_uuid(doc['id'])
            payload = {
                'text': doc['text'],
                'user_id': doc['user_id'],
                'timestamp': doc['timestamp'],
                'original_id': doc['id']  # keep the original ID for reverse lookup
            }

            points_embeddings.append(PointStruct(
                id=doc_uuid,
                vector=embedding.tolist(),
                payload=payload
            ))
            points_bm25.append(PointStruct(
                id=doc_uuid,
                vector=[1.0],
                payload=payload
            ))
            pending.append((doc['id'], doc_uuid, self._tokenize_text(doc['text'])))

        # Persist first, so a failed upsert does not leave the in-memory
        # index ahead of the database.
        self.qdrant_embeddings.upsert(
            collection_name=QDRANT_COLLECTION_EMBEDDINGS,
            points=points_embeddings
        )
        self.qdrant_bm25.upsert(
            collection_name=QDRANT_COLLECTION_BM25,
            points=points_bm25
        )

        # Merge into the in-memory corpus, replacing documents whose ID is
        # already present (upsert semantics, no duplicates).
        position_by_id = {}
        for position, doc_id in enumerate(self.bm25_doc_ids):
            position_by_id.setdefault(doc_id, position)
        for doc_id, doc_uuid, tokens in pending:
            position = position_by_id.get(doc_id)
            if position is None:
                position_by_id[doc_id] = len(self.bm25_doc_ids)
                self.bm25_doc_ids.append(doc_id)
                self.bm25_documents.append(tokens)
            else:
                self.bm25_documents[position] = tokens
            self.bm25_uuid_map[doc_uuid] = doc_id

        self.bm25 = BM25Okapi(self.bm25_documents)

    def recalculate_bm25(self):
        """Fully rebuild BM25 from all documents currently stored in Qdrant."""
        self._load_bm25_data()

    def _build_filters(self, user_id: Optional[str] = None,
                      start_timestamp: Optional[float] = None,
                      end_timestamp: Optional[float] = None) -> Optional["Filter"]:
        """Build a Qdrant filter from the optional criteria.

        Returns:
            A ``Filter`` whose conditions must all match, or None when no
            criterion was supplied. A falsy ``user_id`` counts as "not
            supplied"; timestamp bounds are inclusive.
        """
        conditions = []

        if user_id:
            conditions.append(FieldCondition(
                key="user_id",
                match=MatchValue(value=user_id)
            ))

        if start_timestamp is not None or end_timestamp is not None:
            timestamp_range = {}
            if start_timestamp is not None:
                timestamp_range["gte"] = start_timestamp
            if end_timestamp is not None:
                timestamp_range["lte"] = end_timestamp

            conditions.append(FieldCondition(
                key="timestamp",
                range=Range(**timestamp_range)
            ))

        return Filter(must=conditions) if conditions else None

    def _dense_search(self, query_vector: List[float], filters, limit: int) -> list:
        """Return the top scored points of the embedding collection."""
        client = self.qdrant_embeddings
        # QdrantClient.search is deprecated in favour of query_points; use the
        # newer call when the installed client offers it.
        if hasattr(client, "query_points"):
            response = client.query_points(
                collection_name=QDRANT_COLLECTION_EMBEDDINGS,
                query=query_vector,
                query_filter=filters,
                limit=limit,
                with_payload=True,
            )
            return response.points
        return client.search(
            collection_name=QDRANT_COLLECTION_EMBEDDINGS,
            query_vector=query_vector,
            query_filter=filters,
            limit=limit
        )

    def _bm25_search(self, query: str, filters, limit: int) -> List[Tuple[str, float]]:
        """Return up to ``limit`` (original_id, score) pairs with a positive BM25 score."""
        if self.bm25 is None:
            return []

        tokenized_query = self._tokenize_text(query)
        if not tokenized_query:
            return []

        if filters is None:
            # Without a filter the candidate set is the whole corpus, for
            # which an index already exists.
            index = self.bm25
            candidate_ids = self.bm25_doc_ids
        else:
            position_by_id = {}
            for position, doc_id in enumerate(self.bm25_doc_ids):
                position_by_id.setdefault(doc_id, position)

            candidate_docs = []
            candidate_ids = []
            for point in self._scroll_all(self.qdrant_bm25, QDRANT_COLLECTION_BM25, filters):
                original_id = (point.payload or {}).get('original_id')
                if not original_id:
                    continue
                position = position_by_id.get(original_id)
                if position is None:
                    continue
                candidate_docs.append(self.bm25_documents[position])
                candidate_ids.append(original_id)

            if not candidate_docs:
                return []
            # Scores are computed against the filtered subset only.
            index = BM25Okapi(candidate_docs)

        scores = index.get_scores(tokenized_query)
        best_first = np.argsort(scores)[::-1][:limit]
        return [(candidate_ids[i], float(scores[i])) for i in best_first if scores[i] > 0]

    def search(self, query: str, k: int = 10, m: int = 50,
               user_id: Optional[str] = None,
               start_timestamp: Optional[float] = None,
               end_timestamp: Optional[float] = None) -> List[Tuple[str, float]]:
        """Hybrid search: dense and BM25 candidates merged with RRF.

        Args:
            query: search query text.
            k: number of fused results to return.
            m: number of candidates taken from each retriever.
            user_id, start_timestamp, end_timestamp: optional filters.
        Returns:
            Up to ``k`` (original_id, rrf_score) pairs, best first.
        """
        filters = self._build_filters(user_id, start_timestamp, end_timestamp)

        # Dense retrieval
        query_embedding = self.embedding_model.encode(query).tolist()
        embedding_results = [
            # Map the Qdrant UUID back to the original document ID.
            ((hit.payload or {}).get('original_id', hit.id), hit.score)
            for hit in self._dense_search(query_embedding, filters, m)
        ]

        # Lexical retrieval
        bm25_results = self._bm25_search(query, filters, m)

        # Merge both rankings with RRF
        return self._rrf_fusion(embedding_results, bm25_results, k=k)

    def _rrf_fusion(self, embedding_results: List, bm25_results: List, k: int = 10, k_rrf: int = 60) -> List[Tuple[str, float]]:
        """Reciprocal Rank Fusion of two ranked result lists.

        Each list contributes ``1 / (k_rrf + rank + 1)`` per document, with
        ``rank`` being the zero-based position in that list.

        Args:
            embedding_results: (doc_id, score) pairs, best first.
            bm25_results: (doc_id, score) pairs, best first.
            k: number of fused results to return.
            k_rrf: RRF smoothing constant.
        Returns:
            Up to ``k`` (doc_id, rrf_score) pairs sorted by descending score.
        """
        rrf_scores = {}
        for results in (embedding_results, bm25_results):
            seen = set()
            for rank, (doc_id, _score) in enumerate(results or ()):
                # A document counts once per list, at its best position.
                if doc_id in seen:
                    continue
                seen.add(doc_id)
                rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1.0 / (k_rrf + rank + 1)

        return sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)[:k]

    def delete_documents(self, user_id: Optional[str] = None,
                        start_timestamp: Optional[float] = None,
                        end_timestamp: Optional[float] = None,
                        doc_ids: Optional[List[str]] = None):
        """Delete documents from both stores, then rebuild BM25.

        Explicit ``doc_ids`` take precedence over the filter criteria.

        Raises:
            ValueError: if neither ``doc_ids`` nor any filter criterion is
                given (there is nothing to select).
        """
        if doc_ids:
            # Points are keyed by the UUID derived from the original ID.
            selector = [self._generate_uuid(doc_id) for doc_id in doc_ids]
        else:
            selector = self._build_filters(user_id, start_timestamp, end_timestamp)

        if selector is None:
            raise ValueError(
                "delete_documents requires doc_ids or at least one filter "
                "(user_id, start_timestamp, end_timestamp)"
            )

        # Delete from both databases
        self.qdrant_embeddings.delete(
            collection_name=QDRANT_COLLECTION_EMBEDDINGS,
            points_selector=selector
        )
        self.qdrant_bm25.delete(
            collection_name=QDRANT_COLLECTION_BM25,
            points_selector=selector
        )

        # Rebuild BM25 after deletion
        self.recalculate_bm25()

    def get_document_texts(self, doc_ids: List[str]) -> List[Tuple[str, str]]:
        """Fetch document texts by their original IDs.

        Returns:
            (original_id, text) pairs in the order of ``doc_ids`` (so a ranked
            ID list stays ranked); IDs that are not stored are omitted.
        """
        if not doc_ids:
            return []

        # Points are keyed by the UUID derived from the original ID.
        uuid_ids = [self._generate_uuid(doc_id) for doc_id in doc_ids]
        points = self.qdrant_bm25.retrieve(
            collection_name=QDRANT_COLLECTION_BM25,
            ids=uuid_ids
        )

        text_by_id = {}
        for point in points:
            if point and point.payload:
                original_id = point.payload.get('original_id', point.id)
                text_by_id[original_id] = point.payload['text']

        results = []
        for doc_id in doc_ids:
            if doc_id in text_by_id:
                results.append((doc_id, text_by_id.pop(doc_id)))
        # Keep anything retrieved under an unexpected ID rather than drop it.
        results.extend(text_by_id.items())
        return results

def run_mistral(messages, user_format=True, model="mistral-medium-latest", api_key=None):
    """Send a chat request to the Mistral API and return the reply text.

    Args:
        messages: the prompt text when ``user_format`` is true, otherwise an
            already prepared list of chat messages.
        user_format: wrap ``messages`` into a single ``user`` chat message.
        model: name of the Mistral model to query.
        api_key: API key handed to the Mistral client.
    Returns:
        The content of the first choice of the chat completion.
    """
    # A bare prompt becomes a one-message conversation from the user.
    conversation = [{"role": "user", "content": messages}] if user_format else messages
    completion = Mistral(api_key=api_key).chat.complete(
        model=model,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def user_message(inquiry, messages=None):
    """Build the LLM prompt for answering a query from retrieved messages.

    Args:
        inquiry: the user's question, inserted at the end of the prompt.
        messages: optional sequence of (message_id, message_text) pairs; each
            is rendered on its own line as ``<{message_id, message_text}>``.
            When empty or None, a "no messages" placeholder is used instead.
    Returns:
        The complete prompt text (in Russian).
    """
    if messages:
        formatted_messages = "\n".join(f"<{{{msg[0]}, {msg[1]}}}>" for msg in messages)
    else:
        formatted_messages = "Нет сообщений."

    # The prompt text is user-facing runtime data and must stay valid UTF-8
    # Russian; the local is not named after the function to avoid shadowing.
    prompt = f"""
РОЛЬ: Ты помощник по поиску информации в чатах Telegram.
ЗАДАЧА: На основе запроса пользователя и предоставленных сообщений составь ответ.
ФОРМАТ ДАННЫХ:
История сообщений в формате:
<{{message_id, message_text}}>
Каждое сообщение на отдельной строке.

ПРАВИЛА:
- Используй только предоставленные сообщения для ответа.
- Если нет подходящей информации, ответь "Извините, я не смог найти информацию по вашему запросу."
- Будь краток и точен.
- Не обязательно использовать все предоставленные сообщения, выбери только релевантные.

ФОРМАТ ОТВЕТА:
Ответ: [текст ответа]
Использованные сообщения: <{{message_id1, message_id2, ...}}>

История сообщений:
{formatted_messages}

Запрос пользователя: {inquiry}
"""
    return prompt


ТЕСТЫ.

Капсом выделил, на что обратить внимание.

In [43]:
# Usage example
def main():
    """Run an end-to-end demo: index sample messages, search, ask the LLM."""
    # The key is taken from the MISTRAL_API_KEY environment variable when it
    # is set; otherwise the original placeholder is used.
    api_key = os.environ.get("MISTRAL_API_KEY", "your_mistral_api_key")

    # System initialisation
    # TODO: check how the database is initialised (so that it is not
    # recreated on every run).
    rag_system = TelegramRAGSystem(data_dir="./rag_data", mistral_api_key=api_key)

    # Add test documents
    test_documents = [
        {
            'id': 'msg_001',
            'text': 'Встречаемся завтра в 15:00 у главного входа',
            'user_id': 'user1',
            'timestamp': 1672531200
        },
        {
            'id': 'msg_002',
            'text': 'Не забудьте взять документы на собрание',
            'user_id': 'user2',
            'timestamp': 1672617600
        },
        {
            'id': 'msg_003',
            'text': 'Отмена, Встреча перенесена с 15:00 на следующий понедельник. тогда и заключим сделку',
            'user_id': 'user1',
            # NOTE(review): one digit shorter than the other timestamps -
            # possibly a typo for a 10-digit value; left unchanged.
            'timestamp': 167200000
        },
        {
            'id': 'msg_004',
            'text': 'Какие документы нужны для встречи?',
            'user_id': 'user3',
            'timestamp': 1672790400
        },
        {
            'id': 'msg_005',
            'text': 'Встречаемся вечером у клуба',
            'user_id': 'user2',
            'timestamp': 1673000000
        },
        {
            'id': 'msg_006',
            'text': 'Идем гулять сегодня в парк',
            'user_id': 'user1',
            'timestamp': 1672963200
        },
    ]
    rag_system.add_documents(test_documents)

    # TODO: check the timestamp format; support filtering by time and by
    # user (search() already accepts these filters).
    results = rag_system.search("когда встреча?", k=5, end_timestamp=1673000000-1)
    print("Результаты поиска:", results)

    # TODO: maybe keep a separate database just for the texts.
    doc_ids = [doc_id for doc_id, score in results]
    messages = rag_system.get_document_texts(doc_ids)
    print("Найденные сообщения:", messages)

    prompt = user_message("во сколько идем тусить?", messages)
    # TODO: support different models.
    response = run_mistral(prompt, api_key=api_key, model="mistral-tiny")
    print("Ответ LLM:", response)

if __name__ == "__main__":
    main()

  embedding_results = self.qdrant_embeddings.search(


Результаты поиска: [('msg_004', 0.03278688524590164), ('msg_003', 0.0315136476426799), ('msg_006', 0.016129032258064516), ('msg_001', 0.015873015873015872), ('msg_002', 0.015625)]
Найденные сообщения: [('msg_004', 'Какие документы нужны для встречи?'), ('msg_003', 'Отмена, Встреча перенесена с 15:00 на следующий понедельник. тогда и заключим сделку'), ('msg_006', 'Идем гулять сегодня в парк'), ('msg_001', 'Встречаемся завтра в 15:00 у главного входа'), ('msg_002', 'Не забудьте взять документы на собрание')]
Ответ LLM: Ответ: Сегодня мы идем гулять в 15:00.
Использованные сообщения: <{msg_004, Какие документы нужны для встречи?}>
<{msg_006, Идем гулять сегодня в парк}>


Можно проверить подключение к Mistral: выводятся все доступные модели.


In [None]:
# Access the secret.
# API_KEY is only a local variable of main(), so reading it here as a bare
# name fails unless an earlier notebook cell defined it globally. Use such a
# global when present, otherwise fall back to the MISTRAL_API_KEY
# environment variable.
api_key = globals().get("API_KEY") or os.environ.get("MISTRAL_API_KEY")

if not api_key:
    raise ValueError("❌ MISTRAL_API_KEY not found in Colab secrets!")
else:
    print("✅ API key loaded successfully from Colab secrets!")

# Initialize Mistral client
client = Mistral(api_key=api_key)

# Test connection
def test_connection():
    """Check the Mistral connection by listing the available model IDs.

    Any failure is reported on stdout instead of being raised, since this is
    an interactive diagnostic.
    """
    try:
        models = client.models.list()
        print("✅ Connected successfully!")
        print(f"Available models: {[m.id for m in models.data]}")
    except Exception as e:
        print(f"❌ Connection failed: {e}")
        print("💡 If key is not active yet, wait a few minutes and try again")

test_connection()