In [5]:
import json
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from config import Config

In [6]:
class DataIngestor:
    """Load restaurant data from disk, embed it, and store it in a Qdrant collection.

    The collection name, data directory, and embedding model name are read
    from the project's ``Config`` object.
    """

    def __init__(self, qdrant_client):
        """Keep a handle on the Qdrant client and load the embedding model.

        Args:
            qdrant_client: Client object used for all collection operations.
        """
        self.client = qdrant_client
        print(f"‚è≥ ƒêang t·∫£i model {Config.EMBEDDING_MODEL}...")
        self.encoder = SentenceTransformer(Config.EMBEDDING_MODEL)
        print("‚úÖ Model ƒë√£ s·∫µn s√†ng!\n")

    def load_menu(self, path):
        """Read the menu JSON file and turn every dish into one text document.

        Args:
            path: Path to a JSON file holding a list of categories; each
                category has a ``category`` name and an ``items`` list whose
                entries carry ``id``, ``name_vn``, ``name_en``, ``price`` and
                ``note``.

        Returns:
            A list of ``{"text", "id", "source"}`` dicts (``source`` is always
            ``"menu"``), or an empty list when the file does not exist.

        Raises:
            KeyError: If a category or item lacks one of the expected keys.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            print("Warning: Menu file not found.")
            return []

        print(f"üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc menu.json: T√¨m th·∫•y {len(data)} danh m·ª•c.")

        docs = []
        for category in data:
            cat_name = category['category']
            for item in category['items']:
                # Build a natural-language sentence so the embedding captures
                # the dish name, category, price, and note together.
                content = (
                    f"M√≥n ƒÉn {item['name_vn']} ({item['name_en']}). "
                    f"Thu·ªôc lo·∫°i {cat_name}. "
                    f"Gi√° {item['price']} VND. "
                    f"Ghi ch√∫ {item['note']}."
                )
                print(f"   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: \"{content}\"")
                docs.append({"text": content, "id": item['id'], "source": "menu"})
        return docs

    def load_info(self, path):
        """Read the restaurant info text file and emit one document per non-blank line.

        Args:
            path: Path to a UTF-8 text file.

        Returns:
            A list of ``{"text", "source", "id"}`` dicts (``source`` is always
            ``"info"``; ``id`` is ``info_<line index>``, counting blank lines),
            or an empty list when the file does not exist.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = f.read()
        except FileNotFoundError:
            print("Warning: Info file not found.")
            return []

        print(f"üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc info.txt.")
        docs = []
        # One chunk per line; the index includes blank lines so ids stay tied
        # to the line position in the file.
        for i, chunk in enumerate(data.split('\n')):
            text = chunk.strip()
            if not text:
                continue
            print(f"   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] Raw Text -> Chunk: \"{text}\"")
            docs.append({"text": text, "source": "info", "id": f"info_{i}"})
        return docs

    def ingest(self):
        """Embed all menu and info documents and (re)create the Qdrant collection with them.

        Reads ``menu.json`` and ``restaurant_info.txt`` from ``Config.DATA_DIR``.
        When neither yields any document, nothing is written. Otherwise the
        collection named ``Config.COLLECTION_NAME`` is dropped if present,
        recreated with cosine distance, and filled with one point per document
        (point id = position in the combined list, payload = the document dict).
        """
        print("--- B·∫Øt ƒë·∫ßu n·∫°p d·ªØ li·ªáu v√†o Vector DB ---")
        menu_data = self.load_menu(os.path.join(Config.DATA_DIR, "menu.json"))
        info_data = self.load_info(os.path.join(Config.DATA_DIR, 'restaurant_info.txt'))

        all_docs = menu_data + info_data

        if not all_docs:
            print("Kh√¥ng c√≥ d·ªØ li·ªáu ƒë·ªÉ n·∫°p.")
            return

        print(f"\n‚ö° [Giai ƒëo·∫°n 3 - Vector h√≥a] ƒêang bi·∫øn ƒë·ªïi {len(all_docs)} ƒëo·∫°n vƒÉn th√†nh s·ªë...")

        # Encode BEFORE touching the collection: a failure here must not leave
        # the previous collection deleted, and the vector size is taken from
        # the real embeddings instead of a hard-coded constant.
        embeddings = self.encoder.encode([d['text'] for d in all_docs])
        vector_size = int(embeddings.shape[1])
        # Show only a 3-value preview; printing whole vectors floods the output.
        print(f"   üëâ Vector m·∫´u (3 s·ªë ƒë·∫ßu/{vector_size}): {embeddings[0][:3]} ... {embeddings.shape}")

        # Recreate the collection from scratch so stale points never survive.
        if self.client.collection_exists(collection_name=Config.COLLECTION_NAME):
            self.client.delete_collection(collection_name=Config.COLLECTION_NAME)

        self.client.create_collection(
            collection_name=Config.COLLECTION_NAME,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )

        # Combine id + vector + payload into one point per document.
        points = [
            PointStruct(id=i, vector=embeddings[i].tolist(), payload=doc)
            for i, doc in enumerate(all_docs)
        ]

        print(f"\nüì¶ [Giai ƒëo·∫°n 4 - ƒê√≥ng g√≥i] Chu·∫©n b·ªã ƒë·∫©y {len(points)} Points v√†o Qdrant.")
        self.client.upsert(
            collection_name=Config.COLLECTION_NAME,
            points=points
        )
        print(f"--- ƒê√£ n·∫°p {len(points)} documents v√†o Qdrant ---")

In [7]:
if __name__ == "__main__":
    # In-memory Qdrant instance, so the demo runs without a Docker server.
    Client = QdrantClient(":memory:")

    ingestor = DataIngestor(Client)
    ingestor.ingest()

    # Run one sample query to show that the data really landed in the collection.
    print("\n--- üîé TEST T√åM KI·∫æM ---")
    query = "T√¥i mu·ªën ƒÉn m√≥n g√¨ ƒë√≥ cay cay"
    print(f"C√¢u h·ªèi: {query}")

    # Reuse the encoder the ingestor already loaded instead of loading the
    # same model from disk a second time.
    model = ingestor.encoder
    query_vector = model.encode(query).tolist()

    # `search` is deprecated in recent qdrant-client releases in favour of
    # `query_points`; prefer the new API and fall back for older clients.
    if hasattr(Client, "query_points"):
        hits = Client.query_points(
            collection_name=Config.COLLECTION_NAME,
            query=query_vector,
            limit=2
        ).points
    else:
        hits = Client.search(
            collection_name=Config.COLLECTION_NAME,
            query_vector=query_vector,
            limit=2
        )

    for hit in hits:
        print(f"K·∫øt qu·∫£ t√¨m th·∫•y: {hit.payload['text']}")
    

‚è≥ ƒêang t·∫£i model AITeamVN/Vietnamese_Embedding...
‚úÖ Model ƒë√£ s·∫µn s√†ng!

--- B·∫Øt ƒë·∫ßu n·∫°p d·ªØ li·ªáu v√†o Vector DB ---
üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc menu.json: T√¨m th·∫•y 6 danh m·ª•c.
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: "M√≥n ƒÉn G·ªèi g√† V√¢n Nam (Yunnan Shredded Chicken Salad). Thu·ªôc lo·∫°i Khai v·ªã (Appetizer). Gi√° 138000 VND. Ghi ch√∫ Cay/Spicy."
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: "M√≥n ƒÉn ƒê·∫≠u h≈© ho√†ng kim (Deep Fried Tofu with Salted Egg Yolk). Thu·ªôc lo·∫°i Khai v·ªã (Appetizer). Gi√° 88000 VND. Ghi ch√∫ Chef's Signature."
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: "M√≥n ƒÉn C√†ng cua b√°ch hoa (Bai Hua Crab Claw). Thu·ªôc lo·∫°i Khai v·ªã (Appetizer). Gi√° 205000 VND. Ghi ch√∫ Chef's Signature."
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: "M√≥n ƒÉn Nem cu·ªën t·ª© v·ªã (Spring Rolls With Four Flavors). Thu·ªôc lo·∫°i Khai v·ªã (Appetizer). Gi√° 130000 VND. Ghi ch√∫ ."