In [1]:
import json
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from config import Config

  from tqdm.autonotebook import tqdm, trange


In [2]:
class DataIngestor:
    def __init__(self, qdrant_client):
        self.client = qdrant_client
        print(f"‚è≥ ƒêang t·∫£i model {Config.EMBEDDING_MODEL}...")
        self.encoder = SentenceTransformer(Config.EMBEDDING_MODEL)
        print("‚úÖ Model ƒë√£ s·∫µn s√†ng!\n")
    
    def load_menu(self, path):
        """Load d·ªØ li·ªáu info v√† menu m√≥n ƒÉn c·ªßa nh√† h√†ng"""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            print(f"üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc menu.json: T√¨m th·∫•y {len(data)} danh m·ª•c.")
            docs = []
            
            # 1. X·ª≠ l√Ω th√¥ng tin chung nh√† h√†ng (Restaurant Info)
            info = data['restaurant']
            info_text = (
                f"Th√¥ng tin nh√† h√†ng {info['name']} ({info['name_en']}). "
                f"ƒê·ªãa ch·ªâ: {info['contact']['address']}. "
                f"Gi·ªù m·ªü c·ª≠a: {info['business_hours']['display']}. "
                f"SƒêT: {info['contact']['phone']}. "
                f"M√¥ t·∫£: {info['description']}."
            )
            docs.append({"text": info_text, "source": "info", "id": "rest_info"})
            
            # 2. X·ª≠ l√Ω Menu (Quan tr·ªçng: Gh√©p description v√† tags v√†o text)
            for category in data['menu']['categories']:
                cat_name = category['name_vn']
                for item in category['items']:
                    # T·∫°o chu·ªói vƒÉn b·∫£n gi√†u ng·ªØ nghƒ©a cho Vector Embedding
                    # K·ªπ thu·∫≠t: Gh√©p h·∫øt c√°c tr∆∞·ªùng quan tr·ªçng v√†o 1 ƒëo·∫°n vƒÉn
                    content = (
                        f"M√≥n: {item['name_vn']} ({item['name_en']}). "
                        f"Lo·∫°i: {cat_name}. "
                        f"Gi√°: {item['price']} VND. "
                        f"M√¥ t·∫£ h∆∞∆°ng v·ªã: {item.get('description', '')}. " # L·∫•y m√¥ t·∫£ t·ª´ V2
                        f"ƒê·∫∑c ƒëi·ªÉm: {', '.join(item.get('tags', []))}. "   # L·∫•y tags (spicy, seafood...)
                        f"Th·ªùi gian chu·∫©n b·ªã: {item.get('preparation_time', 0)} ph√∫t."
                    )
                    
                    # L∆∞u metadata ƒë·ªÉ sau n√†y filter n·∫øu c·∫ßn
                    metadata = {
                        "text": content,
                        "source": "menu",
                        "id": item['id'],
                        "price": item['price'],
                        "category": cat_name,
                        "tags": item.get('tags', [])
                    }
                    docs.append(metadata)

            # 3. X·ª≠ l√Ω FAQ (Common Questions)
            for idx, qa in enumerate(data.get('common_questions', [])):
                qa_text = f"H·ªèi: {qa['question']} - Tr·∫£ l·ªùi: {qa['answer']}"
                docs.append({"text": qa_text, "source": "faq", "id": f"faq_{idx}"})
                
            return docs

        except FileNotFoundError:
            print("Error: Menu file not found.")
            return []

    def ingest(self):
        """N·∫°p d·ªØ li·ªáu v√†o Vector DB"""
        print("--- B·∫Øt ƒë·∫ßu n·∫°p d·ªØ li·ªáu v√†o Vector DB ---")
        menu_data = self.load_menu(os.path.join(Config.DATA_DIR, "menu_v2.json"))
        print(os.path.join(Config.DATA_DIR, "menu_v2.json"))
        print(Config.BASE_DIR)
        
        if not menu_data: 
            print("Kh√¥ng c√≥ d·ªØ li·ªáu ƒë·ªÉ n·∫°p.")
            return
        
        # T·∫°o collection trong Qdrant
        if self.client.collection_exists(collection_name=Config.COLLECTION_NAME):
            self.client.delete_collection(collection_name=Config.COLLECTION_NAME)
            
        self.client.create_collection(
            collection_name=Config.COLLECTION_NAME,
            vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
        )
        print(f"\n‚ö° [Giai ƒëo·∫°n 3 - Vector h√≥a] ƒêang bi·∫øn ƒë·ªïi {len(menu_data)} ƒëo·∫°n vƒÉn th√†nh s·ªë...")
        
        # Vector h√≥a v√† upload
        points = []
        embeddings = self.encoder.encode([d['text'] for d in menu_data])
        print(f"   üëâ Vector m·∫´u (3 s·ªë ƒë·∫ßu/384): {embeddings[0]} ... {embeddings.shape}")
        # Loop t·∫°o PointStruct (k·∫øt h·ª£p ID + Vector + Payload)
        for i, doc in enumerate(menu_data):
            points.append(PointStruct(
                id=i,
                vector=embeddings[i].tolist(),
                payload=doc
            ))
        
        print(f"\nüì¶ [Giai ƒëo·∫°n 4 - ƒê√≥ng g√≥i] Chu·∫©n b·ªã ƒë·∫©y {len(points)} Points v√†o Qdrant {points}.")
        # Client.upsert -> ƒê·∫©y l√™n Qdrant
        self.client.upsert(
            collection_name=Config.COLLECTION_NAME,
            points=points
        )
        print(f"--- ƒê√£ n·∫°p {len(points)} documents v√†o Qdrant ---")

In [3]:
if __name__ == "__main__":
    # D√πng Qdrant ch·∫°y tr√™n RAM (kh√¥ng c·∫ßn c√†i Docker ƒë·ªÉ test)
    Client = QdrantClient(":memory:")
    
    ingestor = DataIngestor(Client)
    ingestor.ingest()
    
    # Test th·ª≠ t√¨m ki·∫øm ƒë·ªÉ ch·ª©ng minh d·ªØ li·ªáu ƒë√£ v√†o
    print("\n--- üîé TEST T√åM KI·∫æM ---")
    query = "T√¥i mu·ªën ƒÉn m√≥n g√¨ ƒë√≥ cay cay"
    print(f"C√¢u h·ªèi: {query}")
    
    model = SentenceTransformer(Config.EMBEDDING_MODEL)
    hits = Client.search(
        collection_name=Config.COLLECTION_NAME,
        query_vector=model.encode(query).tolist(),
        limit=2
    )
    
    for hit in hits:
        print(f"K·∫øt qu·∫£ t√¨m th·∫•y: {hit.payload['text']}")
    

‚è≥ ƒêang t·∫£i model AITeamVN/Vietnamese_Embedding...
‚úÖ Model ƒë√£ s·∫µn s√†ng!

--- B·∫Øt ƒë·∫ßu n·∫°p d·ªØ li·ªáu v√†o Vector DB ---
üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc menu.json: T√¨m th·∫•y 4 danh m·ª•c.
d:\2_Workspace\4_Nam_4\1_Hoc_ky_251\4_NLP\4_Project\Hoa-Vien-Restaurant-Chat-App\python\data\menu_v2.json
d:\2_Workspace\4_Nam_4\1_Hoc_ky_251\4_NLP\4_Project\Hoa-Vien-Restaurant-Chat-App\python\src

‚ö° [Giai ƒëo·∫°n 3 - Vector h√≥a] ƒêang bi·∫øn ƒë·ªïi 27 ƒëo·∫°n vƒÉn th√†nh s·ªë...
   üëâ Vector m·∫´u (3 s·ªë ƒë·∫ßu/384): [ 0.03455431 -0.00976736 -0.02488177 ...  0.0340477  -0.02233206
 -0.00674082] ... (27, 1024)

üì¶ [Giai ƒëo·∫°n 4 - ƒê√≥ng g√≥i] Chu·∫©n b·ªã ƒë·∫©y 27 Points v√†o Qdrant [PointStruct(id=0, vector=[0.03455431014299393, -0.009767363779246807, -0.024881767109036446, 0.012239749543368816, -0.06192198395729065, 0.060047343373298645, -0.005188875366002321, 0.010136207565665245, -0.0027340776287019253, -0.022432154044508934, 0.00784455705434084, -0.0