In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 1. Gi·∫£ l·∫≠p m·ªôt vƒÉn b·∫£n d√†i (V√≠ d·ª•: T√†i li·ªáu h∆∞·ªõng d·∫´n)
long_text = """
Docker l√† m·ªôt n·ªÅn t·∫£ng m·ªü ƒë·ªÉ ph√°t tri·ªÉn, v·∫≠n chuy·ªÉn v√† ch·∫°y c√°c ·ª©ng d·ª•ng. 
Docker cho ph√©p b·∫°n t√°ch ·ª©ng d·ª•ng kh·ªèi c∆° s·ªü h·∫° t·∫ßng ƒë·ªÉ b·∫°n c√≥ th·ªÉ ph√¢n ph·ªëi ph·∫ßn m·ªÅm nhanh ch√≥ng. 
V·ªõi Docker, b·∫°n c√≥ th·ªÉ qu·∫£n l√Ω c∆° s·ªü h·∫° t·∫ßng c·ªßa m√¨nh gi·ªëng nh∆∞ c√°ch b·∫°n qu·∫£n l√Ω c√°c ·ª©ng d·ª•ng.
Qdrant l√† m·ªôt c∆° s·ªü d·ªØ li·ªáu vector. N√≥ ƒë∆∞·ª£c t·ªëi ∆∞u h√≥a ƒë·ªÉ l∆∞u tr·ªØ v√† truy v·∫•n c√°c vector embedding.
"""

# 2. Kh·ªüi t·∫°o b·ªô chia (Splitter)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,      # K√≠ch th∆∞·ªõc m·ªói ƒëo·∫°n (k√Ω t·ª±)
    chunk_overlap=20,    # ƒê·ªô ch·ªìng l·∫•n (ƒë·ªÉ gi·ªØ ng·ªØ c·∫£nh gi·ªØa c√°c ƒëo·∫°n c·∫Øt)
    length_function=len,
)

# 3. Th·ª±c hi·ªán c·∫Øt
docs = text_splitter.create_documents([long_text])

print(f"ƒê√£ c·∫Øt th√†nh {len(docs)} ƒëo·∫°n.")
print(f"V√≠ d·ª• ƒëo·∫°n 1: {docs[1].page_content}")

ƒê√£ c·∫Øt th√†nh 4 ƒëo·∫°n.
V√≠ d·ª• ƒëo·∫°n 1: Docker cho ph√©p b·∫°n t√°ch ·ª©ng d·ª•ng kh·ªèi c∆° s·ªü h·∫° t·∫ßng ƒë·ªÉ b·∫°n c√≥ th·ªÉ ph√¢n ph·ªëi ph·∫ßn m·ªÅm nhanh ch√≥ng.


In [6]:
from sentence_transformers import SentenceTransformer

# 1. T·∫£i model (all-MiniLM-L6-v2 l√† model nh·ªè, nhanh, ph·ªï bi·∫øn cho demo)
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. L·∫•y n·ªôi dung text t·ª´ c√°c ƒëo·∫°n ƒë√£ c·∫Øt ·ªü b∆∞·ªõc 1
texts = [d.page_content for d in docs]

# 3. Bi·∫øn ƒë·ªïi th√†nh vector
embeddings = model.encode(texts)

# Ki·ªÉm tra k√≠ch th∆∞·ªõc vector (Th∆∞·ªùng l√† 384 chi·ªÅu v·ªõi model n√†y)
vector_size = len(embeddings[1]) 
print(f"K√≠ch th∆∞·ªõc vector: {vector_size}")

K√≠ch th∆∞·ªõc vector: 384


In [7]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
import uuid

# 1. Kh·ªüi t·∫°o Qdrant (D√πng ":memory:" ƒë·ªÉ ch·∫°y tr√™n RAM, kh√¥ng c·∫ßn c√†i server)
client = QdrantClient(":memory:")

# 2. T·∫°o Collection (gi·ªëng nh∆∞ t·∫°o B·∫£ng trong SQL)
collection_name = "my_knowledge_base"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

# 3. ƒê∆∞a d·ªØ li·ªáu v√†o Qdrant
points = []
for idx, (text, vector) in enumerate(zip(texts, embeddings)):
    points.append(
        PointStruct(
            id=idx,  # ID c√≥ th·ªÉ l√† s·ªë ho·∫∑c UUID
            vector=vector.tolist(),
            payload={"text": text}  # L∆∞u l·∫°i text g·ªëc ƒë·ªÉ t√≠ n·ªØa l·∫•y ra ƒë·ªçc
        )
    )

client.upsert(
    collection_name=collection_name,
    points=points
)
print("ƒê√£ l∆∞u d·ªØ li·ªáu v√†o Qdrant th√†nh c√¥ng!")

ƒê√£ l∆∞u d·ªØ li·ªáu v√†o Qdrant th√†nh c√¥ng!


  client.recreate_collection(


In [8]:
# 1. C√¢u h·ªèi c·ªßa ng∆∞·ªùi d√πng
query = "Docker d√πng ƒë·ªÉ l√†m g√¨?"

# 2. M√£ h√≥a c√¢u h·ªèi th√†nh vector (d√πng chung model v·ªõi l√∫c n·∫°p d·ªØ li·ªáu)
query_vector = model.encode(query).tolist()

# 3. T√¨m ki·∫øm trong Qdrant
search_result = client.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=2  # L·∫•y 2 k·∫øt qu·∫£ t·ªët nh·∫•t
)

# 4. In k·∫øt qu·∫£
print(f"\nC√¢u h·ªèi: {query}")
print("-" * 30)
for result in search_result:
    print(f"ƒê·ªô t∆∞∆°ng ƒë·ªìng: {result.score:.4f}")
    print(f"N·ªôi dung t√¨m th·∫•y: {result.payload['text']}")
    print("-" * 30)


C√¢u h·ªèi: Docker d√πng ƒë·ªÉ l√†m g√¨?
------------------------------
ƒê·ªô t∆∞∆°ng ƒë·ªìng: 0.6241
N·ªôi dung t√¨m th·∫•y: Docker l√† m·ªôt n·ªÅn t·∫£ng m·ªü ƒë·ªÉ ph√°t tri·ªÉn, v·∫≠n chuy·ªÉn v√† ch·∫°y c√°c ·ª©ng d·ª•ng.
------------------------------
ƒê·ªô t∆∞∆°ng ƒë·ªìng: 0.5813
N·ªôi dung t√¨m th·∫•y: Docker cho ph√©p b·∫°n t√°ch ·ª©ng d·ª•ng kh·ªèi c∆° s·ªü h·∫° t·∫ßng ƒë·ªÉ b·∫°n c√≥ th·ªÉ ph√¢n ph·ªëi ph·∫ßn m·ªÅm nhanh ch√≥ng.
------------------------------


In [12]:
print(f"M√≥n ƒÉn: 'name_vn' ('name_en'). "
f"Thu·ªôc lo·∫°i: cat_name. "
f"Gi√°: 'price' VND. "
f"Ghi ch√∫: 'note'.")

M√≥n ƒÉn: 'name_vn' ('name_en'). Thu·ªôc lo·∫°i: cat_name. Gi√°: 'price' VND. Ghi ch√∫: 'note'.


In [14]:
import json
import os
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

# --- PH·∫¶N 1: GI·∫¢ L·∫¨P M√îI TR∆Ø·ªúNG & D·ªÆ LI·ªÜU ---
# 1. T·∫°o Config gi·∫£
class Config:
    EMBEDDING_MODEL = 'all-MiniLM-L6-v2' # Model nh·ªè ch·∫°y cho nhanh
    COLLECTION_NAME = 'demo_collection'
    DATA_DIR = '.' # L∆∞u file t·∫°m ngay th∆∞ m·ª•c hi·ªán t·∫°i

# 2. T·∫°o file menu.json gi·∫£
sample_menu = [
    {
        "category": "M√≥n Ch√≠nh",
        "items": [
            {"id": "mc_01", "name_vn": "Ph·ªü B√≤", "name_en": "Beef Noodle Soup", "price": 50000, "note": "ƒê·∫∑c bi·ªát th∆°m ngon"},
            {"id": "mc_02", "name_vn": "B√∫n Ch·∫£", "name_en": "Grilled Pork Noodles", "price": 60000, "note": "N∆∞·ªõc ch·∫•m gia truy·ªÅn"}
        ]
    }
]
with open('menu.json', 'w', encoding='utf-8') as f:
    json.dump(sample_menu, f, ensure_ascii=False, indent=2)

# 3. T·∫°o file restaurant_info.txt gi·∫£
sample_info = """
Nh√† h√†ng m·ªü c·ª≠a t·ª´ 8:00 s√°ng ƒë·∫øn 10:00 t·ªëi.
Ch√∫ng t√¥i mi·ªÖn ph√≠ g·ª≠i xe cho kh√°ch h√†ng.
"""
with open('restaurant_info.txt', 'w', encoding='utf-8') as f:
    f.write(sample_info)

print("‚úÖ ƒê√£ t·∫°o xong d·ªØ li·ªáu m·∫´u (menu.json, restaurant_info.txt)")
print("-" * 50)

# --- PH·∫¶N 2: LOGIC C·ª¶A B·∫†N (C√ì TH√äM PRINT ƒê·ªÇ SOI D·ªÆ LI·ªÜU) ---

class DataIngestor:
    def __init__(self, qdrant_client):
        self.client = qdrant_client
        print(f"‚è≥ ƒêang t·∫£i model {Config.EMBEDDING_MODEL}...")
        self.encoder = SentenceTransformer(Config.EMBEDDING_MODEL)
        print("‚úÖ Model ƒë√£ s·∫µn s√†ng!\n")
        
    def load_menu(self, path):
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            print(f"üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc menu.json: T√¨m th·∫•y {len(data)} danh m·ª•c.")
            
            docs = []
            for category in data:
                cat_name = category['category']
                for item in category['items']:
                    # T·∫°o n·ªôi dung ng·ªØ nghƒ©a
                    content = (
                        f"M√≥n ƒÉn: {item['name_vn']} ({item['name_en']}). "
                        f"Thu·ªôc lo·∫°i: {cat_name}. "
                        f"Gi√°: {item['price']} VND. "
                        f"Ghi ch√∫: {item['note']}."
                    )
                    
                    # IN RA ƒê·ªÇ KI·ªÇM TRA
                    print(f"   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: \"{content}\"")
                    
                    docs.append({"text": content, "source": "menu", "id": item['id']})
            return docs
        except FileNotFoundError:
            print("Warning: Menu file not found.")
            return []

    def load_info(self, path):
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Chia nh·ªè
            chunks = content.split('\n')
            docs = []
            print(f"üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc info.txt.")
            
            for i, chunk in enumerate(chunks):
                if chunk.strip():
                    # IN RA ƒê·ªÇ KI·ªÇM TRA
                    print(f"   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] Raw Text -> Chunk: \"{chunk.strip()}\"")
                    docs.append({"text": chunk.strip(), "source": "info", "id": f"info_{i}"})
            return docs
        except FileNotFoundError:
            print("Warning: Info file not found.")
            return []

    def ingest(self):
        print("--- B·∫Øt ƒë·∫ßu n·∫°p d·ªØ li·ªáu ---")
        menu_docs = self.load_menu(os.path.join(Config.DATA_DIR, 'menu.json'))
        info_docs = self.load_info(os.path.join(Config.DATA_DIR, 'restaurant_info.txt'))
        
        all_docs = menu_docs + info_docs
        
        if not all_docs:
            print("Kh√¥ng c√≥ d·ªØ li·ªáu.")
            return

        # T·∫°o collection (RAM)
        self.client.recreate_collection(
            collection_name=Config.COLLECTION_NAME,
            vectors_config=VectorParams(size=384, distance=Distance.COSINE),
        )

        print(f"\n‚ö° [Giai ƒëo·∫°n 3 - Vector h√≥a] ƒêang bi·∫øn ƒë·ªïi {len(all_docs)} ƒëo·∫°n vƒÉn th√†nh s·ªë...")
        embeddings = self.encoder.encode([d['text'] for d in all_docs])
        
        # In th·ª≠ vector ƒë·∫ßu ti√™n xem m·∫∑t m≈©i n√≥ ra sao
        print(f"   üëâ Vector m·∫´u (3 s·ªë ƒë·∫ßu/384): {embeddings[0][:3]} ...")

        points = []
        for i, doc in enumerate(all_docs):
            points.append(PointStruct(
                id=i, # L∆∞u √Ω: Code g·ªëc c·ªßa b·∫°n d√πng i l√†m ID, c·∫©n th·∫≠n tr√πng n·∫øu ch·∫°y nhi·ªÅu l·∫ßn
                vector=embeddings[i].tolist(),
                payload=doc
            ))
            
        print(f"\nüì¶ [Giai ƒëo·∫°n 4 - ƒê√≥ng g√≥i] Chu·∫©n b·ªã ƒë·∫©y {len(points)} Points v√†o Qdrant.")
        
        self.client.upsert(
            collection_name=Config.COLLECTION_NAME,
            points=points
        )
        print(f"‚úÖ [HO√ÄN T·∫§T] ƒê√£ n·∫°p th√†nh c√¥ng v√†o Qdrant Memory!")

# --- PH·∫¶N 3: CH·∫†Y TH·ª¨ ---
if __name__ == "__main__":
    # D√πng Qdrant ch·∫°y tr√™n RAM (kh√¥ng c·∫ßn c√†i Docker ƒë·ªÉ test)
    client = QdrantClient(":memory:")
    
    ingestor = DataIngestor(client)
    ingestor.ingest()
    
    # Test th·ª≠ t√¨m ki·∫øm ƒë·ªÉ ch·ª©ng minh d·ªØ li·ªáu ƒë√£ v√†o
    print("\n--- üîé TEST T√åM KI·∫æM ---")
    query = "Nh√† h√†ng b√°n m√≥n g√¨ 50000?"
    print(f"C√¢u h·ªèi: {query}")
    
    model = SentenceTransformer(Config.EMBEDDING_MODEL)
    hits = client.search(
        collection_name=Config.COLLECTION_NAME,
        query_vector=model.encode(query).tolist(),
        limit=1
    )
    
    for hit in hits:
        print(f"K·∫øt qu·∫£ t√¨m th·∫•y: {hit.payload['text']}")

‚úÖ ƒê√£ t·∫°o xong d·ªØ li·ªáu m·∫´u (menu.json, restaurant_info.txt)
--------------------------------------------------
‚è≥ ƒêang t·∫£i model all-MiniLM-L6-v2...
‚úÖ Model ƒë√£ s·∫µn s√†ng!

--- B·∫Øt ƒë·∫ßu n·∫°p d·ªØ li·ªáu ---
üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc menu.json: T√¨m th·∫•y 1 danh m·ª•c.
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: "M√≥n ƒÉn: Ph·ªü B√≤ (Beef Noodle Soup). Thu·ªôc lo·∫°i: M√≥n Ch√≠nh. Gi√°: 50000 VND. Ghi ch√∫: ƒê·∫∑c bi·ªát th∆°m ngon."
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] JSON -> Text: "M√≥n ƒÉn: B√∫n Ch·∫£ (Grilled Pork Noodles). Thu·ªôc lo·∫°i: M√≥n Ch√≠nh. Gi√°: 60000 VND. Ghi ch√∫: N∆∞·ªõc ch·∫•m gia truy·ªÅn."
üëÄ [Giai ƒëo·∫°n 1 - ƒê·ªçc File] ƒê√£ ƒë·ªçc info.txt.
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] Raw Text -> Chunk: "Nh√† h√†ng m·ªü c·ª≠a t·ª´ 8:00 s√°ng ƒë·∫øn 10:00 t·ªëi."
   üëâ [Giai ƒëo·∫°n 2 - Bi·∫øn ƒë·ªïi] Raw Text -> Chunk: "Ch√∫ng t√¥i mi·ªÖn ph√≠ g·ª≠i xe cho kh√°ch h√†ng."

‚ö° [Giai ƒëo·∫°n 3 - Vec

  self.client.recreate_collection(


K·∫øt qu·∫£ t√¨m th·∫•y: Nh√† h√†ng m·ªü c·ª≠a t·ª´ 8:00 s√°ng ƒë·∫øn 10:00 t·ªëi.
