In [36]:
from openai import OpenAI
from dashvector import Client as DashClient, Doc
from tqdm import tqdm
import os

# === Setup clients ===
client = OpenAI(
    api_key="sk-91d59f4cf5ff44cf9f8280a91dacbb1c",
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
)

dashvector_client = DashClient(
    api_key="sk-eAO7D10gZP2kzhjeKmiI7b6bOHTmoD6B9CAC6318F11F097B99A92126D810D",
    endpoint="vrs-sg-l0z49hq250003n.dashvector.ap-southeast-1.aliyuncs.com"
)

In [37]:
from docx import Document

def load_docx_chunks(path, chunk_size=300):
    doc = Document(path)
    full_text = []

    for para in doc.paragraphs:
        text = para.text.strip()
        if text:
            full_text.append(text)

    # Optional: join and chunk by characters or sentences
    combined_text = "\n".join(full_text)

    # Chunk into fixed size (you can also use sentence split here)
    chunks = [combined_text[i:i+chunk_size] for i in range(0, len(combined_text), chunk_size)]
    return chunks


In [38]:
from nltk.tokenize import sent_tokenize  # Example for sentence-level chunking
def load_docx_chunks(path):
    doc = Document(path)
    text = "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
    sentences = sent_tokenize(text)
    # Join every 3 sentences into a single string
    return [" ".join(sentences[i:i+3]) for i in range(0, len(sentences), 3)]

In [39]:
chunks = load_docx_chunks("insurance_policy_company_A.docx")

In [40]:
chunks

['ABC Motor Insurance Company\nDummy Internal Policy Document: Motor Accident Claims\n1. Introduction\nABC Motor Insurance Company provides comprehensive motor insurance coverage to eligible policyholders. This document outlines key policy terms, coverage limits, claim procedures, and decision criteria to support the Retrieval-Augmented Generation (RAG) chatbot in delivering accurate, grounded responses to user inquiries.',
 '2. Definitions\nPolicyholder: The individual or entity named on the insurance policy. Insured Vehicle: The registered vehicle covered under the policy.',
 'At-Fault Collision: Accident where the insured is responsible for damages. Third-Party: Any other party involved in the accident. Coverage Limit: Maximum amount payable by the Company for a claim.',
 'Deductible: The portion of repair costs the policyholder must bear before Company payment. 3. Eligibility & Policy Conditions\nPolicies must be active (non-expired) at the time of the accident.',
 'Vehicles older 

In [41]:
# === Helper: batching ===
def batch_list(lst, n=10):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

all_embeddings = []
for chunk in tqdm(chunks, desc="Embedding chunks"):
    response = client.embeddings.create(
        model="text-embedding-v3",
        input=chunk,  # Pass a single string
        encoding_format="float"
    )
    all_embeddings.append(response.data[0].embedding)

Embedding chunks: 100%|████████████████████████████████████████████████████████████████| 22/22 [00:02<00:00,  8.32it/s]


In [42]:
assert len(all_embeddings[0]) == 1024, "Embedding dimension mismatch"
import numpy as np
all_embeddings = [vec / np.linalg.norm(vec) for vec in all_embeddings]

In [43]:
# === Create collection if not exists ===
collection_name = "quickstart"
dashvector_client.create(name=collection_name, dimension=1024)
collection = dashvector_client.get(name=collection_name)

In [44]:
source_name = "insurance_policy_company_A.docx"  # ✅ Define the variable first
docs = [
    Doc(
        id=str(i),
        vector=embedding,
        fields={
            "text": chunks[i],
            "source": source_name,  # Now valid
            "chunk_index": i
        }
    )
    for i, embedding in enumerate(all_embeddings)
]

for i, chunk in enumerate(chunks):
    doc = Doc(
        id=str(i),
        vector=all_embeddings[i],
        fields={
            "text": chunk,  # Now a string (not a list)
            "source": source_name,
            "chunk_index": i
        }
    )
    assert isinstance(doc.fields["text"], str), "Text field must be a string"

In [45]:
resp = collection.insert(docs)
if resp.code != 0:
    print(f"Upload failed: {resp.message}")
else:
    print(f"Uploaded {len(docs)} documents")

Uploaded 22 documents


In [10]:
# docs

In [11]:
# RAG

In [32]:
from openai import OpenAI
import os
from tqdm import tqdm

# Setup DashScope OpenAI-compatible client
client = OpenAI(
    api_key="sk-91d59f4cf5ff44cf9f8280a91dacbb1c",
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
)


query = "What is the coverage limit for Third-Party liability under Comprehensive Cover?"
response = client.embeddings.create(
    model="text-embedding-v3",
    input=query,
    encoding_format="float"
)
query_embedding = response.data[0].embedding  # Extract the embedding
query_embedding = np.array(response.data[0].embedding)
query_embedding = query_embedding / np.linalg.norm(query_embedding)  # L2 normalize

In [33]:
# Verify embedding dimensions
print(f"Query embedding dim: {len(query_embedding)}")  # Should be 1024
print(f"Doc embedding dim: {len(all_embeddings[0])}")   # Should be 1024

# Ensure embeddings are lists of floats (not arrays)
query_embedding = [float(x) for x in query_embedding]  # Convert numpy array to list

Query embedding dim: 1024
Doc embedding dim: 1024


In [34]:
# Step 2: Search DashVector for top 5 similar docs
collection = dashvector_client.get(name="quickstart")
stats = collection.stats()  # Check document count
print(f"Collection stats: {stats}")  # Should show "count": len(docs)

search_results = collection.query(query_embedding, topk=10)  # Increase topk to 10
print("Retrieved chunks:")
for i, doc in enumerate(search_results):
    print(f"Rank {i+1} | Score: {doc.score} | Text: {doc.fields['text']}")



Collection stats: {"code": 0, "message": "", "requests_id": "57350736-3e2b-43a7-a872-ce11c51c7594", "output": {"total_doc_count": 22, "index_completeness": 1.0, "partitions": {"default": {"total_doc_count": 22}}}}
Retrieved chunks:
Rank 1 | Score: 0.2236 | Text: Coverage Types & Limits
Comprehensive Cover:
Collision damage (own vehicle): up to Market Value less Deductible. Third-Party liability: up to MYR 1,000,000 per claim. Third-Party, Fire & Theft:
Fire/theft damage: up to Market Value less Deductible.
Rank 2 | Score: 0.2828 | Text: At-Fault Collision: Accident where the insured is responsible for damages. Third-Party: Any other party involved in the accident. Coverage Limit: Maximum amount payable by the Company for a claim.
Rank 3 | Score: 0.3026 | Text: Third-Party liability: up to MYR 1,000,000. Basic Third-Party Only:
Third-Party liability: up to MYR 300,000. 5.
Rank 4 | Score: 0.4272 | Text: Deductible: The portion of repair costs the policyholder must bear before Company pay

In [35]:
# Step 3: Extract retrieved chunks (assuming `fields` exist in result docs)
retrieved_chunks = [doc.fields["text"] for doc in search_results]

# Step 4: Prepare prompt for Qwen with retrieved context + question
context_text = "\n---\n".join(retrieved_chunks)

prompt = f"""You are an AI assistant specialized in answering questions about insurance policies.
Your task is to provide accurate answers based solely on the provided context from the policy document.
If the context does not contain the answer, respond with "The information is not available in the provided documents.

Based on the following context:

{''.join([f'[{j+1}] {chunk}' for j, chunk in enumerate(retrieved_chunks)])}

Answer the question:
{query}

Instructions:
- Use only the context above to formulate your answer.
- If the answer is unclear, state that it is not found in the provided context.
"""

# Step 5: Call Qwen model for completion
completion = client.chat.completions.create(
    model="qwen-turbo",
    messages=[{"role": "user", "content": prompt}],
    temperature=0.7,
    max_tokens=512,
)

print(completion.choices[0].message.content)


The coverage limit for Third-Party liability under Comprehensive Cover is up to MYR 1,000,000 per claim.


In [15]:
# # if wanna delete vector db
# # -----------------------------------------

# # Delete the existing collection (if it exists)
# try:
#     client.delete(name="quickstart")
#     print("Deleted existing collection.")
# except Exception as e:
#     print("No existing collection or delete failed:", e)

No existing collection or delete failed: delete() got an unexpected keyword argument 'name'
