**==============================================================**
#### **🧠 Build Embedding Index for RAG Chatbot**
**==============================================================**

In [35]:
import os
import json
from opensearchpy import OpenSearch
from sentence_transformers import SentenceTransformer

#### **==============================================================**
#### **⚙️ Configuration**
#### **==============================================================**

In [36]:
# Name of the OpenSearch index that stores the documents and their embeddings.
INDEX_NAME = "patient_risk_docs"

# Filesystem locations. The defaults are the original machine-specific paths;
# set CHATBOT_DATA_PATH / CHATBOT_EMBED_CACHE_PATH to run the notebook on a
# different machine without editing this cell.
DATA_PATH = os.environ.get(
    "CHATBOT_DATA_PATH",
    r"D:\Patient Risk Prediction\Patient-Risk-Prediction\chatbot\data",
)
EMBED_CACHE_PATH = os.environ.get(
    "CHATBOT_EMBED_CACHE_PATH",
    r"D:\Patient Risk Prediction\Patient-Risk-Prediction\chatbot\cache\embeddings_cache.json",
)

**==============================================================**
#### **Connect to OpenSearch**
**==============================================================**

In [37]:
# Connection settings are read from the environment so credentials do not have
# to be committed with the notebook. The fallbacks reproduce the original
# local-development values (localhost:9200, admin/admin).
# NOTE: use_ssl=False means the connection is not TLS-encrypted, so the basic
# auth credentials travel in clear text; only suitable for a local instance.
client = OpenSearch(
    hosts=[{
        "host": os.environ.get("OPENSEARCH_HOST", "localhost"),
        "port": int(os.environ.get("OPENSEARCH_PORT", "9200")),
    }],
    http_auth=(
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    use_ssl=False,
)

# client.info() issues a request to the cluster, so this doubles as a
# connectivity check before any index is touched.
info = client.info()
print(f"‚úÖ Connected to OpenSearch {info['version']['number']}")

‚úÖ Connected to OpenSearch 2.9.0


**==============================================================**
#### **Recreate Index (delete existing, then create)**
**==============================================================**

In [38]:
# Always start from an empty index: drop any previous copy before creating it.
index_already_present = client.indices.exists(index=INDEX_NAME)
if index_already_present:
    client.indices.delete(index=INDEX_NAME)
    print("üßπ Old index deleted.")

# Index-level k-NN settings: enable k-NN and set the search-time ef parameter
# and the space type used for the vector field.
knn_settings = {
    "knn": True,
    "knn.algo_param.ef_search": 100,
    "knn.space_type": "cosinesimil",
}

# One document per source file: its name, its raw text, and a 384-dimensional
# vector for the embedding.
field_mappings = {
    "filename": {"type": "keyword"},
    "content": {"type": "text"},
    "embedding": {"type": "knn_vector", "dimension": 384},
}

index_body = {
    "settings": {"index": knn_settings},
    "mappings": {"properties": field_mappings},
}

client.indices.create(index=INDEX_NAME, body=index_body)
print(f"‚úÖ Created index: {INDEX_NAME}")

üßπ Old index deleted.
‚úÖ Created index: patient_risk_docs


**==============================================================**
#### **📂 Load Files from Data Folder**
**==============================================================**

In [39]:
# File types that are indexed. Matching is case-insensitive so that e.g.
# "SCHEMA.SQL" is picked up as well.
SUPPORTED_EXTENSIONS = (".sql", ".md", ".txt")

# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and therefore the IDs assigned at upload time) stable between runs.
# Directories whose name happens to end in a supported extension are skipped,
# because open() would fail on them.
files = sorted(
    name
    for name in os.listdir(DATA_PATH)
    if name.lower().endswith(SUPPORTED_EXTENSIONS)
    and os.path.isfile(os.path.join(DATA_PATH, name))
)
documents = []

# Read every file fully into memory as one document: {"filename", "content"}.
for file in files:
    path = os.path.join(DATA_PATH, file)
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    documents.append({"filename": file, "content": text})

print(f"üìÑ Loaded {len(documents)} documents.")

üìÑ Loaded 6 documents.


#### **==============================================================**
#### **🧠 Load Embedding Model (CPU)**
#### **==============================================================**

In [40]:
# Load the sentence-embedding model used for every document.
# NOTE(review): no device is passed, so the library chooses one itself —
# confirm this matches the "CPU" intent stated in the section title.
model = SentenceTransformer("BAAI/bge-small-en")

# Fail fast if the model's output size does not match the knn_vector dimension
# declared in the index mapping; otherwise the mismatch would only surface
# later, when documents are uploaded.
expected_dim = index_body["mappings"]["properties"]["embedding"]["dimension"]
model_dim = model.get_sentence_embedding_dimension()
if model_dim is not None and model_dim != expected_dim:
    raise ValueError(
        f"Embedding model produces {model_dim}-dimensional vectors, "
        f"but the index mapping expects {expected_dim}."
    )

print("üß† Embedding model loaded successfully!")

üß† Embedding model loaded successfully!


#### **==============================================================**
#### **💾 Check for Cached Embeddings**
#### **==============================================================**

In [41]:
# Load previously computed embeddings, keyed by filename. An unreadable cache
# (e.g. a file truncated by an interrupted write) or one that is not a JSON
# object is ignored rather than crashing the notebook; the embeddings are then
# simply recomputed.
cache = {}
if os.path.exists(EMBED_CACHE_PATH):
    try:
        with open(EMBED_CACHE_PATH, "r", encoding="utf-8") as f:
            loaded_cache = json.load(f)
    except json.JSONDecodeError as exc:
        print(f"Ignoring unreadable embedding cache ({exc}); embeddings will be regenerated.")
    else:
        if isinstance(loaded_cache, dict):
            cache = loaded_cache
            print(f"‚ö° Loaded {len(cache)} embeddings from cache.")
        else:
            print("Ignoring embedding cache with unexpected structure; embeddings will be regenerated.")

#### **==============================================================**
#### **🔢 Generate and Cache Embeddings**
#### **==============================================================**

In [42]:
# Embed every loaded document that is missing from the cache OR whose cached
# text no longer matches the file on disk. Checking only the filename would
# keep serving the old content and embedding after a file has been edited.
# Each file is embedded as a single vector from its full text.
# NOTE(review): the model presumably truncates inputs beyond its maximum
# sequence length — confirm whether long files need to be chunked.
for doc in documents:
    cached_entry = cache.get(doc["filename"])
    is_stale = (
        not isinstance(cached_entry, dict)
        or cached_entry.get("content") != doc["content"]
        or "embedding" not in cached_entry
    )
    if is_stale:
        emb = model.encode(doc["content"])
        cache[doc["filename"]] = {
            "content": doc["content"],
            "embedding": emb.tolist(),  # plain list so it is JSON-serialisable
        }

# Make sure the cache directory exists; open(..., "w") does not create it.
cache_dir = os.path.dirname(EMBED_CACHE_PATH)
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)

with open(EMBED_CACHE_PATH, "w", encoding="utf-8") as f:
    json.dump(cache, f)
print(f"üíæ Cached {len(cache)} embeddings at: {EMBED_CACHE_PATH}")

üíæ Cached 6 embeddings at: D:\Patient Risk Prediction\Patient-Risk-Prediction\chatbot\cache\embeddings_cache.json


**==============================================================**
#### **🚀 Upload Documents to OpenSearch**
**==============================================================**

In [43]:
# Only upload cache entries that correspond to a file loaded in this run. The
# cache file persists between runs, so it can still hold entries for files
# that have since been removed from the data folder; those must not end up in
# the freshly created index.
current_filenames = {doc["filename"] for doc in documents}

uploaded = 0
for filename, data in cache.items():
    if filename not in current_filenames:
        continue
    uploaded += 1
    # IDs are 1-based and follow the upload order.
    client.index(index=INDEX_NAME, id=uploaded, body={
        "filename": filename,
        "content": data["content"],
        "embedding": data["embedding"],
    })

# Make the new documents visible to the count/search calls that follow
# instead of relying on the index's periodic refresh.
client.indices.refresh(index=INDEX_NAME)

print(f"üöÄ Uploaded {uploaded} documents to index: {INDEX_NAME}")

üöÄ Uploaded 6 documents to index: patient_risk_docs


**==============================================================**
#### **Verify Count**
**==============================================================**

In [44]:
# Ask OpenSearch how many documents the index reports, then pretty-print the
# raw response as indented JSON.
count = client.count(index=INDEX_NAME)
formatted_count = json.dumps(count, indent=2)
print("üìä Document count:", formatted_count)

üìä Document count: {
  "count": 6,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  }
}


In [26]:
# Fetch a single document and show which fields were stored, as a sanity
# check of the mapping. An empty index is reported instead of raising
# IndexError on hits[0].
res = client.search(index=INDEX_NAME, size=1)
hits = res["hits"]["hits"]
if hits:
    print(hits[0]["_source"].keys())
else:
    print(f"No documents found in index: {INDEX_NAME}")


dict_keys(['filename', 'content', 'embedding'])
