# RAG with LlamaIndex + Milvus + Qwen - Part 1

References

- https://studio.nebius.com/
- https://docs.llamaindex.ai/en/stable/examples/vector_stores/MilvusIndexDemo/
- https://docs.llamaindex.ai/en/stable/api_reference/storage/vector_store/milvus/?h=milvusvectorstore#llama_index.vector_stores.milvus.MilvusVectorStore

In [None]:
import requests

def test_nebius_connection():
    """Check that the Nebius AI Studio API is reachable with the configured key.

    The API key is read from the ``NEBIUS_API_KEY`` environment variable
    (populated from a ``.env`` file when python-dotenv is installed). Secrets
    must never be embedded in the notebook source.

    Prints the HTTP status code and response body of the probe request, or a
    diagnostic message when the key is missing or the request fails.
    Returns None in every case.
    """
    import os

    # Pick up a key stored in .env when python-dotenv is available. A missing
    # package is not an error: the key may already be exported in the shell.
    try:
        from dotenv import load_dotenv
    except ImportError:
        pass
    else:
        load_dotenv()

    api_key = os.getenv("NEBIUS_API_KEY")
    if not api_key:
        print("Error: NEBIUS_API_KEY is not set; add it to .env or export it.")
        return

    # Strip the trailing slash so appending a path never produces "//".
    api_base = "https://api.studio.nebius.com/v1/".rstrip("/")

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    try:
        # Probe the model-listing endpoint, which accepts GET on
        # OpenAI-compatible APIs; /embeddings expects a POST with a body.
        response = requests.get(
            f"{api_base}/models",
            headers=headers,
            timeout=10,
        )
    except Exception as e:
        # Diagnostic cell: report the failure instead of aborting the notebook.
        print(f"Error: {e}")
    else:
        print(f"Status Code: {response.status_code}")
        print(f"Response: {response.text}")


test_nebius_connection()

Status Code: 404
Response: {"detail":"Not Found"}


In [None]:
! pip install -r requirements.txt

Ignoring appnope: markers 'sys_platform == "darwin"' don't match your environment
Ignoring cffi: markers 'implementation_name == "pypy"' don't match your environment
Ignoring pycparser: markers 'implementation_name == "pypy"' don't match your environment
Ignoring pywin32: markers 'platform_python_implementation != "PyPy" and sys_platform == "win32"' don't match your environment


## Step-1: Configuration

In [None]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast when the Nebius key is absent; otherwise confirm it was found.
if not os.getenv('NEBIUS_API_KEY'):
    raise ValueError("❌ NEBIUS_API_KEY not found in environment. Please set it in .env file before running this script.")
print("✅ Found NEBIUS_API_KEY in environment, using it")

✅ Found NEBIUS_API_KEY in environment, using it


In [None]:
# Re-read .env so this cell also works when run on its own.
load_dotenv()

# Fail fast when the OpenAI key is absent; otherwise confirm it was found.
if not os.getenv('OPENAI_API_KEY'):
    raise ValueError("❌ OPENAI_API_KEY not found in environment. Please set it in .env file before running this script.")
print("✅ Found OPENAI_API_KEY in environment, using it")

✅ Found OPENAI_API_KEY in environment, using it


## Step-2: Read documents

In [None]:
%%time

from llama_index.core import SimpleDirectoryReader
import pprint

# Load the files found under ./test_data into llama-index documents.
documents = SimpleDirectoryReader(input_dir='./test_data').load_data()

# Report how many items the reader returned.
print(f"Loaded {len(documents)} chunks")

# print("Document [0].doc_id:", documents[0].doc_id)
# pprint.pprint (documents[0], indent=4)

Loaded 14 chunks
CPU times: user 1.06 s, sys: 1.5 ms, total: 1.06 s
Wall time: 1.08 s


## Step-3: Setup Embedding Model

We can either run the embedding model locally (fast) or call one hosted in the cloud.

If running locally:
- choose smaller models
- lower accuracy, but faster

If running in the cloud:
- we can run large models (billions of parameters)

In [None]:
import os
from llama_index.core import Settings

# Option 1: Running embedding models on Nebius cloud
from llama_index.embeddings.nebius import NebiusEmbedding
# Hosted embedding model (8B parameters) served through Nebius.
EMBEDDING_MODEL = 'Qwen/Qwen3-Embedding-8B'
# Size of each embedding vector; reused when the Milvus store is configured.
EMBEDDING_LENGTH = 4096

# Register the Nebius embedder as the global llama-index embedding model.
Settings.embed_model = NebiusEmbedding(
    # Passed explicitly; if omitted, it is taken from the env variable.
    api_key=os.getenv("NEBIUS_API_KEY"),
    model_name=EMBEDDING_MODEL,
    # Batch size for embedding (default is 10).
    embed_batch_size=50,
)

## Option 2: Running embedding models locally
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# Settings.embed_model = HuggingFaceEmbedding(
#     # model_name = 'sentence-transformers/all-MiniLM-L6-v2' # 23 M params
#     model_name = 'BAAI/bge-small-en-v1.5'  # 33M params
#     # model_name = 'Qwen/Qwen3-Embedding-0.6B'  # 600M params
#     # model_name = 'BAAI/bge-en-icl'  # 7B params
#     #model_name = 'intfloat/multilingual-e5-large-instruct'  # 560M params
# )



## Step-4: Connect to Milvus

In [None]:
from pymilvus import MilvusClient

# A local file path selects the embedded Milvus instance.
DB_URI = './rag.db'
COLLECTION_NAME = 'rag'

milvus_client = MilvusClient(uri=DB_URI)
print("✅ Connected to Milvus instance: ", DB_URI)

# Start from a clean slate: drop a collection left over from an earlier run.
if milvus_client.has_collection(COLLECTION_NAME):
    milvus_client.drop_collection(COLLECTION_NAME)
    print('✅ Cleared collection :', COLLECTION_NAME)


✅ Connected to Milvus instance:  ./rag.db
✅ Cleared collection : rag


In [None]:
%%time

# connect to vector db
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

# Point llama-index at the same Milvus database and collection; the vector
# dimension must match the embedding length configured earlier.
vector_store = MilvusVectorStore(
    uri=DB_URI,
    collection_name=COLLECTION_NAME,
    dim=EMBEDDING_LENGTH,
    overwrite=True,
)

# Wrap the store in a storage context for use when building the index.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print("✅ Connected Llama-index to Milvus instance: ", DB_URI)

✅ Connected Llama-index to Milvus instance:  ./rag.db
CPU times: user 7.9 ms, sys: 0 ns, total: 7.9 ms
Wall time: 525 ms


## Step-5: Create Index and Save to DB

In [None]:
%%time

# create an index

from llama_index.core import VectorStoreIndex

print("⚙️ Creating index from documents...")

# Build the index from the loaded documents using the Milvus-backed storage
# context configured above.
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
)

print("✅ Created index:", index)
print("✅ Saved index to db ", DB_URI)

⚙️ Creating index from documents...
✅ Created index: <llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x78f5a44c4e00>
✅ Saved index to db  ./rag.db
CPU times: user 176 ms, sys: 39.6 ms, total: 216 ms
Wall time: 7.94 s
