## 04.03 Setting up the Milvus Cache

In [1]:
#Setup database & collection
from pymilvus import connections
from pymilvus import db,Collection

from pymilvus import utility

#Names for connections, database and collections
conn_name = "cache_conn"
db_name="cache_db"
collection_name="llm_cache"

#Imported here so this cell stays self-contained
import os

#Register the connection settings under the alias stored in conn_name.
#Building the keyword from conn_name keeps the alias defined in one place;
#the original spelled it out a second time as a literal keyword argument.
#Credentials default to the original placeholders but can be overridden
#through MILVUS_USERNAME / MILVUS_PASSWORD instead of editing the notebook.
#NOTE(review): confirm that add_connection honours the "username"/"password"
#keys for your pymilvus version - its documentation describes "user", with
#the password supplied to connections.connect().
connections.add_connection(
    **{
        conn_name: {
            "host": "localhost",
            "port": "19530",
            "username" : os.getenv("MILVUS_USERNAME", "username"),
            "password" : os.getenv("MILVUS_PASSWORD", "password")
        }
    })


#Connect
connections.connect(conn_name)

#Create a DB if not already present
current_dbs=db.list_database(using=conn_name)

if db_name not in current_dbs:
    print("Creating database :", db_name)
    db.create_database(db_name, using=conn_name) #default db is "default"
else:
    print(db_name, ": Database already exists")

#Switch to the new database
db.using_database(db_name, using=conn_name)

Creating database : cache_db


In [38]:
#Create a Collection for cache
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
import json

#Define fields in the cache
#Autogenerated ID field for each entity.
#max_length only applies to VARCHAR fields, so it is not set on this INT64 key.
cache_id = FieldSchema(
    name="cache_id",
    dtype=DataType.INT64,
    auto_id=True,
    is_primary=True)

#Text for the input prompt
prompt_text= FieldSchema(
    name="prompt_text",
    dtype=DataType.VARCHAR,
    max_length=2048)

#Text for the LLM response
response_text= FieldSchema(
    name="response_text",
    dtype=DataType.VARCHAR,
    max_length=2048)

#Embedding for the input prompt
prompt_embedding = FieldSchema(
    name="prompt_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=1536 #Define based on embedding used
)

#Define the schema for the cache collection.
#Field order matters: column-based inserts must supply the non-auto fields
#in this order (prompt_text, response_text, prompt_embedding).
cache_schema=CollectionSchema(
    fields=[cache_id, prompt_text, response_text, prompt_embedding],
    description="Cache for LLM",
    enable_dynamic_field=True
)

#Create the collection
#NOTE(review): pymilvus documents the shard-count keyword as num_shards
#(older releases: shards_num); confirm that shard_num is actually honoured.
cache_collection=Collection(
    name=collection_name,
    schema=cache_schema,
    using=conn_name,
    shard_num=2
)

print("Schema : ", cache_collection.schema, "\n")

#Build an index for the prompt embedding field.
#The metric chosen here (L2) must match the metric used at search time.
index_params = {
    "metric_type":"L2",
    "index_type":"IVF_FLAT",
    "params" :{"nlist":1024}
}

cache_collection.create_index(
    field_name="prompt_embedding",
    index_params=index_params
)

#Flush the collection to persist
cache_collection.flush()
#Load the collection in memory
cache_collection.load()

Schema :  {'auto_id': True, 'description': 'Cache for LLM', 'fields': [{'name': 'cache_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'prompt_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'response_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'prompt_embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1536}}], 'enable_dynamic_field': True} 



## 04.04 Inference Process with Caching

In [5]:
# List the installed langchain / pydantic package versions (findstr is a Windows shell command).
!pip list | findstr "langchain pydantic"

langchain                 0.3.15
langchain-community       0.0.20
langchain-core            0.3.31
langchain-openai          0.3.2
langchain-text-splitters  0.3.5
pydantic                  2.10.6
pydantic_core             2.27.2


In [6]:
!pip install --upgrade langchain langchain-community

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-community
  Downloading langchain_community-0.3.15-py3-none-any.whl.metadata (2.9 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Downloading langchain_community-0.3.15-py3-none-any.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ------------------------------------- -- 2.4/2.5 MB 16.8 MB/s eta 0:00:01
   ---------------------------------------- 2.5/2.5 MB 12.0 MB/s eta 0:00:00
Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading pydantic_settings-2.7.1-py3-none-any.whl (29 kB)
Installing collected packages: httpx-sse, pydantic-settings, langchain-community
  Attempting uninstall: langchain-community
    Found existin

In [7]:
# Re-check the langchain / pydantic versions after the upgrade (findstr is a Windows shell command).
!pip list | findstr "langchain pydantic"

langchain                 0.3.15
langchain-community       0.3.15
langchain-core            0.3.31
langchain-openai          0.3.2
langchain-text-splitters  0.3.5
pydantic                  2.10.6
pydantic_core             2.27.2
pydantic-settings         2.7.1


In [9]:
pip install -U langchain-openai

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [10]:
# Confirm the langchain / pydantic versions after updating langchain-openai (findstr is a Windows shell command).
!pip list | findstr "langchain pydantic"

langchain                 0.3.15
langchain-community       0.3.15
langchain-core            0.3.31
langchain-openai          0.3.2
langchain-text-splitters  0.3.5
pydantic                  2.10.6
pydantic_core             2.27.2
pydantic-settings         2.7.1


In [39]:
from transformers import AutoTokenizer
#from langchain.llms import OpenAI
#from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage
from dotenv import load_dotenv
import os
import time

# Load environment variables from .env file
load_dotenv()

# Retrieve the OpenAI API key from the environment and fail fast with a clear
# message when it is missing. The original wrote the value back into
# os.environ, which is a no-op when the key exists and an opaque TypeError
# when it does not.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

#Create an LLM object
#llm = ChatOpenAI(temperature=0., model="gpt-4")
llm = ChatOpenAI(temperature=0., model="gpt-3.5-turbo")

#Setup embedding model for creating embeddings
embeddings_model = OpenAIEmbeddings()

#Threshold used to decide whether a cached prompt is "close enough".
#It is passed to Milvus as the range-search radius below; with the L2 metric
#a smaller distance means a closer match, so this acts as a maximum distance.
similarity_threshold=0.3

#Search settings for the cache lookup. metric_type must match the metric the
#index on the embedding field was built with.
search_params = {
    "metric_type": "L2",
    "offset": 0,
    "ignore_growing": False,
    "params": {"nprobe": 20, "radius":similarity_threshold}
}


 # Function to run the inference loop
def get_response(prompt):
    """Answer ``prompt``, reusing a cached LLM response when one is close enough.

    The prompt is embedded with ``embeddings_model`` and the nearest entry in
    ``cache_collection`` is looked up using ``search_params``. If the search
    returns a hit, its stored ``response_text`` is returned. Otherwise the
    prompt is sent to ``llm`` and the new prompt/response/embedding triple is
    written to the cache.

    Writing to the cache is best-effort: if the insert fails (for example
    because the text exceeds the VARCHAR ``max_length`` of the collection),
    the failure is printed and the LLM answer is still returned instead of
    being lost to the exception.

    Args:
        prompt: The user prompt as plain text.

    Returns:
        The cached or freshly generated response text.
    """
    # Monotonic clock: unaffected by system clock adjustments.
    start_time = time.perf_counter()

    # Create embedding for incoming prompt
    prompt_embed = embeddings_model.embed_query(prompt)

    # Check cache if result exists
    cache_results = cache_collection.search(
        data=[prompt_embed],
        anns_field="prompt_embedding",
        param=search_params,
        limit=1,  # Look for the top result only
        expr=None,
        output_fields=["prompt_text", "response_text"],
        consistency_level="Strong"
    )
    # One query vector was sent, so only the first result set is relevant.
    hits = cache_results[0]

    if len(hits) > 0:
        # Cache hit
        print(prompt, ":\n Cache hit: ", hits)
        returned_response = hits[0].entity.get("response_text")
    else:
        # Cache miss: ask the chat model
        messages = [
            SystemMessage(content="You are a helpful assistant."),
            HumanMessage(content=prompt)
        ]
        llm_response = llm.invoke(messages)
        returned_response = llm_response.content
        print(prompt, ":\n LLM returned:", returned_response)

        # Save prompt/response to cache. Columns follow the schema order:
        # prompt_text, response_text, prompt_embedding (cache_id is auto-generated).
        insert_data = [[prompt], [returned_response], [prompt_embed]]
        try:
            cache_collection.insert(insert_data)
        except Exception as exc:
            # The answer is already in hand; report the failure and carry on.
            print("Warning: response was not cached:", exc)

    end_time = time.perf_counter()
    print("Time elapsed:", end_time - start_time, "\n")
    return returned_response

In [40]:
#Build up the cache: each prompt is new, so every call goes to the LLM
#and stores its answer. `response` ends up holding the last answer.
seed_prompts = [
    "In which year was Abraham Lincoln born?",
    "What is distance between the sun and the moon?",
    "How many years have Lebron James played in the NBA?",
    "What are the advantages of the python language?",
    "What is the typical height of an elephant",
]
for seed_prompt in seed_prompts:
    response = get_response(seed_prompt)


In which year was Abraham Lincoln born? :
 LLM returned: Abraham Lincoln was born on February 12, 1809.
Time elapsed: 1.7532482147216797 

What is distance between the sun and the moon? :
 LLM returned: The average distance between the Sun and the Moon is about 238,855 miles (384,400 kilometers). This distance can vary slightly due to the elliptical orbits of both the Earth around the Sun and the Moon around the Earth.
Time elapsed: 1.685469627380371 

How many years have Lebron James played in the NBA? :
 LLM returned: As of the 2021-2022 NBA season, LeBron James has played 19 seasons in the NBA. He made his debut in the 2003-2004 season.
Time elapsed: 1.3154313564300537 

What are the advantages of the python language? :
 LLM returned: Python is a popular programming language known for its simplicity and readability. Some of the advantages of Python include:

1. Easy to learn and use: Python has a simple and clean syntax that makes it easy for beginners to learn and understand.

2. V

In [41]:
#Re-ask two of the earlier questions with different wording to exercise the cache lookup.
reworded_prompts = (
    "List some advantages of the python language",
    "How tall is an elephant?",
)
for reworded_prompt in reworded_prompts:
    response = get_response(reworded_prompt)

List some advantages of the python language :
 Cache hit:  ['id: 455601127240531277, distance: 0.048949599266052246, entity: {\'prompt_text\': \'What are the advantages of the python language?\', \'response_text\': "Python is a popular programming language known for its simplicity and readability. Some of the advantages of Python include:\\n\\n1. Easy to learn and use: Python has a simple and clean syntax that makes it easy for beginners to learn and understand.\\n\\n2. Versatile: Python can be used for a wide range of applications, including web development, data analysis, artificial intelligence, machine learning, and more.\\n\\n3. Large standard library: Python comes with a large standard library that provides ready-to-use modules and packages for various tasks, reducing the need to write code from scratch.\\n\\n4. Community support: Python has a large and active community of developers who contribute to its growth and provide support through forums, tutorials, and documentation.\\n