In [1]:
import os
from functools import lru_cache
from pprint import pprint

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.tools import tool
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from pyprojroot import here
load_dotenv()

True

**Load environment variables and configs**

In [2]:
# Read the key from the project-specific variable and fail fast when it is absent or empty.
openai_api_key = os.environ.get("OPEN_AI_API_KEY")
if openai_api_key is None or openai_api_key == "":
    raise ValueError("OPEN_AI_API_KEY environment variable not set.")
# Mirror the key under OPENAI_API_KEY; the OpenAI()/OpenAIEmbeddings objects created
# later are given no explicit key, so presumably they read it from this variable.
os.environ["OPENAI_API_KEY"] = openai_api_key

# Retrieval settings shared by the cells below.
EMBEDDING_MODEL = "text-embedding-3-small"  # model name passed to OpenAIEmbeddings
VECTORDB_DIR = "data/csv_vectordb"  # relative location of the persisted Chroma store
K = 3  # number of documents requested from each similarity search

**Load the vectorDB**

In [3]:
# Embed queries with the model configured in EMBEDDING_MODEL.
embedding_fn = OpenAIEmbeddings(model=EMBEDDING_MODEL)
# `here` (pyprojroot) turns the relative VECTORDB_DIR into the path handed to Chroma.
persist_path = str(here(VECTORDB_DIR))

vectordb = Chroma(
    collection_name="csv-rag-chroma",
    persist_directory=persist_path,
    embedding_function=embedding_fn,
)

# Sanity check: report how many vectors the opened collection holds.
vector_count = vectordb._collection.count()
print("Number of vectors in vectordb:", vector_count, "\n\n")

Number of vectors in vectordb: 21024 




**Sample Query**

In [4]:
# Example user question; reused below for the similarity search and the prompt.
message = "How many clusters are there in the data?"

**Perform the vector search**

In [5]:
# Fetch the K stored documents most similar to the question.
docs = vectordb.similarity_search(message, k=K)

In [6]:
# Display the raw Document objects returned by the search.
# NOTE(review): the rendered repr includes each document's metadata (row number and the
# absolute source path of the CSV) — consider clearing this output before sharing.
docs

[Document(metadata={'row': 9989, 'source': 'C:\\Users\\Devansh1.Sharma\\Desktop\\Final\\data\\docs\\csv_files\\cluster_data.csv'}, page_content='9afe5b1297555a8dd87cbda7672ce36666f0a4689acb3d52072118a06d9a82764a094528875fe0d9326cf8bc5e183e06f9e850ab997e52b3cf6024b\nESCALATION_ROLE: \nWAR_ROOM_NOTIFIED: N\nIS_IMPACT: N\nPRODUCT_CATEGORY: \ndescription: Node Network Interface - packets outbound errors on Node nvmbhcicl01n04\ntitle: Node Network Interface - packets outbound errors on Node nvmbhcicl01n04\nkpiName: HCIMachine\nalert_type: Network Interface_packets_outbound_errors\nip_address: \nservice_name: \nclean_description: node network interface packets outbound errors on node nvmbhcicln\nenhanced_cluster_id: 35\nenhanced_cluster_name: Cluster_35'),
 Document(metadata={'row': 7655, 'source': 'C:\\Users\\Devansh1.Sharma\\Desktop\\Final\\data\\docs\\csv_files\\cluster_data.csv'}, page_content='0\nMETRIC_ID: 2023021716356527.0\nASSETID: 202502171559218.0\nENVIORMENT: Production\nASSET_TY

**Prepare the prompt for the GPT model**

In [7]:
# Concatenate the retrieved chunks, each one followed by a blank line.
retrieved_content = "".join(f"{doc.page_content}\n\n" for doc in docs)
# The user's question goes under its own heading, after the retrieved content.
question = f"# User new question:\n{message}"
prompt = f"# Content:\n{retrieved_content}\n\n{question}"

**Prepared prompt**

In [8]:
# Show the exact text that is sent to the model as the user message.
print(prompt)

# Content:
9afe5b1297555a8dd87cbda7672ce36666f0a4689acb3d52072118a06d9a82764a094528875fe0d9326cf8bc5e183e06f9e850ab997e52b3cf6024b
ESCALATION_ROLE: 
WAR_ROOM_NOTIFIED: N
IS_IMPACT: N
PRODUCT_CATEGORY: 
description: Node Network Interface - packets outbound errors on Node nvmbhcicl01n04
title: Node Network Interface - packets outbound errors on Node nvmbhcicl01n04
kpiName: HCIMachine
alert_type: Network Interface_packets_outbound_errors
ip_address: 
service_name: 
clean_description: node network interface packets outbound errors on node nvmbhcicln
enhanced_cluster_id: 35
enhanced_cluster_name: Cluster_35

0
METRIC_ID: 2023021716356527.0
ASSETID: 202502171559218.0
ENVIORMENT: Production
ASSET_TYPE: 
HIERARCHY_NAME: QUALITY ASSURANCE & TESTING
TECHNOLOGY_CATEGORY: Operating System
TECHNOLOGY_NAME: RHEL
METRIC_NAME: filesystem_utilization
COLLABORATION_FLAG: Y
INSTANCE_ID: 501.0
DESCRIPTOR_ID: 202403291558158.0
DESCRIPTOR_NAME: Filesystem Utilization Percentage
MASTER_ALERT_ID: 
ALERT_UUID

**Pass the prompt to the GPT model and get the response**

In [9]:
# System instruction: answer from the supplied content, or say the answer does not exist.
system_message = {
    "role": "system",
    "content": "You will receive a user's query and possible content where the answer might be. If the answer is found, provide it, if not, state that the answer does not exist.",
}
# The user turn carries the assembled prompt (retrieved content followed by the question).
user_message = {"role": "user", "content": prompt}

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[system_message, user_message],
)

**Print the response**

In [10]:
# Print the text of the first returned choice; pprint renders it as a quoted string literal.
pprint(response.choices[0].message.content)

'There are two clusters in the data: Cluster_35 and Cluster_18.'


**RAG Tool design using LangChain**

In [15]:
@lru_cache(maxsize=None)
def _get_vectordb(persist_directory: str, collection_name: str, embedding_model: str) -> Chroma:
    # Open the persisted Chroma collection once per distinct configuration so that
    # repeated tool calls reuse the same client and embedding object instead of
    # rebuilding both on every invocation.
    return Chroma(
        collection_name=collection_name,
        persist_directory=persist_directory,
        embedding_function=OpenAIEmbeddings(model=embedding_model),
    )


@tool
def lookup_rag_information(query: str) -> str:
    """Search within the VECTORDB to find the relevant information. Input should be a search query."""
    # The docstring above is published as the tool's description, so it is kept verbatim;
    # implementation notes live in comments instead.
    #
    # query: free-text search string.
    # Returns the page_content of the K most similar documents, separated by blank
    # lines (an empty string when the search returns nothing).
    store = _get_vectordb(str(here(VECTORDB_DIR)), "csv-rag-chroma", EMBEDDING_MODEL)
    matches = store.similarity_search(query, k=K)
    return "\n\n".join(match.page_content for match in matches)

In [16]:
# Print what the decorated tool exposes: its name, argument schema, and description.
for attribute in ("name", "args", "description"):
    print(getattr(lookup_rag_information, attribute))

lookup_rag_information
{'query': {'title': 'Query', 'type': 'string'}}
Search within the VECTORDB to find the relevant information. Input should be a search query.


In [17]:
# Call the tool directly with a plain query string and pretty-print the text it returns.
tool_output = lookup_rag_information.invoke("how many clusters are there in the data?")
pprint(tool_output)

('9afe5b1297555a8dd87cbda7672ce36666f0a4689acb3d52072118a06d9a82764a094528875fe0d9326cf8bc5e183e06f9e850ab997e52b3cf6024b\n'
 'ESCALATION_ROLE: \n'
 'WAR_ROOM_NOTIFIED: N\n'
 'IS_IMPACT: N\n'
 'PRODUCT_CATEGORY: \n'
 'description: Node Network Interface - packets outbound errors on Node '
 'nvmbhcicl01n04\n'
 'title: Node Network Interface - packets outbound errors on Node '
 'nvmbhcicl01n04\n'
 'kpiName: HCIMachine\n'
 'alert_type: Network Interface_packets_outbound_errors\n'
 'ip_address: \n'
 'service_name: \n'
 'clean_description: node network interface packets outbound errors on node '
 'nvmbhcicln\n'
 'enhanced_cluster_id: 35\n'
 'enhanced_cluster_name: Cluster_35\n'
 '\n'
 '0\n'
 'METRIC_ID: 2023021716356527.0\n'
 'ASSETID: 202502171559218.0\n'
 'ENVIORMENT: Production\n'
 'ASSET_TYPE: \n'
 'HIERARCHY_NAME: QUALITY ASSURANCE & TESTING\n'
 'TECHNOLOGY_CATEGORY: Operating System\n'
 'TECHNOLOGY_NAME: RHEL\n'
 'METRIC_NAME: filesystem_utilization\n'
 'COLLABORATION_FLAG: Y\n'
 'INS