# Milvus - LlamaIndex - Phoenix integration

In [1]:
!python3 -m pip install "milvus[client]"
!pip install -q arize-phoenix gcsfs llama-index
!pip install google-cloud-storage
!pip install pymilvus



# Necessary imports

In [4]:
import json
import pandas as pd
from gcsfs import GCSFileSystem
from llama_index import StorageContext, load_index_from_storage
from llama_index.vector_stores import MilvusVectorStore, SimpleVectorStore
from llama_index import SimpleDirectoryReader
from llama_index import VectorStoreIndex, Document
from milvus import default_server
from pymilvus import (
    connections,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    utility,
)
from datetime import timedelta
from functools import reduce
import hashlib
import json
import logging
import os
import sys
import tempfile
import textwrap
from tqdm import tqdm
from typing import Any, Dict, List, Tuple
import urllib
import zipfile

from gcsfs import GCSFileSystem
from IPython.display import YouTubeVideo
from llama_index.graph_stores.simple import SimpleGraphStore
from langchain.chat_models import ChatOpenAI
from llama_index import StorageContext, load_index_from_storage
from llama_index.callbacks import CallbackManager, OpenInferenceCallbackHandler
from llama_index.callbacks.open_inference_callback import as_dataframe
from llama_index.embeddings.base import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.indices.query.schema import QueryBundle
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response.schema import Response
from llama_index import ServiceContext, LLMPredictor
from llama_index import StorageContext, load_index_from_storage
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.response.schema import Response
import numpy as np
import numpy.typing as npt
import openai
import pandas as pd
import phoenix as px
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)


# Let pandas display up to 1000 characters per cell before truncating column text.
pd.set_option("display.max_colwidth", 1000)

# Replace the placeholder with a real key before running the notebook
# (do not commit the notebook with a real key in it).
openai_api_key = "place your openai api key in here"
# Fail fast while the placeholder is still in place. The comparison must be `!=`:
# with `==` the check passed only when no key had been set and failed for real keys.
assert openai_api_key != "place your openai api key in here", "❌ Please set your OpenAI API key"
# Expose the key both to the openai module and through the OPENAI_API_KEY
# environment variable.
openai.api_key = openai_api_key
os.environ["OPENAI_API_KEY"] = openai_api_key

# Load the data from the dataset file and convert it into the format required for Milvus insertion

In [5]:
# Parse the exported dataset; the file handle is only needed while decoding the JSON.
with open("dataset.json", "r") as dataset_file:
    data = json.load(dataset_file)

# Every element of "rows" is one record that will later be inserted into Milvus.
rows = data['rows']

# The collection stores "node" as a string column, so dict-valued nodes are
# serialized to JSON text; values that are not dicts are left untouched.
for row in rows:
    if not isinstance(row['node'], dict):
        continue
    row['node'] = json.dumps(row['node'])

# Create a Milvus Lite connection and insert the data into it


In [6]:
# Run the whole pipeline inside the embedded Milvus Lite server's context manager:
# connect, create and fill the collection, build the LlamaIndex query engine on top
# of it, and run the sample queries. Everything that talks to Milvus stays inside
# this block (presumably the server stops when the block exits).
with default_server:
    connections.connect(host='localhost', port=default_server.listen_port)
    collection_name = 'colab_collection'

    # Fields of the collection: a string primary key supplied by the dataset
    # (auto_id=False), a 1536-dimensional float embedding, and three string columns.
    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=65535),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1536),
        FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="node", dtype=DataType.VARCHAR, max_length=65535),
    ]

    # NOTE: the description text is kept exactly as it was, so the schema still
    # matches a collection that an earlier run may have left on disk.
    schema = CollectionSchema(fields=fields, description="this is a collection basedc on arize docs")

    # Column-oriented payload: one list per schema field, in schema order, each
    # holding that field's value for every record in `rows`.
    insert_data = [
        [record[field.name] for record in rows] for field in schema.fields
    ]

    # Create the collection with this schema and insert the data.
    collection = Collection(name=collection_name, schema=schema)
    mr = collection.insert(insert_data)
    # Seal the inserted entities before the vector store starts using the collection.
    collection.flush()

    file_system = GCSFileSystem(project="public-assets-275721")
    index_path = "arize-assets/phoenix/datasets/unstructured/llm/llama-index/arize-docs/index/"
    vector_store = MilvusVectorStore(
        host='localhost',
        port=default_server.listen_port,
        # Reuse the collection populated above instead of repeating the literal name.
        collection_name=collection_name,
        # Must be the boolean False: the string 'False' is truthy and would request
        # overwriting the collection that was just filled.
        overwrite=False,
    )

    # Index metadata is read from the public GCS bucket, while vectors are served
    # by the Milvus collection.
    storage_context = StorageContext.from_defaults(
        fs=file_system,
        persist_dir=index_path,
        graph_store=SimpleGraphStore(),
        vector_store=vector_store,
    )

    # The callback handler is registered on the service context used for the index.
    callback_handler = OpenInferenceCallbackHandler()
    service_context = ServiceContext.from_defaults(
        llm_predictor=LLMPredictor(llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)),
        embed_model=OpenAIEmbedding(model="text-embedding-ada-002"),
        callback_manager=CallbackManager(handlers=[callback_handler]),
    )

    index = load_index_from_storage(
        storage_context,
        service_context=service_context,
    )

    query_engine = index.as_query_engine()

    # Run a few sample questions and print each query/response pair wrapped to
    # `max_line_length` columns.
    max_line_length = 80
    for query in [
        "How do I get an Arize API key?",
        "Can I create monitors with an API?",
        "How do I need to format timestamps?",
        "What is the price of the Arize platform",
    ]:
        print("Query")
        print("=====")
        print()
        print(textwrap.fill(query, max_line_length))
        print()
        response = query_engine.query(query)
        print("Response")
        print("========")
        print()
        print(textwrap.fill(str(response), max_line_length))
        print()



    __  _________ _   ____  ______
   /  |/  /  _/ /| | / / / / / __/
  / /|_/ // // /_| |/ / /_/ /\ \
 /_/  /_/___/____/___/\____/___/ {Lite}

 Welcome to use Milvus!

 Version:   v2.2.12-lite
 Process:   61956
 Started:   2023-08-07 12:36:39
 Config:    /home/ruiciro/.milvus.io/milvus-server/2.2.12/configs/milvus.yaml
 Logs:      /home/ruiciro/.milvus.io/milvus-server/2.2.12/logs

 Ctrl+C to exit ...
Query
=====

How do I get an Arize API key?

Response

To get an Arize API key, you need to click the 'Get Your API Key' button on the
top right of the Explorer page.

Query
=====

Can I create monitors with an API?

Response

Yes, you can create monitors with an API using the public-facing graphQL API
provided.

Query
=====

How do I need to format timestamps?

Response

To format timestamps, you can use either seconds or an RFC3339 timestamp. If you
don't provide a timestamp, it will automatically default to the time when the
file was ingested.

Query
=====

What is the price of the 