In [1]:
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymilvus import MilvusClient

from tqdm.auto import tqdm
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
API_KEY=os.getenv('OPEN_API_KEY')
zilli_api_key=os.getenv('zilliAPI_KEY')


In [3]:
def extract_text_with_pypdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

In [4]:
# pdf_file_path = "../data/ifrs_doc.pdf"
pdf_file_path = "../data/ifrs_800pgs.pdf"

In [5]:
text = extract_text_with_pypdf(pdf_file_path)

In [6]:
client = OpenAI(api_key=API_KEY)


In [7]:
## chunking texts:

chunk_size = 6000 #openai model limit is 8191
chunk_overlap = 300

text_splitter = RecursiveCharacterTextSplitter(
                                                chunk_size=chunk_size,
                                                chunk_overlap=chunk_overlap
                                            )
chunks = text_splitter.split_text(text)

In [8]:
# embedding the chunks;
def emb_text(text):
    return (
        client.embeddings.create(input=text, model="text-embedding-3-small")
        .data[0]
        .embedding
    )

In [9]:

embeddings = []

for chunk in tqdm(chunks, total=len(chunks)):
    embeddings.append(emb_text(chunk))

  0%|          | 0/350 [00:00<?, ?it/s]

100%|██████████| 350/350 [02:20<00:00,  2.49it/s]


In [14]:
len(embeddings[0])

1536

In [15]:
embedding_dim = len(embeddings[0])

In [11]:
milvus_client = MilvusClient(uri="https://in03-cdcb64273091f72.serverless.gcp-us-west1.cloud.zilliz.com",
                             token=zilli_api_key)

collection_name = "demo_rag_collection"


In [35]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, connections, utility


In [33]:
milvus_client = connections.connect(uri="https://in03-cdcb64273091f72.serverless.gcp-us-west1.cloud.zilliz.com",
                             token=zilli_api_key)

collection_name = "demo_rag_collection"


In [36]:
if collection_name in utility.list_collections():
    # Drop the collection if it exists
    utility.drop_collection(collection_name)
    print(f"Collection '{collection_name}' has been dropped.")

Collection 'demo_rag_collection' has been dropped.


In [37]:
# if milvus_client.has_collection(collection_name):
#     milvus_client.drop_collection(collection_name)


fields = [
        FieldSchema(name='id', dtype=DataType.INT64, is_primary=True),
        FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, dim=1536),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8000)
]

schema = CollectionSchema(fields, description="Embedding Collection for SmartAudit")

collection = Collection(collection_name, schema=schema)

In [38]:
# inserting embeddings to collection
ids = list(range(len(embeddings)))  # Auto ID list
vectors_to_insert = np.array(embeddings, dtype=np.float32)
texts = [chunk for chunk in chunks]  

In [39]:
collection.insert([ids, vectors_to_insert, texts])


(insert count: 350, delete count: 0, upsert count: 0, timestamp: 453254575556395011, success count: 350, err count: 0, cost: 1036)

In [40]:
# Define index parameters
index_params = {
    "index_type": "IVF_FLAT",  # Or "IVF_SQ8", "HNSW" depending on your needs
    "metric_type": "L2",  # Use L2 for Euclidean distance
    "params": {"nlist": 128}  # Number of clusters
}

# Create the index on the embedding field
collection.create_index(field_name="embedding", index_params=index_params)


Status(code=0, message=)

In [41]:
# Load the collection to make it available for queries
collection.load()


In [16]:
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

milvus_client.create_collection(
    collection_name=collection_name,
    dimension=1536,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)


In [24]:
# inserting embeddings to collection
ids = list(range(len(embeddings)))  # Auto ID list
vectors_to_insert = np.array(embeddings, dtype=np.float32)  

In [25]:
vectors_to_insert.shape

(350, 1536)

In [19]:
insert_data = [
    vectors_to_insert  # Milvus accepts this as the embeddings data.
]

In [None]:
data = []

for i, line in enumerate(tqdm(vectors_to_insert, desc="Creating embeddings")):
    data.append({"id": i, "vector": emb_text(line), "text": line})

milvus_client.insert(collection_name=collection_name, data=data)

In [27]:
data = []
for i, chunk in enumerate(tqdm(chunks, total=len(chunks))):
    data.append({"id": i, "vector":emb_text(chunk), "text":chunk})

100%|██████████| 350/350 [02:33<00:00,  2.29it/s]


In [77]:
#milvus_client.insert(collection_name=collection_name, data=data)


In [41]:
index_params = MilvusClient.prepare_index_params()

index_params.add_index(
    field_name="vector",
    metric_type="COSINE",
    index_type="IVF_FLAT",
    index_name="vector_index",
    params={ "nlist": 128 }
)

milvus_client.create_index(
    collection_name="demo_rag_collection",
    index_params=index_params,
    sync=False # Whether to wait for index creation to complete before returning. Defaults to True.
)



RPC error: [create_index], <MilvusException: (code=65535, message=CreateIndex failed: creating multiple indexes on same field is not supported)>, <Time:{'RPC start': '2024-10-15 20:15:04.254534', 'RPC error': '2024-10-15 20:15:25.368749'}>
Failed to create an index on collection: demo_rag_collection


MilvusException: <MilvusException: (code=65535, message=CreateIndex failed: creating multiple indexes on same field is not supported)>

In [43]:
milvus_client.drop_index(collection_name)


TypeError: drop_index() missing 1 required positional argument: 'index_name'

In [46]:
res = milvus_client.list_indexes(
    collection_name=collection_name
)

print(res)

['vector']


In [31]:
# Building Retrieval ;

In [42]:
question = "what is the IFRS standard for new companies looking to audit their records?"


In [43]:
results = collection.search(
    data=[
        emb_text(question)
    ],
    anns_field="embedding",
    param={"nprobe": 10},  # Number of clusters to search
    limit=5,
    metric_type="L2"
)


In [48]:
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
results = collection.search([emb_text(question)], "embedding", param=search_params, limit=5, output_fields=["text"])


In [67]:
results[0][0][0]

TypeError: 'Hit' object is not subscriptable

In [64]:
dict(results[0][0])
     

MilvusException: <MilvusException: (code=1, message=Field keys is not in the hit entity)>

In [59]:
# Assuming 'results' is your SearchResult object
for hit in results:
    # Each hit is an instance of Hit
    id = hit.id
    distance = hit.distance
    entity = hit.entity  # This contains your additional fields

    # Assuming 'text' is a field within the entity
    text = entity.get("text", "No text found")  # Safely retrieve text

    # Print or store the retrieved information
    print(f"ID: {id}, Distance: {distance}, Text: {text}")


AttributeError: 'Hits' object has no attribute 'id'

In [44]:
import json
retrieved_lines_with_distances = [
    (res.entity.text, res.distance) for res in results[0]
]

# Print the formatted results as JSON
print(json.dumps(retrieved_lines_with_distances, indent=4))


MilvusException: <MilvusException: (code=1, message=Field text is not in the hit entity)>

In [47]:
search_res = collection.search(
    collection_name=collection_name,
    data=[
        emb_text(question)
    ],  # Use the `emb_text` function to convert the question to an embedding vector
    anns_field="embedding",
    param={"nprobe": 10}, 
    limit=3,  # Return top 3 results
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)


Unexpected error: [search], search() got multiple values for argument 'collection_name', <Time: {'RPC start': '2024-10-15 21:54:18.084918', 'Exception': '2024-10-15 21:54:18.084947'}>


MilvusException: <MilvusException: (code=1, message=Unexpected error, message=<search() got multiple values for argument 'collection_name'>)>

In [33]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[
        emb_text(question)
    ],  # Use the `emb_text` function to convert the question to an embedding vector
    limit=3,  # Return top 3 results
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)


In [26]:
import json

retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in results[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))


TypeError: 'Hit' object is not subscriptable

In [35]:
## Use LLM to get a RAG response

In [36]:
context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)


In [78]:
# for res in results[0]:
#     print(res)

In [75]:
context = "\n".join(
    [res for res in results[0]]
)

TypeError: sequence item 0: expected str instance, Hit found

In [37]:
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""


In [38]:
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)


In [76]:
#print(response.choices[0].message.content)
