In [1]:
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymilvus import MilvusClient

from tqdm.auto import tqdm
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, connections, utility


In [2]:
# Load variables from a local .env file (python-dotenv), then read the two
# credentials this notebook uses. Either value is None when the variable is unset.
load_dotenv()
API_KEY = os.environ.get('OPEN_API_KEY')        # passed to the OpenAI client
zilli_api_key = os.environ.get('zilliAPI_KEY')  # passed as the Milvus/Zilliz token


In [3]:
def extract_text_with_pypdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    Args:
        file_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        str: The text of all pages in page order, with nothing inserted
        between pages. A page that yields no text contributes an empty string.
    """
    reader = PdfReader(file_path)
    # Defensive: a page without extractable text may yield an empty/None value;
    # `or ""` keeps such a page from raising TypeError during concatenation.
    # A single join also avoids rebuilding the string once per page.
    return "".join(page.extract_text() or "" for page in reader.pages)

In [4]:
# Path of the PDF to index. An alternative input is kept here for reference:
# pdf_file_path = "../data/ifrs_doc.pdf"
pdf_file_path = "../data/ifrs_800pgs.pdf"

In [5]:
# Read the whole PDF into a single string; this is what gets chunked below.
text = extract_text_with_pypdf(pdf_file_path)

In [6]:
# One OpenAI client, reused for both the embedding calls and the chat completions.
client = OpenAI(api_key=API_KEY)


In [7]:
## Split the extracted text into overlapping chunks

# NOTE(review): the 8191 figure quoted for the OpenAI embedding model is a limit
# in tokens, whereas chunk_size here is counted in characters (the splitter's
# default length measure) — the two numbers are not directly comparable.
# Each chunk must also fit the 8000 max_length of the Milvus `text` field.
chunk_size = 6000
chunk_overlap = 300  # characters shared between consecutive chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks = text_splitter.split_text(text)

In [8]:
# Embedding helper, used for the document chunks and for the user question.
def emb_text(text, model="text-embedding-3-small"):
    """Embed ``text`` through the OpenAI embeddings endpoint.

    Args:
        text: The string to embed.
        model: Name of the embedding model. Defaults to
            "text-embedding-3-small", the value that was previously
            hard-coded. If you change it, make sure the vector length still
            matches the ``dim`` declared for the Milvus ``embedding`` field.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client.embeddings.create(input=text, model=model)
    # A single input was sent, so only the first data item is relevant.
    return response.data[0].embedding

In [9]:
# Client for the Zilliz Cloud serverless endpoint; the token is read from the
# environment earlier in the notebook.
milvus_client = MilvusClient(
    token=zilli_api_key,
    uri="https://in03-cdcb64273091f72.serverless.gcp-us-west1.cloud.zilliz.com",
)

# Collection that will hold one row per text chunk.
collection_name = "demo_rag_collection"


In [12]:
# Start from a clean slate: an existing collection with this name is dropped.
# This is destructive — every row previously stored in it is lost.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

# Primary keys are supplied by the caller (auto_id=False); dynamic fields are
# enabled on the schema.
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)

# One row per chunk: integer id, embedding vector, and the chunk text.
# (The stray trailing commas after these calls were removed; they only wrapped
# each call's result in a throwaway tuple.)
schema.add_field(field_name='id', datatype=DataType.INT64, is_primary=True)
# dim must equal the length of the vectors produced by the embedding model.
schema.add_field(field_name='embedding', datatype=DataType.FLOAT_VECTOR, dim=1536)
# max_length bounds the stored chunk text; keep the chunk size below it.
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=8000)

# The collection is created without index parameters; the vector index is
# built by a separate `create_index` call.
milvus_client.create_collection(
    collection_name=collection_name,
    schema=schema,
)



In [15]:
# Describe the vector index: IVF_FLAT on the `embedding` field, cosine metric,
# nlist=128.
index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="embedding",
    index_name="vector_index",
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params={"nlist": 128},
)

# `sync` controls whether the call waits for index creation to complete before
# returning (defaults to True); False returns without waiting.
milvus_client.create_index(
    collection_name=collection_name,
    index_params=index_params,
    sync=False,
)


In [18]:
# Build the rows to insert: one embedding request per chunk, with the chunk's
# position in `chunks` used as its primary key.
data = []
for i, chunk in enumerate(tqdm(chunks, total=len(chunks))):
    row = {
        "id": i,
        "embedding": emb_text(chunk),
        "text": chunk,
    }
    data.append(row)


100%|██████████| 350/350 [02:17<00:00,  2.55it/s]


In [19]:
# Insert every row in one request; the returned summary (insert_count, ids)
# is displayed as the cell output.
milvus_client.insert(collection_name=collection_name, data=data)

{'insert_count': 350, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 

In [24]:
# Ask the server for the collection's current load state.
res = milvus_client.get_load_state(collection_name=collection_name)

In [25]:
# Display the load state stored in `res`.
res

{'state': <LoadState: NotLoad>}

In [26]:
# Load the collection before searching it.
# replica_number: number of replicas to create on query nodes. Max value is 1
# for Milvus Standalone, and no greater than `queryNode.replicas` for a
# Milvus Cluster.
milvus_client.load_collection(collection_name=collection_name, replica_number=1)

In [28]:
# Retrieval: embed the question and fetch the closest stored chunks.

question = "what is the IFRS standard for new companies looking to audit their records?"

search_res = milvus_client.search(
    collection_name=collection_name,
    data=[emb_text(question)],  # one query vector, produced by `emb_text`
    limit=3,  # keep the top 3 hits
    search_params={"metric_type": "COSINE", "params": {}},  # cosine, same metric as the index
    output_fields=["text"],  # return the stored chunk text with each hit
)


In [38]:
def get_context(question, limit=1):
    """Retrieve the stored chunk text most similar to ``question``.

    Args:
        question: Natural-language query; embedded with ``emb_text``.
        limit: Maximum number of hits to request from the search. Defaults to
            1, the value that was previously hard-coded.

    Returns:
        str: The ``text`` field of each hit, in the order returned by the
        search, joined with newlines (an empty string when there are no hits).
    """
    search_res = milvus_client.search(
        collection_name=collection_name,
        data=[emb_text(question)],  # a single query vector
        limit=limit,
        search_params={"metric_type": "COSINE", "params": {}},  # cosine, same metric as the index
        output_fields=["text"],  # return the stored chunk text with each hit
    )

    # The result holds one hit list per query vector; only one vector was sent.
    return "\n".join(hit["entity"]["text"] for hit in search_res[0])



In [29]:
import json

# Pair each hit's text with its score for the first (only) query, then
# pretty-print the pairs; json serialises the tuples as arrays.
retrieved_lines_with_distances = []
for hit in search_res[0]:
    retrieved_lines_with_distances.append((hit["entity"]["text"], hit["distance"]))
print(json.dumps(retrieved_lines_with_distances, indent=4))


[
    [
        "Standards\nObjective\nThe objective of this IFRS is to ensure that an entity\u2019s first IFRS financial\nstatements , and its interim financial reports for part of the period covered by\nthose financial statements, contain high quality information that:\n(a) is transparent for users and comparable over all periods presented;\n(b) provides a suitable starting point for accounting in accordance\nwith International Financial Reporting Standards (IFRSs) ; and\n(c) can be generated at a cost that does not exceed the benefits.\nScope\nAn entity shall apply this IFRS in:\n(a) its first IFRS financial statements; and\n(b) each interim financial report, if any, that it presents in accordance\nwith IAS 34 Interim Financial Reporting  for part of the period covered by\nits first IFRS financial statements.\nAn entity\u2019s first IFRS financial statements are the first annual financial\nstatements in which the entity adopts IFRSs , by an explicit and unreserved\nstatement in thos

In [30]:
# Keep only the text of each (text, distance) pair and join them into the
# context string handed to the chat model.
context = "\n".join(
    chunk_text for chunk_text, _score in retrieved_lines_with_distances
)


In [35]:
def get_response_GPT(context, question, model="gpt-3.5-turbo"):
    """Answer ``question`` from ``context`` with an OpenAI chat model.

    Args:
        context: Retrieved passage text, placed inside <context> tags.
        question: The user's question, placed inside <question> tags.
        model: Chat model name. Defaults to "gpt-3.5-turbo", the value that
            was previously hard-coded.

    Returns:
        The message content of the first completion choice.
    """
    # NOTE(review): the "Human:" prefix inside a system-role message looks like
    # a leftover from another prompt format — confirm it is intended.
    system_prompt = """
    Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
    """
    user_prompt = f"""
    Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
    <context>
    {context}
    </context>
    <question>
    {question}
    </question>
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    return response.choices[0].message.content

    



In [31]:
# Module-level prompts for the step-by-step walkthrough. The f-string captures
# the values the globals `context` and `question` hold when this cell runs.
# NOTE(review): the "Human:" prefix inside a prompt that is sent with the
# system role looks like a leftover from another prompt format — confirm it
# is intended.
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""


In [32]:
# Send the system and user prompts to the chat model and keep the raw response.
response = client.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=USER_PROMPT),
    ],
    model="gpt-3.5-turbo",
)


In [33]:
# Print the message text of the first completion choice.
print(response.choices[0].message.content)


The IFRS standard for new companies looking to audit their records is IFRS 1 - First-time Adoption of International Financial Reporting Standards. This standard sets out the requirements for entities to follow when preparing their first IFRS financial statements and interim financial reports in accordance with International Financial Reporting Standards. It aims to ensure that the financial information provided is transparent, comparable, and generated at a cost that does not exceed the benefits.


In [40]:
# End-to-end run through the helper functions. Note that this rebinds the
# globals `question` and `context` used by the earlier walkthrough cells.
question = "when should an entity apply IFRS"
context = get_context(question)

In [42]:
# Generate the answer from the retrieved context; the arguments are passed by
# keyword, so their order relative to the signature does not matter.
answer = get_response_GPT(question=question, context=context)

An entity should apply IFRS when it first adopts IFRS. This is specified in the International Financial Reporting Standard (IFRS) 1, which outlines the requirements for applying IFRS in an entity's first financial statements and interim financial reports during the transition to IFRS.