In [1]:
import os
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment.
load_dotenv()

# Credentials are taken from the environment; each is None when its variable is unset.
ASI_ONE_KEY = os.environ.get("ASI_ONE_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Value placed in the "model" field of the chat-completions request payload.
MODEL = "asi1-mini"

### Preprocessing ###

In [3]:
import pandas as pd
import requests
import os
from langchain_community.document_loaders import PyPDFLoader


# Local cache directory for the downloaded PDFs.
os.makedirs("pdfs", exist_ok=True)

df = pd.read_csv("dataset.csv")
links = df['link'].tolist()

# Paths of the PDFs that are available locally, in dataset order.
pdf_files = []

for i, link in enumerate(links):
    filename = f"pdfs/doc_{i}.pdf"

    # Skip the network round-trip when an earlier run already fetched this
    # file. The cache is keyed by row position only, so clear "pdfs/" whenever
    # the rows of dataset.csv change.
    if os.path.isfile(filename) and os.path.getsize(filename) > 0:
        pdf_files.append(filename)
        print(f"Already downloaded {filename}")
        continue

    try:
        # A timeout keeps one unresponsive host from hanging the whole cell.
        r = requests.get(link, timeout=30)
        if r.ok:
            # Write to a temporary name first so an interrupted write never
            # leaves a truncated file that the cache check would later accept.
            partial = filename + ".part"
            with open(partial, "wb") as f:
                f.write(r.content)
            os.replace(partial, filename)
            pdf_files.append(filename)
            print(f"Downloaded {filename}")
        else:
            print(f"Failed to download {link} (HTTP {r.status_code})")
    except Exception as e:
        # Best effort: report the failure and move on to the next link.
        print(f"Error downloading {link}: {e}")

Downloaded pdfs/doc_0.pdf
Downloaded pdfs/doc_1.pdf
Downloaded pdfs/doc_2.pdf
Downloaded pdfs/doc_3.pdf
Downloaded pdfs/doc_4.pdf
Downloaded pdfs/doc_5.pdf
Downloaded pdfs/doc_6.pdf
Downloaded pdfs/doc_7.pdf
Downloaded pdfs/doc_8.pdf
Downloaded pdfs/doc_9.pdf
Downloaded pdfs/doc_10.pdf
Downloaded pdfs/doc_11.pdf
Downloaded pdfs/doc_12.pdf
Downloaded pdfs/doc_13.pdf
Downloaded pdfs/doc_14.pdf
Downloaded pdfs/doc_15.pdf
Downloaded pdfs/doc_16.pdf
Downloaded pdfs/doc_17.pdf
Downloaded pdfs/doc_18.pdf
Downloaded pdfs/doc_19.pdf
Downloaded pdfs/doc_20.pdf
Downloaded pdfs/doc_21.pdf
Downloaded pdfs/doc_22.pdf
Downloaded pdfs/doc_23.pdf
Downloaded pdfs/doc_24.pdf
Downloaded pdfs/doc_25.pdf
Downloaded pdfs/doc_26.pdf
Downloaded pdfs/doc_27.pdf
Downloaded pdfs/doc_28.pdf
Downloaded pdfs/doc_29.pdf
Downloaded pdfs/doc_30.pdf
Downloaded pdfs/doc_31.pdf
Downloaded pdfs/doc_32.pdf
Downloaded pdfs/doc_33.pdf
Downloaded pdfs/doc_34.pdf
Downloaded pdfs/doc_35.pdf
Downloaded pdfs/doc_36.pdf
Downloaded 

In [19]:
# Every Document produced by the loader, across all files in pdf_files order.
full_document = []

for pdf_path in pdf_files:
    try:
        loaded = PyPDFLoader(pdf_path).load()
        full_document += loaded
        print(f"Parsed {pdf_path} with {len(loaded)} chunks.")
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error parsing {pdf_path}: {e}")

Parsed pdfs/doc_0.pdf with 6 chunks.


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x70419b155640>>
Traceback (most recent call last):
  File "/home/deni/Desktop/disi/.venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


Parsed pdfs/doc_1.pdf with 22 chunks.
Parsed pdfs/doc_2.pdf with 3 chunks.
Parsed pdfs/doc_3.pdf with 31 chunks.
Parsed pdfs/doc_4.pdf with 30 chunks.
Parsed pdfs/doc_5.pdf with 21 chunks.
Parsed pdfs/doc_6.pdf with 20 chunks.
Parsed pdfs/doc_7.pdf with 6 chunks.
Parsed pdfs/doc_8.pdf with 3 chunks.
Parsed pdfs/doc_9.pdf with 26 chunks.
Parsed pdfs/doc_10.pdf with 20 chunks.
Parsed pdfs/doc_11.pdf with 5 chunks.
Parsed pdfs/doc_12.pdf with 5 chunks.
Parsed pdfs/doc_13.pdf with 7 chunks.
Parsed pdfs/doc_14.pdf with 11 chunks.
Parsed pdfs/doc_15.pdf with 5 chunks.
Parsed pdfs/doc_16.pdf with 3 chunks.


KeyboardInterrupt: 

In [5]:
# Sanity check on the parsing step: show the first loaded Document.
print(full_document[0])

page_content='12 HOUR NASAL DECONGESTANT- pseudoephedrine hydrochloride tablet, film
coated 
 
CARDINAL HEALTH
----------
Pseudoephedrine hydrochloride
Drug Facts
Active ingredient (in each tablet)
Pseudoephedrine HCl, USP 120 mg
Purpose
Nasal decongestant
Uses
temporarily relieves nasal congestion due to the common cold, hay fever or other
upper respiratory allergies
temporarily relieves sinus congestion and pressure
Do not use
 if you are now taking a prescription monoamine oxidase inhibitor (MAOI)
(certain drugs for depression, psychiatric or emotional conditions, or Parkinson's
disease), or for 2 weeks after stopping the MAOI drug. If you do not know if your
prescription drug contains an MAOI, ask a doctor or pharmacist before taking this
product.
Ask a doctor before use if you have
heart disease
high blood pressure
thyroid disease
diabetes
trouble urinating due to an enlarged prostate gland
When using this product do not exceed recommended dosage
Stop use and ask a doctor if
nervo

### Split ###

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter built via the tiktoken encoder: chunk_size=300 with chunk_overlap=50.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=300,
)

# Break every loaded Document into smaller pieces for embedding.
splits = text_splitter.split_documents(full_document)

### Index with Retrieval ###

In [7]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed every split with the OpenAI embedding model and index it in Chroma.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Retriever configured with k=3 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


### Model ###

In [8]:
def call_asi_one(prompt):
    """Send a single-turn prompt to the ASI:One chat-completions endpoint.

    Args:
        prompt: The prompt text, or any object exposing ``to_string()``
            (for example a LangChain PromptValue), which is flattened to a
            string and sent as one user message.

    Returns:
        The content of the first returned choice, or the literal string
        "No response" when the reply carries no usable choice, message or
        content.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    if hasattr(prompt, "to_string"):
        prompt = prompt.to_string()  # Convert PromptValue to str

    url = "https://api.asi1.ai/v1/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {ASI_ONE_KEY}'
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 1000
    }
    # Bound the wait so a stalled request cannot hang a whole chain
    # (requests waits indefinitely when no timeout is given).
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    data = response.json()
    if not isinstance(data, dict):
        return "No response"

    # "choices" may be missing, null or an empty list (e.g. in an error
    # payload); indexing it blindly would raise IndexError.
    choices = data.get("choices") or [{}]
    first_choice = choices[0] if isinstance(choices[0], dict) else {}
    message = first_choice.get("message") or {}
    content = message.get("content")
    # A null content would break downstream string handling such as .split().
    return "No response" if content is None else content

In [9]:
# LLM
# The plain function call_asi_one serves as the model step of the chains in this notebook.
llm = call_asi_one

### Multi Query ###

In [10]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}
"""

prompt_perspectives = ChatPromptTemplate.from_template(template)


# Prompt -> LLM -> plain string -> list of alternative questions.
# Blank lines in the model output are dropped (and surrounding whitespace is
# trimmed) so that no empty query is ever sent to the retriever.
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | (lambda x: [q.strip() for q in x.split("\n") if q.strip()])
)

In [11]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several retrieved-document lists.

    Two documents count as the same when their ``dumps`` serialization is
    identical.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        A flat list with one entry per distinct document, in first-seen
        order, so the result is stable from run to run.
    """
    # A dict keyed by the serialized form de-duplicates while preserving
    # insertion order; a set would yield an arbitrary order on every run.
    # Keeping the original object also avoids deserializing it again.
    unique_docs = {}
    for sublist in documents:
        for doc in sublist:
            unique_docs.setdefault(dumps(doc), doc)
    return list(unique_docs.values())

# Retrieve

# Generate the alternative queries, run the retriever once per query
# (retriever.map()), then merge the per-query results with get_unique_union.
retrieval_chain = generate_queries | retriever.map() | get_unique_union



In [13]:
from operator import itemgetter
from langchain.prompts import PromptTemplate

# RAG
# Answering prompt: {context} receives the output of retrieval_chain and
# {question} the user's question.
template = """
Answer the question only based on the context below use only the given knowledge from the context do not invent or add details that you were treained. 
If you dont know the answer say that you dont know.
 
Context: {context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)


# The input dict is handed both to retrieval_chain (producing the context) and
# to itemgetter("question"); the filled prompt goes to the LLM and the reply is
# parsed as a plain string.
# NOTE(review): the context value is the list returned by retrieval_chain, not
# pre-formatted text; presumably it is stringified as-is when the template is
# filled, so confirm that is intended.
chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)



In [14]:
# Smoke-test the multi-query RAG chain with a single question.
chain.invoke({"question":"What is NASAL DECONGESTANT?"})

  return [loads(doc) for doc in unique_docs]




### RAG-Fusion ###

In [26]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: Related
# Prompt asking the model for 4 search queries related to {question}.
# The string is not raw, so each "\n" below is an additional newline character.
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [27]:

from langchain_core.output_parsers import StrOutputParser


# Prompt -> LLM -> plain string -> list of search queries.
# This rebinds the name generate_queries that the multi-query section defined.
# Blank lines in the model output are dropped (and surrounding whitespace is
# trimmed) so that no empty query is ever sent to the retriever.
generate_queries = (
    prompt_rag_fusion
    | llm
    | StrOutputParser()
    | (lambda x: [q.strip() for q in x.split("\n") if q.strip()])
)

In [28]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists with reciprocal rank fusion.

    Each occurrence of a document at zero-based position ``rank`` in one of
    the lists adds ``1 / (rank + k)`` to that document's score. Documents are
    identified by their ``dumps`` serialization.

    Args:
        results: One ranked list of documents per query.
        k: Constant added to the rank in the formula above.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by score in
        descending order; ties keep first-seen order.
    """
    # Fused score per serialized document.
    fused_scores = {}
    # First object seen for each serialized form, so the original document
    # can be returned without deserializing it again.
    first_seen = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            first_seen.setdefault(doc_str, doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    # sorted() is stable, so equal scores stay in insertion order.
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(first_seen[doc_str], score) for doc_str, score in ranked]

# Generate the search queries, run the retriever once per query
# (retriever.map()), then fuse the per-query rankings into one scored list.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion


In [29]:


from langchain_core.runnables import RunnablePassthrough

# RAG
# Answering prompt for RAG-Fusion. This rebinds the names `template` and
# `prompt` used by the earlier multi-query section.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# The input dict is handed both to retrieval_chain_rag_fusion (producing the
# context) and to itemgetter("question"), which is imported in an earlier cell.
# NOTE(review): the context value is the list of (document, score) tuples
# returned by reciprocal_rank_fusion; presumably it is stringified as-is when
# the template is filled, so confirm that is intended.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)


In [30]:
# Smoke-test the RAG-Fusion chain with a single question.
final_rag_chain.invoke({"question":"What is Atripla?"})



### Decomposition ###

In [42]:
from langchain.prompts import ChatPromptTemplate

# Decomposition
# Prompt asking the model to break {question} into 3 sub-questions.
# The string is not raw, so each "\n" below is an additional newline character.
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [43]:
# Answering prompt for the decomposition strategy. It takes three variables:
# {question}, {q_a_pairs} (previously answered sub-questions) and {context}.
# The string is not raw, so each "\n" below is an additional newline character.
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [45]:


from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Render one question/answer pair as a "Question: ...\\nAnswer: ..." block.

    Leading and trailing whitespace of the combined text is stripped.
    """
    return f"Question: {question}\nAnswer: {answer}\n\n".strip()

# Prompt -> LLM -> plain string -> list of sub-questions.
# Blank lines in the model output are dropped (and surrounding whitespace is
# trimmed) so that no empty sub-question is produced.
generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    | (lambda x: [q.strip() for q in x.split("\n") if q.strip()])
)
def answer_fn2(question, history=None):
    """Answer a question by decomposing it into sub-questions.

    Sub-questions are generated from the input, answered one at a time with
    retrieval (each answer sees the question/answer pairs collected so far),
    and the original question is then answered with all pairs as background.

    Args:
        question: The user's question.
        history: Accepted so the function can be called with a conversation
            history argument; it is not used.

    Returns:
        The model's answer to the original question, as a string.
    """
    # The same chain answers every sub-question and the final question, so it
    # is built once instead of on every loop iteration.
    rag_chain = (
        {
            "context": itemgetter("question") | retriever,
            "question": itemgetter("question"),
            "q_a_pairs": itemgetter("q_a_pairs")
        }
        | decomposition_prompt
        | llm
        | StrOutputParser()
    )

    # Step 1: Generate sub-questions; blank lines are skipped so that no empty
    # query is retrieved for or sent to the model.
    generated = generate_queries_decomposition.invoke({"question": question})
    sub_questions = [q.strip() for q in generated if q.strip()]

    # Step 2: Answer each sub-question, accumulating the pairs as background.
    q_a_pairs = ""
    for q in sub_questions:
        answer = rag_chain.invoke({"question": q, "q_a_pairs": q_a_pairs})
        q_a_pairs = q_a_pairs + "\n---\n" + format_qa_pair(q, answer)

    # Step 3: Final answer to the original question.
    return rag_chain.invoke({"question": question, "q_a_pairs": q_a_pairs})

### Step Back ###

In [52]:
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda
from operator import itemgetter

# Few-shot examples
# Input/output pairs showing how a specific question is rewritten into a more
# generic "step-back" question.
examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel’s was born in what country?",
        "output": "what is Jan Sindel’s personal history?",
    },
]

# Prompt setup
# Each example is rendered as a human message followed by an AI message.
example_prompt = ChatPromptTemplate.from_messages([
    ("human", "{input}"),
    ("ai", "{output}"),
])
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

# System instruction, then the few-shot examples, then the user's question.
step_back_template = ChatPromptTemplate.from_messages([
    ("system", "You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:"),
    few_shot_prompt,
    ("user", "{question}"),
])
# Note: call_asi_one converts the prompt with to_string() and sends it as a
# single user message, so the message roles above are not sent as separate
# chat messages.
generate_queries_step_back = step_back_template | llm | StrOutputParser()


In [53]:
# Final answering prompt for the step-back strategy. It takes three variables:
# {normal_context}, {step_back_context} and {question}.
response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# {normal_context}
# {step_back_context}

# Original Question: {question}
# Answer:"""
response_prompt = ChatPromptTemplate.from_template(response_prompt_template)


In [54]:
def answer_fn_step_back(question):
    """Answer `question` using context retrieved for it and for its step-back form.

    Two retrievals are run: one with the question as given, one with the
    step-back question produced by generate_queries_step_back. Both contexts
    and the original question fill response_prompt, which is sent to the LLM.
    """
    original_question = itemgetter("question")
    prompt_inputs = {
        # Context retrieved with the question as asked.
        "normal_context": original_question | retriever,
        # Context retrieved with the generated step-back question.
        "step_back_context": generate_queries_step_back | retriever,
        # The original question, passed through unchanged.
        "question": original_question,
    }
    step_back_chain = prompt_inputs | response_prompt | llm | StrOutputParser()
    return step_back_chain.invoke({"question": question})


### HyDE ###

In [62]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Step 1: HyDE Prompt
# Prompt asking the model to write a passage that answers {question}; the
# generated passage is later used as the retrieval query.
hyde_template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(hyde_template)

# Prompt -> LLM -> plain string (the hypothetical passage).
generate_docs_for_retrieval = (
    prompt_hyde | llm | StrOutputParser()
)

In [63]:
# Step 2: Final RAG prompt
# Answering prompt with two variables: {context} and {question}.
rag_template = """Answer the following question based on this context:

{context}

Question: {question}
"""
rag_prompt = ChatPromptTemplate.from_template(rag_template)

# Note: this rebinds final_rag_chain, which the RAG-Fusion section bound to a
# different chain. Unlike that one, this chain does no retrieval itself and
# expects the caller to supply both "context" and "question".
final_rag_chain = (
    rag_prompt
    | llm
    | StrOutputParser()
)

In [64]:
def answer_fn_hyde(question):
    """Answer `question` by retrieving with a generated hypothetical passage.

    A passage answering the question is generated first and used as the
    retrieval query; the retrieved documents and the original question are
    then passed to final_rag_chain, whose output is returned.
    """
    hypothetical_passage = generate_docs_for_retrieval.invoke({"question": question})
    context_docs = retriever.invoke(hypothetical_passage)
    return final_rag_chain.invoke({"question": question, "context": context_docs})

### Giskard ###

In [15]:
import pandas as pd

# One row per loaded Document (not per split), holding only its text.
# This rebinds `df`, which earlier held the contents of dataset.csv.
page_texts = [d.page_content for d in full_document]
df = pd.DataFrame({"text": page_texts})

df.head(100)

Unnamed: 0,text
0,12 HOUR NASAL DECONGESTANT- pseudoephedrine hy...
1,Directions\nadults and children 12 years\nand ...
2,COMPARE TO\nSUDAFED\n SINUS\nCONGESTION 12 HOU...
3,
4,12 HOUR NASAL DECONGESTANT \npseudoephedrine ...
...,...
95,absolute risk increase per mmHg is greater at ...
96,been studied. If treatment is to be discontinu...
97,dose of LOPRESSOR \n[see \nDosage and Administ...
98,5.\n9\n \n \nPeripheral Vascular Disease\nBet...


In [16]:
import giskard


# Models Giskard itself uses; these are separate from the ASI:One model that
# answers questions in the chains above.
giskard.llm.set_llm_model("gpt-3.5-turbo")
giskard.llm.set_embedding_model("text-embedding-3-small")

In [17]:
from giskard.rag import KnowledgeBase

# Build the Giskard knowledge base from the one-column "text" frame.
knowledge_base = KnowledgeBase(df)

In [18]:
from giskard.rag import generate_testset

# Generate evaluation questions from the knowledge base; agent_description
# tells the generator what kind of assistant is being tested.
testset = generate_testset(
    knowledge_base,
    num_questions=110,
    agent_description="A chatbot answering questions about medicine drugs based on a given context",
)

2025-06-16 10:04:54,620 pid:18318 MainThread giskard.rag  INFO     Finding topics in the knowledge base.




2025-06-16 10:06:25,446 pid:18318 MainThread giskard.rag  INFO     Found 54 topics in the knowledge base.


Generating questions:   0%|          | 0/110 [00:00<?, ?it/s]

In [26]:
test_set_df = testset.to_pandas()

# Print the first three generated samples: question, reference answer, reference context.
for number, (_, sample) in enumerate(test_set_df.head(3).iterrows(), start=1):
    print(f"Question {number}: {sample['question']}")
    print(f"Reference answer: {sample['reference_answer']}")
    print("Reference context:")
    print(sample['reference_context'])
    print("------------------------------", end="\n\n")

Question 1: What are the directions for taking Advil Gel Caplets?
Reference answer: For Advil Gel Caplets, adults and children 12 years and over should take 1 gel caplet every 4 to 6 hours while symptoms persist. The smallest effective dose should be used. Do not exceed 6 gel caplets in 24 hours, unless directed by a doctor. Children under 12 years should ask a doctor before taking.
Reference context:
Document 8: vomit blood
have bloody or black stools
have stomach pain that does not get better
you have symptoms of heart problems or stroke:
chest pain
trouble breathing
weakness in one part or side of body
slurred speech
leg swelling
pain gets worse or lasts more than 10 days
fever gets worse or lasts more than 3 days
redness or swelling is present in the painful area
any new symptoms appear
If pregnant or breast-feeding,
ask a health professional before use. It is especially important not to use ibuprofen at 20
weeks or later in pregnancy unless definitely directed to do so by a doctor

In [27]:
# Persist the generated test set so it can be reused without regenerating it.
testset.save("test-set.jsonl")

In [28]:
def answer_fn(question, history=None):
    """Answer `question` with the multi-query RAG chain.

    `history` is accepted but not used: only the question is passed to `chain`.
    """
    return chain.invoke({"question": question})
   

In [29]:
from giskard.rag import evaluate

# Run answer_fn over the test set and score its answers with Giskard.
report = evaluate(answer_fn, testset=testset, knowledge_base=knowledge_base)
display(report)

Asking questions to the agent:   0%|          | 0/10 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

In [67]:
# Export the evaluation report as a standalone HTML file.
report.to_html("report.html")

In [68]:
# Correctness broken down by question type.
report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.0
conversational,0.0
distracting element,0.5
double,0.0
simple,0.5
situational,0.0


In [69]:
# Show the questions the agent answered incorrectly.
report.get_failures()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata,agent_answer,correctness,correctness_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
76e4e628-e8be-4675-a04e-1208c39eb612,What is the strength of ALBUTEROL SULFATE in t...,The strength of ALBUTEROL SULFATE in the solut...,Document 215: ALBUTEROL SULFATE \nalbuterol s...,[],"{'question_type': 'simple', 'seed_document_id'...",The ALBUTEROL SULFATE inhalation solution is a...,False,The agent provided additional information abou...
3a0b5032-84e3-405d-a8c9-43d62cd79c3c,What are the recommended precautions to be tak...,"When discontinuing therapy with Lopressor, par...",Document 181: been studied. If treatment is to...,[],"{'question_type': 'complex', 'seed_document_id...",The document highlights the following precauti...,False,The agent provided additional information abou...
2e22dd3f-eae2-446f-a07b-5410126788cd,What adverse reactions have been reported duri...,The adverse reactions identified during post a...,Document 184: Lopressor\nPlacebo\nHypotension ...,[],"{'question_type': 'complex', 'seed_document_id...",Here are the adverse reactions reported during...,False,The agent provided some adverse reactions repo...
73e18ef5-254e-44e7-9171-26ca90d40210,Can you provide the list of inactive ingredien...,The inactive ingredients in Lipitor 40mg table...,Document 114: CALCIUM CARBONATE\n (UNII: H0G93...,[],"{'question_type': 'distracting element', 'seed...","Based on the provided document, I can share th...",False,The agent provided the inactive ingredients fo...
7cd4bf5b-e8b5-412d-9015-7fc4967b63d2,Can you please confirm the active ingredient a...,The active ingredient in the ALBUTEROL SULFATE...,Document 216: #\nItem Code\nPackage Descriptio...,[],"{'question_type': 'situational', 'seed_documen...",The active ingredient in the **Albuterol Sulfa...,False,The agent stated that the strength of the Albu...
b2c26aad-f9f9-4ee5-b69c-4c8ef13097ba,What are some of the risk factors for developi...,Risk factors for myopathy associated with LIPI...,Document 78: Select Azole Antifungals or Macro...,[],"{'question_type': 'situational', 'seed_documen...","Based on the provided context, here are key ri...",False,The agent provided detailed information about ...
92d517b2-54d4-44ce-a923-08c44e3c6ed3,What instructions should patients be given whe...,Patients should be instructed to take a repeat...,Document 361: Product: \n50090-2980\nNDC: \n50...,[],"{'question_type': 'double', 'original_question...",Here are the key instructions patients should ...,False,The agent provided additional instructions for...
c940d678-3141-4420-b01d-6ab2179e11d3,What should be done?,Patients prescribed ZOLOFT oral solution shoul...,Document 320: Suicidal Thoughts and Behaviors\...,"[{'role': 'user', 'content': 'Patients prescri...","{'question_type': 'conversational', 'seed_docu...",The provided document snippet lists pharmaceut...,False,The agent provided a general response about ph...


In [70]:
import pandas as pd

# NOTE(review): the previous cell renders report.get_failures() as a table
# indexed by `id`, so it is presumably already a DataFrame. Confirm this.
failures = report.get_failures()

# Wrapping in pd.DataFrame also covers the case where a list of dicts is returned.
df_failures = pd.DataFrame(failures)

# Save to Excel
# NOTE(review): index=False omits the index from the file; if the index holds
# the question `id`, it will not be exported. Confirm that is intended.
df_failures.to_excel("giskard_failures.xlsx", index=False)
print("Failures saved to giskard_failures.xlsx")

Failures saved to giskard_failures.xlsx
