In [1]:
import os
from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment
# (python-dotenv), so later os.getenv(...) calls can read them.
load_dotenv()
# Name of the local LLM (presumably an Ollama model tag — verify against the giskard setup).
MODEL = "llama2"

### Scrape the website and split the content ###

In [2]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch the NHS aciclovir page and cut it into overlapping chunks.
# NOTE(review): the captured output shows chunks dominated by site navigation text and
# whitespace; consider restricting the scrape to the article body before indexing.
loader = WebBaseLoader("https://www.nhs.uk/medicines/aciclovir/about-aciclovir/")

# Chunks of at most 1000 units with a 20-unit overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)

documents = loader.load_and_split(text_splitter=text_splitter)

# Display the resulting list of Document objects.
documents

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.nhs.uk/medicines/aciclovir/about-aciclovir/', 'title': 'About aciclovir - NHS', 'description': "NHS medicines information on aciclovir – what it's used for and key facts.", 'language': 'en'}, page_content='About aciclovir - NHS\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to main content\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch the NHS website\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n                Health A to Z\n                \n\n\n\n\n\n\n                NHS services\n                \n\n\n\n\n\n\n                Live Well\n                \n\n\n\n\n\n\n                Mental health\n                \n\n\n\n\n\n\n                Care and support\n                \n\n\n\n\n\n\n                Pregnancy\n                \n\n\n\n\n\n\n                Home\n                \n\n\n\n\n\n\nBrowse\n                More\n                \n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\n\n\nHealth A to Z\n\n\nMedicines A to 

In [3]:
# Number of chunks produced by the splitter for the scraped page.
len(documents)

4

### Load the content into a vector store ###

In [4]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub only when a token is actually configured.
# os.getenv returns None when TOKEN is unset, and login(token=None) falls back to an
# interactive prompt, which stalls a non-interactive "Restart Kernel -> Run All".
hf_token = os.getenv("TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Make the skipped login visible instead of failing or blocking silently.
    print("TOKEN is not set; skipping Hugging Face Hub login.")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

# Sentence-transformers model used to embed the chunks; adjust the name to the exact
# model you are using.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed every chunk and keep the vectors in an in-memory DocArray index.
vectorstore = DocArrayInMemorySearch.from_documents(
    documents,
    embedding=embedding_model,
)

print(vectorstore)

<langchain_community.vectorstores.docarray.in_memory.DocArrayInMemorySearch object at 0x000001B38020AB90>


### Create a knowledge base ###

In [7]:
import pandas as pd

# One row per chunk, with the raw chunk text in a single "text" column.
chunk_texts = [chunk.page_content for chunk in documents]
df = pd.DataFrame(chunk_texts, columns=["text"])

# Preview (at most) the first ten rows.
df.head(10)


Unnamed: 0,text
0,About aciclovir - NHS\n\n\n\n\n\n\n\n\n\n\n\n\...
1,Aciclovir (or acyclovir) is an antiviral medic...
2,Key facts\nStart taking or using aciclovir as ...
3,"Pregnancy, breastfeeding and fertility\n ..."


In [8]:
import giskard

# Local endpoint of the Ollama server.
api_base = "http://localhost:11434"

# Main LLM used by giskard (e.g. Qwen2.5 or llama2, depending on what is installed in Ollama).
# The identifier is built from MODEL (defined in the first cell) so the model name lives in
# one place instead of being hard-coded a second time here.
giskard.llm.set_llm_model(f"ollama/{MODEL}", disable_structured_output=True, api_base=api_base)

# Embedding model used by giskard (e.g. nomic-embed-text, available through Ollama).
giskard.llm.set_embedding_model("ollama/nomic-embed-text", api_base=api_base)



In [9]:
import nest_asyncio
# Patch asyncio so nested event-loop use is permitted.
# NOTE(review): presumably required because the giskard calls below drive asyncio while the
# notebook kernel already runs an event loop — confirm.
nest_asyncio.apply()

In [10]:
from giskard.rag import KnowledgeBase

# Wrap the chunk DataFrame (single "text" column) in a giskard KnowledgeBase, which is
# the input expected by generate_testset below.
knowledge_base = KnowledgeBase(df)


### Generate the Test set ###

In [11]:
from giskard.rag import generate_testset

# Generate a small synthetic question/answer test set from the knowledge base.
# agent_description is handed to giskard's question generator, so its wording matters;
# the typos in the previous description ("answerig", "anout") are corrected here.
# NOTE(review): the recorded run of this cell logged `KeyError: 'question'` for one of the
# two generations and skipped it — presumably the local model's reply did not contain the
# expected "question" field; confirm, and consider a stronger model or more attempts.
testset = generate_testset(
    knowledge_base,
    num_questions=2,
    agent_description="A chatbot answering questions about medicine drugs",
)

2025-05-05 16:07:09,522 pid:19932 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


  warn(


2025-05-05 16:07:21,128 pid:19932 MainThread giskard.rag  INFO     Found 1 topics in the knowledge base.


Generating questions:  50%|█████     | 1/2 [00:36<00:36, 36.60s/it]

2025-05-05 16:08:36,599 pid:19932 MainThread giskard.rag  ERROR    Encountered error in question generation: 'question'. Skipping.
2025-05-05 16:08:36,600 pid:19932 MainThread giskard.rag  ERROR    'question'
Traceback (most recent call last):
  File "c:\Users\dunca\OneDrive\Desktop\Disertation\LLM\venv\lib\site-packages\giskard\rag\question_generators\base.py", line 59, in generate_questions
    yield self.generate_single_question(knowledge_base, *args, **kwargs, seed_document=doc)
  File "c:\Users\dunca\OneDrive\Desktop\Disertation\LLM\venv\lib\site-packages\giskard\rag\question_generators\simple_questions.py", line 108, in generate_single_question
    question=generated_qa["question"],
KeyError: 'question'


Generating questions:  50%|█████     | 1/2 [01:15<01:15, 75.47s/it]


In [12]:
# Convert the generated test set to a DataFrame for inspection.
test_set_df = testset.to_pandas()

# Print up to the first three samples: question, reference answer and reference context.
preview_rows = test_set_df.head(3)
for position, (row_label, sample) in enumerate(preview_rows.iterrows(), start=1):
    print(f"Question {position}: {sample['question']}")
    print(f"Reference answer: {sample['reference_answer']}")
    print("Reference context:")
    print(sample['reference_context'])
    # Separator line followed by a blank line between samples.
    print("------------------------------", end="\n\n")

Question 1: What medicines are safe to take during pregnancy?
Reference answer: It is important to consult with a healthcare professional before taking any medication during pregnancy. The NHS website provides information on which medications are safe to use during pregnancy, including aciclovir. However, it is always best to seek personalized advice from a medical professional.
Reference context:
Document 3: Pregnancy, breastfeeding and fertility
                        



                          Taking or using aciclovir with other medicines and herbal supplements
                        



                          Common questions
                        








        Page last reviewed: 1 July 2022
        
        Next review due: 1 July 2025
      






Support links



Home


Health A to Z


Live Well


Mental health


Care and support


Pregnancy


NHS services


COVID-19




NHS App


Find my NHS number


View your GP health record


View your test results


About the

In [13]:
# Persist the generated test set to test-set.jsonl (path is relative to the working directory).
testset.save("test-set.jsonl")

In [14]:
from langchain.prompts import PromptTemplate

# Prompt for the RAG chain: the model must answer from the supplied context only and
# reply "I don't know" otherwise. {context} and {question} are filled in at run time.
template = """
Answer the question based on the context below. If you can't
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)

# Render the template once with placeholder values to check the final layout.
sample_prompt = prompt.format(
    context="Here is some context",
    question="Here is a question",
)
print(sample_prompt)



Answer the question based on the context below. If you can't
answer the question, reply "I don't know".

Context: Here is some context

Question: Here is a question

