<a href="https://colab.research.google.com/github/Anushkaghei/Hallucination-Detection-In-LLMs/blob/main/Multi_agent_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain
!pip install xmltodict
!pip install langchain_community
!pip install -q -U google-generativeai langchain-google-genai
%pip install --upgrade --quiet  langchain-google-genai pillow

In [2]:
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
from google.colab import userdata
#from langchain_community.retrievers import PubMedRetriever
from langchain.chains import RetrievalQA
from langchain.retrievers import PubMedRetriever
from langchain.agents.format_scratchpad.openai_tools import (format_to_openai_tool_messages,)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain.agents import AgentExecutor
from langchain.agents import tool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_google_genai import ChatGoogleGenerativeAI

In [None]:
import getpass
import os

# Ask for the Gemini API key interactively only when the environment does not
# already provide one; the key is never written into the notebook itself.
if os.environ.get("GOOGLE_API_KEY") is None:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API Key")

In [4]:
# Baseline answer: query Gemini directly, without any retrieved context.
baseline_prompt = "Write a comprehensive report on it: Why is the Wernickie Area considered so important? It is such a wide area with limited study on it."
llm = ChatGoogleGenerativeAI(model="gemini-pro")
result = llm.invoke(baseline_prompt)

In [5]:
def to_markdown(text):
  """Wrap ``text`` in an IPython ``Markdown`` object rendered as a block quote.

  Each '•' character is replaced by '  *', and every line of the result,
  blank lines included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  # keepends=True preserves the original line terminators; prefixing every
  # piece unconditionally mirrors textwrap.indent with an always-true predicate.
  quoted = ''.join('> ' + line for line in bulleted.splitlines(True))
  return Markdown(quoted)

In [None]:
# Render the baseline answer; the bare name on the last line is what the cell displays.
baseline_markdown = to_markdown(result.content)
baseline_markdown

In [7]:
# PubMed-backed retriever used to fetch context documents for a query.
# `top_k_results` is the field that controls how many articles come back
# (the wrapper's default is 3). The previous `index_name`, `batch_size` and
# `retrieve_kwargs` arguments are not fields of PubMedRetriever, so the intended
# limit of 10 never took effect.
retriever = PubMedRetriever(top_k_results=10)

In [None]:
# Gemini model handle from the google.generativeai SDK; used by augmented_generator.
model2 = genai.GenerativeModel(model_name='gemini-pro')

def augmented_generator(query, doc_retriever=None, model=None):
  """Answer ``query`` with a generative model, using retrieved documents as context.

  Args:
    query: The question to answer (a string).
    doc_retriever: Optional object whose ``invoke(query)`` returns documents
      exposing ``metadata`` (a mapping) and ``page_content``. Defaults to the
      notebook-level ``retriever``.
    model: Optional object exposing ``generate_content(prompts)``. Defaults to
      the notebook-level ``model2``.

  Returns:
    Whatever ``model.generate_content`` returns for the assembled prompt list:
    the instruction plus question first, then one "<title>: <content>" entry
    per retrieved document.
  """
  if doc_retriever is None:
    doc_retriever = retriever
  if model is None:
    model = model2

  # 1. Retrieve relevant documents (`invoke` replaces the deprecated
  #    `get_relevant_documents`).
  documents = doc_retriever.invoke(query)

  # 2. Build the prompt parts: the question first, then one part per document.
  prompts = ["Answer the question based on the given context: " + query]
  for doc in documents:
    # LangChain's PubMed wrapper stores the article title under "Title"; the
    # lowercase key is kept as a fallback for other document sources.
    title = doc.metadata.get("Title") or doc.metadata.get("title") or ""
    if not isinstance(title, str):
      # Metadata values are not guaranteed to be strings; coerce so that
      # building the prompt cannot raise a TypeError.
      title = str(title)
    prompts.append(f"{title}: {doc.page_content}")

  # 3. Generate the answer from the question plus retrieved context.
  return model.generate_content(prompts)

# Run the retrieval-augmented pipeline on the sample question.
query = "Why is the Wernickie Area considered so important? It is such a wide area with limited study on it."
augmented_text = augmented_generator(query)

# Render the augmented answer; the bare name on the last line is what the cell displays.
augmented_answer = augmented_text.text
x = to_markdown(augmented_answer)
x

In [None]:
!pip install rouge_score sacrebleu transformers

In [None]:
!pip install bert_score

In [None]:
!pip install questeval

In [None]:
from rouge_score import rouge_scorer
from sacrebleu import BLEU  # BLEU score from sacrebleu library
from transformers import BertTokenizer, BertModel
import bert_score

In [None]:
# x is the plain LLM answer, y is the retrieval-augmented answer.
x, y = result.content, augmented_text.text

# ROUGE-L F-measure between the two answers, with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_result = scorer.score(x, y)['rougeL']
rouge_l_score = rouge_l_result.fmeasure
print(f"ROUGE-L Score: {rouge_l_score:.4f}")

# BERTScore: the element at index 2 of the returned tuple is reduced to a
# Python number with .item().
bert_scorer = bert_score.score(
    [x],
    [y],
    lang='en',
    model_type='bert-base-uncased',
)
bert_score_value = bert_scorer[2].item()
print(f"BERTScore: {bert_score_value:.4f}")

# BLEU is reported for reference only; it does not enter the average below.
bleu_scorer = BLEU()
bleu_result = bleu_scorer.corpus_score([x], [[y]])
bleu_score = bleu_result.score
print(f"BLEU Score: {bleu_score:.4f}")

# Mean of ROUGE-L and BERTScore.
average_score = (rouge_l_score + bert_score_value) / 2
print(f"Average Score: {average_score:.4f}")

# An average below 0.5 marks the answer as a hallucination.
is_hallucination = average_score < 0.5
print("Flagged as hallucination." if is_hallucination else "Not flagged as hallucination.")


In [None]:
import pandas as pd
from rouge_score import rouge_scorer
import bert_score
from sacrebleu import BLEU

# Per-row scores of the plain LLM ("x") against the expected output.
rouge_l_scores = []
bert_scores = []
bleu_scores = []
# Per-row scores of the retrieval-augmented generator ("y"); previously these
# were computed and printed per row but never aggregated.
rouge_l_scores_y = []
bert_scores_y = []
bleu_scores_y = []

# Read the CSV file
data = pd.read_csv('/content/drive/MyDrive/merged_final_dataset.csv')

# Rows without both an input and an expected output cannot be scored (a NaN
# input would raise a TypeError when building the prompt), so skip them.
scorable_rows = data.dropna(subset=['input', 'output'])
skipped_rows = len(data) - len(scorable_rows)
if skipped_rows:
    print(f"Skipping {skipped_rows} row(s) with a missing input or output.")

# Initialize scorers once. The ROUGE scorer gets its own name so the imported
# `rouge_scorer` module is not overwritten (which would break re-running any
# cell that calls `rouge_scorer.RougeScorer`). The BERTScorer is built once so
# the BERT model is not reloaded for every row.
rouge_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
bleu_scorer = BLEU()
bert_f1_scorer = bert_score.BERTScorer(lang='en', model_type='bert-base-uncased')


def _score_against_reference(candidate, reference):
    """Return (ROUGE-L F-measure, BERTScore F1, BLEU) for candidate vs. reference."""
    # RougeScorer.score takes (target, prediction).
    rouge_l = rouge_l_scorer.score(reference, candidate)['rougeL'].fmeasure
    # BERTScorer.score returns (P, R, F); index 2 is the F1 tensor.
    bert_f1 = bert_f1_scorer.score([candidate], [reference])[2].item()
    bleu = bleu_scorer.corpus_score([candidate], [[reference]]).score
    return rouge_l, bert_f1, bleu


def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when the list is empty."""
    return sum(values) / len(values) if values else float('nan')


# Iterate over rows
for _, row in scorable_rows.iterrows():
    input_text = row['input']
    expected_output = row['output']

    # Generate outputs from LLMs x and y
    output_x = llm.invoke('Answer the question based on the given context: ' + input_text).content
    output_y = augmented_generator(input_text).text

    # Calculate scores
    rouge_l_score_x, bert_score_x, bleu_score_x = _score_against_reference(output_x, expected_output)
    rouge_l_score_y, bert_score_y, bleu_score_y = _score_against_reference(output_y, expected_output)

    # Print scores
    print(f"Input: {input_text}")
    print(f"Expected Output: {expected_output}")
    print(f"Output x: {output_x}")
    print(f"Output y: {output_y}")
    print(f"ROUGE-L Score x: {rouge_l_score_x:.4f}, ROUGE-L Score y: {rouge_l_score_y:.4f}")
    print(f"BERTScore x: {bert_score_x:.4f}, BERTScore y: {bert_score_y:.4f}")
    print(f"BLEU Score x: {bleu_score_x:.4f}, BLEU Score y: {bleu_score_y:.4f}")
    print("-" * 50)

    rouge_l_scores.append(rouge_l_score_x)
    bert_scores.append(bert_score_x)
    bleu_scores.append(bleu_score_x)
    rouge_l_scores_y.append(rouge_l_score_y)
    bert_scores_y.append(bert_score_y)
    bleu_scores_y.append(bleu_score_y)

# Averages for the plain LLM (x); NaN is reported when no rows were scored.
average_rouge_l_score = _mean(rouge_l_scores)
print(f"Average ROUGE-L Score: {average_rouge_l_score:.4f}")

average_bert_score = _mean(bert_scores)
print(f"Average BERT Score: {average_bert_score:.4f}")

average_bleu_score = _mean(bleu_scores)
print(f"Average BLEU Score: {average_bleu_score:.4f}")

# Averages for the retrieval-augmented generator (y).
average_rouge_l_score_y = _mean(rouge_l_scores_y)
print(f"Average ROUGE-L Score (augmented): {average_rouge_l_score_y:.4f}")

average_bert_score_y = _mean(bert_scores_y)
print(f"Average BERT Score (augmented): {average_bert_score_y:.4f}")

average_bleu_score_y = _mean(bleu_scores_y)
print(f"Average BLEU Score (augmented): {average_bleu_score_y:.4f}")
