# Deployment Testing & Evaluation

## Importing Libraries and Keys

In [2]:
import os
import requests
import json
import time
import subprocess
from dotenv import load_dotenv
from langchain.evaluation.qa import QAEvalChain
from langchain_openai import ChatOpenAI

In [3]:
# Read the .env file into the process environment, then pick up the OpenAI
# key that the evaluation LLM is constructed with further down.
load_dotenv()
api_key = os.environ.get('IH_OPENAI_API_KEY')


## Deployment Testing

In [2]:
# Endpoint of the locally running chatbot service and the header sent with
# every request.
url = 'http://localhost:5000/query'
headers = {'Content-Type': 'application/json'}

# (connect, read) timeouts in seconds. Without a timeout requests waits
# forever on a stalled server; the read timeout is deliberately generous
# because some answers may take a while to produce -- tune as needed.
REQUEST_TIMEOUT = (5, 300)

queries = [
    "how are you?",
    "tell me about the number 37?",
    "What's the significance of number 37?", # Similar Repeat question to check memory usage
    "where do you get this info from?",
    "Can you fetch me some YouTube video URLs about physics?",
    "Tell me about the speed limit in the universe.",
    "Can you summarize the video about imaginary numbers?",
    "Who is the president of Spain?",
    "Can you share a video about quantum computing?",
    "Summarize this video https://www.youtube.com/watch?v=vVKFBaaL4uM",
    "Who's Henrietta Leavitt?",
    "tell me about black holes.",
    "What is the relevance of black holes in quantum mechanics?",
    "What is the relevance of black holes in quantum mechanics?",  # Exact Repeat question to check memory usage
    "Fetch me a video explaining quantum entanglement.",
    "Fetch me a video explaining temperature changes.",
    "Who are some notable scientists in the field of quantum mechanics?"
]

# Send every query in order and print the reply. A failure on one query
# (transport error, bad status, undecodable body) is reported and the loop
# moves on, so a single bad request does not abort the whole run.
for query in queries:
    payload = {
        'query': query,
        'context': ''
    }
    try:
        response = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to get a response for query: {query}. Request error: {exc}")
        print("- " * 50)
        continue

    if response.status_code == 200:
        try:
            body = response.json()
        except ValueError as exc:
            # The server answered 200 but the body was not valid JSON.
            print(f"Failed to decode the response for query: {query}. Error: {exc}")
            print("- " * 50)
            continue
        print(f"Query: {query}")
        print("Response from chatbot:", body)
        print(" ")
        print(" ")
        print("- " * 50)
    else:
        print(f"Failed to get a response for query: {query}. Status code:", response.status_code)
        print("- " * 50)


Query: how are you?
Response from chatbot: {'response': 'Based on the retrieved information, it seems like the person being referred to in the documents is doing well. They mention enjoying science over math, missing Canada, loving science to the moon and back, and not getting enough sleep. They also express gratitude for their team and sponsors, indicating a positive attitude overall.', 'source': 'Retriever'}
 
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Query: tell me about the number 37?
Response from chatbot: {'response': 'The number 37 seems to hold a significant place in human psychology, mathematics, and even random decision-making. It appears that people have a tendency to choose 37 when asked for a random number between 1 and 100, which is known as the "blue 37 phenomenon." This number has also been found to be commonly chosen by computer programmers and in various surveys, making it stand out as a curious preference.\

In [3]:
url = 'http://localhost:5000/query'
headers = {'Content-Type': 'application/json'}

# (connect, read) timeouts in seconds; without one a stalled server would
# block this cell forever. The read timeout is generous on purpose -- tune
# as needed.
REQUEST_TIMEOUT = (5, 300)

# Define test queries
test_queries = [
    "Who is the president of Spain?",  # Should trigger Wikipedia as retriever may not have this info
    "Who is Henrietta Leavitt?",       # Should trigger Wikipedia as retriever may not have this info
    "What is the capital of senegal?",  # General question to test Wikipedia fallback
    "Tell me about falafel",  # General question to test Wikipedia fallback
    "Disambiguation test"              # Intentionally ambiguous to trigger disambiguation error handling
]

def test_query(query):
    """POST ``query`` to the chatbot endpoint and return the decoded reply.

    Args:
        query: The question text placed in the ``query`` field of the payload.

    Returns:
        The parsed JSON body when the server answers with HTTP 200.
        Otherwise a dict with a single ``"error"`` key describing the
        failure (non-200 status, transport error, or undecodable body).
    """
    payload = {
        'query': query,
        'context': ''
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection refused, timeout, etc.: report it instead of aborting the run.
        return {"error": f"Failed to get a response. Request error: {exc}"}

    if response.status_code != 200:
        return {"error": f"Failed to get a response. Status code: {response.status_code}"}

    try:
        return response.json()
    except ValueError as exc:
        # The server answered 200 but the body was not valid JSON.
        return {"error": f"Failed to decode JSON response: {exc}"}

for query in test_queries:
    result = test_query(query)
    print(f"\nQuery: {query}")
    print(f"Response: {result}")




Query: Who is the president of Spain?
Response: {'response': "From Memory: I couldn't find relevant information in Veritasium's YouTube channel, but here's some information from Wikipedia:\n\nThe Congress of Deputies (Spanish: Congreso de los Diputados) is the lower house of the Cortes Generales, Spain's legislative branch, the upper house being the Senate. The Congress meets in the Palace of the Parliament (Palacio de las Cortes) in Madrid.\nCongress has 350 members elected from fifty-two constituencies (the fifty provinces and two autonomous cities) using closed list D'Hondt proportional representation. Deputies serve four-year terms. The presiding officer and speaker is the President of the Congress of Deputies, who is elected by the members at the first sitting of Congress after an election.\nThe two principle actors in Congress are parliamentary groups and parliamentary committees (Spanish: comissiones). All MPs are required to be members of a parliamentary group, the institution

In [5]:
url = 'http://localhost:5000/query'
headers = {'Content-Type': 'application/json'}

# (connect, read) timeouts in seconds; without one a stalled server would
# block this cell forever. The read timeout is generous on purpose -- tune
# as needed.
REQUEST_TIMEOUT = (5, 300)

# Define test queries
test_queries = [
    "Can you summarize the video about imaginary numbers?",
    "Fetch me a video explaining quantum entanglement.",
    "Fetch me a video about spiders.",
    "Can you fetch me some YouTube video URLs about physics?"
]

def test_query(query):
    """POST ``query`` to the chatbot endpoint and return the decoded reply.

    Args:
        query: The question text placed in the ``query`` field of the payload.

    Returns:
        The parsed JSON body when the server answers with HTTP 200.
        Otherwise a dict with a single ``"error"`` key describing the
        failure (non-200 status, transport error, or undecodable body).
    """
    payload = {
        'query': query,
        'context': ''
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection refused, timeout, etc.: report it instead of aborting the run.
        return {"error": f"Failed to get a response. Request error: {exc}"}

    if response.status_code != 200:
        return {"error": f"Failed to get a response. Status code: {response.status_code}"}

    try:
        return response.json()
    except ValueError as exc:
        # The server answered 200 but the body was not valid JSON.
        return {"error": f"Failed to decode JSON response: {exc}"}

for query in test_queries:
    result = test_query(query)
    print(f"\nQuery: {query}")
    print(f"Response: {result}")



Query: Can you summarize the video about imaginary numbers?
Response: {'response': "The video discusses the historical development of mathematics, particularly focusing on the solution to the cubic equation. It starts with the challenges faced by ancient civilizations in solving the cubic equation and how mathematicians eventually separated algebra from geometry to find solutions. The narrative follows the journey of mathematicians like Scipione del Ferro and Niccolò Fontana Tartaglia, who made significant contributions to solving cubic equations. The video details the math duel between Fior and Tartaglia, leading to Tartaglia's discovery of a method to solve depressed cubics. It then delves into the story of Gerolamo Cardano, who learns Tartaglia's method and further develops it to solve the full cubic equation. The video highlights the introduction of complex numbers, the role of imaginary numbers, and their application in physics, specifically in the Schrödinger equation and quantu

In [60]:
url = 'http://localhost:5000/query'
headers = {'Content-Type': 'application/json'}

# (connect, read) timeouts in seconds; without one a stalled server would
# block this cell forever. The read timeout is generous on purpose -- tune
# as needed.
REQUEST_TIMEOUT = (5, 300)

# Define test queries
test_queries = [
    "Summarize this video about jumping spiders and color",
]

def test_query(query):
    """POST ``query`` to the chatbot endpoint and return the decoded reply.

    Args:
        query: The question text placed in the ``query`` field of the payload.

    Returns:
        The parsed JSON body when the server answers with HTTP 200.
        Otherwise a dict with a single ``"error"`` key describing the
        failure (non-200 status, transport error, or undecodable body).
    """
    payload = {
        'query': query,
        'context': ''
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection refused, timeout, etc.: report it instead of aborting the run.
        return {"error": f"Failed to get a response. Request error: {exc}"}

    if response.status_code != 200:
        return {"error": f"Failed to get a response. Status code: {response.status_code}"}

    try:
        return response.json()
    except ValueError as exc:
        # The server answered 200 but the body was not valid JSON.
        return {"error": f"Failed to decode JSON response: {exc}"}

for query in test_queries:
    result = test_query(query)
    print(f"\nQuery: {query}")
    print(f"Response: {result}")



Query: Summarize this video about jumping spiders and color
Response: {'response': "This video discusses the perception of color and its significance in the animal kingdom, focusing on jumping spiders. It delves into how different animals perceive color, the evolution of color vision, and the unique abilities of jumping spiders in terms of color perception. Researchers study jumping spiders' retinas to understand their color vision capabilities, including the presence of dichromats, trichromats, and tetrachromats among different species. The video also highlights experiments showcasing how color cues play a role in feeding behavior and mating displays among jumping spiders. The evolutionary advantages of expanded color vision are explored, shedding light on the intricate relationship between color perception, behavior, and genetic adaptations in these fascinating arachnids.", 'source': 'VideoSummarizerAgent'}


## Evaluation

In [26]:
# Load the reference query/response pairs. The encoding is given explicitly
# so the file is decoded the same way on every platform instead of relying
# on the locale default.
with open('./data/4-reference_responses.json', 'r', encoding='utf-8') as f:
    queries_responses = json.load(f)

In [28]:
# Show the first two loaded entries (rendered as the notebook cell output).
queries_responses[:2]


[{'query': 'how are you?',
  'reference_response': 'I am an AI assistant programmed to provide information and assistance. Thank you for asking!'},
 {'query': 'tell me about the number 37?',
  'reference_response': 'The number 37 seems to hold a special place in people\'s minds for various reasons, as highlighted in the retrieved information from Veritasium videos. People tend to gravitate towards 37 when asked to pick a random number between 1 and 100, much like the "blue seven phenomenon" where black, blue, and seven are commonly selected. This preference for 37 extends to computer programmers and even in magic tricks like the "37 force." \n\nMoreover, mathematically, 37 has significance in the context of the secretary problem or the marriage problem, where exploring and rejecting 37% of options can lead to optimal decision-making. The number 37 also holds importance in prime factorization analysis, being the median second prime factor of all numbers, showcasing its unique mathematic

In [30]:
# URL and headers for the local server
url = 'http://localhost:5000/query'
headers = {'Content-Type': 'application/json'}

# (connect, read) timeouts in seconds; without one a stalled server would
# block the evaluation forever. The read timeout is generous on purpose --
# tune as needed.
REQUEST_TIMEOUT = (5, 300)

# Function to generate chatbot responses using HTTP POST requests
def generate_response(query):
    """Ask the local chatbot ``query`` and return its answer as text.

    Args:
        query: The question text placed in the ``query`` field of the payload.

    Returns:
        The value of the ``response`` field of the JSON reply on HTTP 200
        (or a placeholder string when that field is absent). On any failure
        (non-200 status, transport error, undecodable body) a string
        starting with ``"Error:"`` is returned instead of raising, so a
        single bad request does not abort a batch of calls.
    """
    payload = {
        'query': query,
        'context': ''  # Adjust if additional context is needed
    }
    try:
        response = requests.post(url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        return f"Error: Failed to get a response. Request error: {exc}"

    if response.status_code != 200:
        return f"Error: Failed to get a response. Status code: {response.status_code}"

    try:
        return response.json().get('response', 'No response field in JSON')
    except ValueError as exc:
        # The server answered 200 but the body was not valid JSON.
        return f"Error: Failed to decode JSON response: {exc}"

In [60]:
# Load the JSON file with existing evaluation results; start from an empty
# history when the file has not been created yet.
results_file_path = './data/5-evaluation_results.json'
try:
    with open(results_file_path, 'r', encoding='utf-8') as f:
        all_results = json.load(f)
except FileNotFoundError:
    all_results = []

# Query the chatbot for every reference item and collect the
# (query, reference, prediction) triples handed to the evaluator.
evaluation_data = []
for item in queries_responses:
    query = item['query']
    chatbot_response = generate_response(query)
    reference_response = item['reference_response']
    item['chatbot_response'] = chatbot_response  # Store generated response for evaluation

    evaluation_data.append({
        'query': query,
        'reference': reference_response,
        'prediction': chatbot_response
    })

# Initialize QAEvalChain with the LLM instance
llm = ChatOpenAI(api_key=api_key)
qa_eval_chain = QAEvalChain.from_llm(llm)

# Evaluate the responses. The same list is passed as both examples and
# predictions because each entry carries the reference and the prediction.
eval_results = qa_eval_chain.evaluate(
    examples=evaluation_data,
    predictions=evaluation_data,
    question_key='query',
    answer_key='reference',
    prediction_key='prediction',
)

# Attach each evaluation verdict to its originating item
results = []
for item, eval_result in zip(queries_responses, eval_results):
    item['evaluation'] = eval_result
    results.append(item)

# Append new results to existing results
all_results.extend(results)

# Save all results (previous runs plus this one) to a JSON file
with open(results_file_path, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=4)

# Print a summary of the evaluation across all stored runs. The rate falls
# back to 0.0 when there is nothing to evaluate, avoiding a division by zero.
success_count = sum(1 for result in all_results if result['evaluation']['results'] == "CORRECT")
total_evaluations = len(all_results)
average_success = success_count / total_evaluations if total_evaluations else 0.0

print(f"Average Success Rate: {average_success * 100}%")

Average Success Rate: 72.38095238095238%


In [66]:
# Tally, in a single pass over the stored results, how often each query was
# evaluated and how often it was graded 'CORRECT', plus the overall number
# of correct evaluations. Queries keep the order in which they first appear.
query_success_count = {}
success_count = 0
for result in all_results:
    counts = query_success_count.setdefault(result['query'], {'total': 0, 'correct': 0})
    counts['total'] += 1
    if result['evaluation']['results'] == 'CORRECT':
        counts['correct'] += 1
        success_count += 1

# Calculate and print the average success rate; report 0 when there are no
# results instead of dividing by zero.
total_evaluations = len(all_results)
average_success = success_count / total_evaluations if total_evaluations else 0.0
print(f"Average Success Rate: {average_success * 100:.2f}%\n")

# Print each query and its success rate. Every entry has total >= 1 because
# it is only created when a result for that query is counted.
for query, counts in query_success_count.items():
    total = counts['total']
    correct = counts['correct']
    success_rate = (correct / total) * 100
    print(f"Query: {query}")
    print(f"Success Rate: {success_rate:.2f}%\n")

Average Success Rate: 72.38%

Query: how are you?
Success Rate: 42.86%

Query: tell me about the number 37?
Success Rate: 100.00%

Query: where do you get this info from?
Success Rate: 100.00%

Query: Can you fetch me some YouTube video URLs about physics?
Success Rate: 85.71%

Query: Tell me about the speed limit in the universe.
Success Rate: 100.00%

Query: Can you summarize the video about imaginary numbers?
Success Rate: 100.00%

Query: Who is the president of Spain?
Success Rate: 42.86%

Query: Can you share a video about quantum computing?
Success Rate: 28.57%

Query: Summarize this video https://www.youtube.com/watch?v=vVKFBaaL4uM
Success Rate: 100.00%

Query: Who's Henrietta Leavitt?
Success Rate: 100.00%

Query: tell me about black holes.
Success Rate: 85.71%

Query: What is the relevance of black holes in quantum mechanics?
Success Rate: 85.71%

Query: Fetch me a video explaining quantum entanglement.
Success Rate: 28.57%

Query: give me a video about temperatures.
Success R