In [None]:
import random

In [None]:
def monte_carlo_eval(prompt, trials=100):
    """
    Score a prompt by averaging the scores of randomly simulated responses.

    Each trial draws one response category uniformly at random and adds its
    score ('highly relevant' = 3, 'somewhat relevant' = 2, 'irrelevant' = 1).
    NOTE: `prompt` is never inspected -- this is a pure simulation, so the
    result depends only on the state of the `random` module.

    Parameters:
    prompt (str): Prompt being "evaluated" (currently unused).
    trials (int): Number of random trials to average over; must be >= 1.

    Returns:
    float: Mean score over all trials, between 1 and 3 inclusive.

    Raises:
    ValueError: If `trials` is less than 1 (the average would be undefined).
    """
    if trials < 1:
        raise ValueError("trials must be a positive integer")

    # Simulated response categories and the score each one is worth
    scores = {'highly relevant': 3, 'somewhat relevant': 2, 'irrelevant': 1}
    response_types = list(scores)

    # One random draw per trial; the average score represents the evaluation
    total_score = sum(scores[random.choice(response_types)] for _ in range(trials))
    return total_score / trials

def elo_eval(prompt, base_rating=1500, K=30, opponent_rating=1600):
    """
    Apply one simulated Elo update to a prompt's rating.

    The match result is drawn uniformly at random from win / loss / draw;
    `prompt` itself is never inspected, so the result depends only on the
    state of the `random` module and the numeric arguments.

    Parameters:
    prompt (str): Prompt being "evaluated" (currently unused).
    base_rating (float): Rating of the prompt before the simulated match.
    K (float): Maximum change in rating from a single match.
    opponent_rating (float): Fixed rating of the simulated opponent.

    Returns:
    float: The rating after the simulated match.
    """
    # Simulate the outcome of the prompt against standard criteria:
    # randomly decide if the prompt 'wins', 'loses', or 'draws'
    outcome = random.choice(['win', 'loss', 'draw'])
    actual_score = {'win': 1, 'loss': 0, 'draw': 0.5}[outcome]

    # Elo expected score: own strength relative to the combined strength
    R_base = 10 ** (base_rating / 400)
    R_opponent = 10 ** (opponent_rating / 400)
    expected_score = R_base / (R_base + R_opponent)

    # Move the rating toward the observed result, by at most K points
    return base_rating + K * (actual_score - expected_score)

In [None]:
def elo_ratings_func(prompts, elo_ratings, K=30, opponent_rating=1600):
    """
    Update Elo ratings for a list of prompts based on simulated outcomes.

    Each prompt plays one simulated match against a fixed-rating opponent;
    the result is drawn uniformly at random from win / loss / draw.

    Parameters:
    prompts (list): List of prompts to be evaluated.
    elo_ratings (dict): Current Elo ratings for each prompt. Modified IN PLACE;
        every entry of `prompts` must already be a key.
    K (int): Maximum change in rating.
    opponent_rating (int): Fixed rating of the opponent for simulation.

    Returns:
    dict: The same `elo_ratings` object, with updated ratings.

    Raises:
    KeyError: If a prompt has no entry in `elo_ratings`.
    """
    outcome_scores = {'win': 1, 'loss': 0, 'draw': 0.5}
    outcomes = list(outcome_scores)

    # The opponent is fixed, so its strength term is computed once
    R_opponent = 10 ** (opponent_rating / 400)

    for prompt in prompts:
        # Simulate an outcome against the standard criteria or another prompt
        actual_score = outcome_scores[random.choice(outcomes)]

        # Calculate the new rating based on the outcome
        R_base = 10 ** (elo_ratings[prompt] / 400)
        expected_score = R_base / (R_base + R_opponent)
        elo_ratings[prompt] += K * (actual_score - expected_score)

    return elo_ratings

# Example usage
# Seed the RNG so the simulated outcomes (and therefore the printed ranking)
# are the same on every Restart & Run All.
random.seed(42)

prompts = [
    "Who founded OpenAI?",
    "What was the initial goal of OpenAI?",
    "What did OpenAI release in 2016?",
    "What project did OpenAI showcase in 2018?",
    "How did the AI agents in OpenAI Five work together?",
]
elo_ratings = {prompt: 1500 for prompt in prompts}  # Initial ratings

# Conduct multiple rounds of evaluation
for _ in range(10):  # Number of rounds
    elo_ratings = elo_ratings_func(prompts, elo_ratings)

# Sort prompts by their final Elo ratings (highest first)
sorted_prompts = sorted(prompts, key=lambda x: elo_ratings[x], reverse=True)

# Print the ranked prompts
for prompt in sorted_prompts:
    print(f"{prompt}: {elo_ratings[prompt]}")

In [None]:
How did the AI agents in OpenAI Five work together?: 1548.3188869745466
What did OpenAI release in 2016?: 1533.8606319951714
Who founded OpenAI?: 1533.4486254910844
What project did OpenAI showcase in 2018?: 1528.63542805577
What was the initial goal of OpenAI?: 1498.6276146134444

#### prompts evaluation

In [None]:
"What was the initial goal of OpenAI?": 1583.6551603182484
This prompt has the highest rating, suggesting it was evaluated as the most relevant, accurate, or valuable.
"Who founded OpenAI?": 1550.8315837034786
This prompt also performed well, but slightly less so than the first one.
"What project did OpenAI showcase in 2018?": 1524.894352475904 (moderate rating)
"What did OpenAI release in 2016?": 1518.8441077283887
These prompts have lower ratings, indicating they were evaluated as less relevant or valuable compared to the top-rated prompts.
"How did the AI agents in OpenAI Five work together?": 1501.4300442180024
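This prompt is closer to the baseline rating, suggesting its performance was near average under the evaluation criteria. Note that these ratings come from a different run than the output shown above: the match outcomes are drawn at random, so both the ratings and the ranking change from run to run and do not reflect the content of the prompts.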

In [None]:
def evaluate_prompt(main_prompt, test_cases):
    """
    Run both evaluators on the main prompt and on every test case.

    Parameters:
    main_prompt (str): Prompt stored under the 'main_prompt' key.
    test_cases (list): Prompts stored under 'test_case_1', 'test_case_2', ...
        in the order given.

    Returns:
    dict: Maps each label to a dict holding the 'Monte Carlo Evaluation' and
    'Elo Rating Evaluation' results for that prompt.
    """
    def _run_both(text):
        # Monte Carlo is evaluated first, then Elo, for every prompt
        return {
            'Monte Carlo Evaluation': monte_carlo_eval(text),
            'Elo Rating Evaluation': elo_eval(text)
        }

    report = {'main_prompt': _run_both(main_prompt)}

    # Test cases are labelled starting from 1
    for number, case in enumerate(test_cases, start=1):
        report[f'test_case_{number}'] = _run_both(case)

    return report

In [None]:
# Prompt under evaluation (typo "OepenAI" corrected) and its comparison set
main_prompt = "why we use OpenAI?"
test_cases = [
    "Who founded OpenAI?",
    "What was the initial goal of OpenAI?",
    "What did OpenAI release in 2016?",
    "What project did OpenAI showcase in 2018?",
    "How did the AI agents in OpenAI Five work together?",
]
result = evaluate_prompt(main_prompt, test_cases)
print(result)

In [None]:
{'main_prompt': {'Monte Carlo Evaluation': 1.93, 'Elo Rating Evaluation': 1519.2019499940866}, 'test_case_1': {'Monte Carlo Evaluation': 1.97, 'Elo Rating Evaluation': 1504.2019499940866}, 'test_case_2': {'Monte Carlo Evaluation': 1.99, 'Elo Rating Evaluation': 1489.2019499940866}, 'test_case_3': {'Monte Carlo Evaluation': 2.01, 'Elo Rating Evaluation': 1519.2019499940866}, 'test_case_4': {'Monte Carlo Evaluation': 1.93, 'Elo Rating Evaluation': 1504.2019499940866}, 'test_case_5': {'Monte Carlo Evaluation': 2.0, 'Elo Rating Evaluation': 1504.2019499940866}}

#### Interpretation
#### 1. Monte Carlo Evaluation:
Score range: from 1 to 3, with higher scores indicating greater relevance or quality of the prompt.
###### Interpretation:
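1.94 (Main Prompt): Slightly below the midpoint of 2, i.e. slightly below average relevance or quality. (The figures in this section come from an earlier run; because the evaluation is randomized, they differ from the output printed above.)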
2.06, 2.02, 1.89, 1.98, 2.03 (Test Cases): Scores around 2 suggest moderate relevance or quality. The variation indicates some test cases are deemed slightly more relevant or higher quality than others.
#### 2. Elo Rating Evaluation:
Base Rating: Usually starts at 1500, with changes based on the 'performance' of the prompt against a set of standards.
Higher than 1500: Indicates the prompt performed better than average.
Lower than 1500: Indicates the prompt performed worse than average.
###### Interpretation:
1489.20 (Main Prompt): Slightly below the average performance.
1519.20 (Test Cases 1, 2, 4, 5): These prompts are rated above the average, suggesting better performance.
1504.20 (Test Case 3): Slightly above average performance.
#### Overall Interpretation:
Main Prompt: Both evaluations suggest that the main prompt is slightly below average in terms of relevance and quality.
Test Cases: Generally, the test cases are rated as average or slightly above average in both relevance and quality. Test Cases 1, 2, 4, and 5 seem to perform particularly well in the Elo evaluation, indicating they might be more effective or well-structured prompts compared to the main prompt and Test Case 3.

### RAGAS Evaluation

In [None]:
import requests
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter  
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv,find_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv,find_dotenv
# 
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [None]:
# Data loader
def data_loader(file_path= '../prompts/context.txt', chunk_size=500, chunk_overlap=50):
    """
    Load a text file and split it into chunks.

    Parameters:
    file_path (str): Path of the text file to load with TextLoader.
    chunk_size (int): Forwarded to CharacterTextSplitter as `chunk_size`.
    chunk_overlap (int): Forwarded to CharacterTextSplitter as `chunk_overlap`.

    Returns:
    The result of `CharacterTextSplitter.split_documents` on the loaded
    documents (presumably a list of LangChain Document chunks -- confirm
    against the installed langchain version).
    """
    documents = TextLoader(file_path).load()

    # Chunk the data
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)

In [None]:
def create_retriever(chunks):
    """
    Index the given chunks in an embedded Weaviate instance and return a retriever.

    Parameters:
    chunks: Documents passed to `Weaviate.from_documents`.

    Returns:
    The object returned by `as_retriever()` on the populated vector store.
    """
    # Read environment variables from the located .env file
    # (per the original author, this is where the OpenAI API key comes from)
    load_dotenv(find_dotenv())

    # Vector database backed by an embedded Weaviate client
    embedded_client = weaviate.Client(embedded_options=EmbeddedOptions())

    # Embed the chunks with OpenAI embeddings and store them
    store = Weaviate.from_documents(
        client=embedded_client,
        documents=chunks,
        embedding=OpenAIEmbeddings(),
        by_text=False,
    )

    # Expose the vector store as a retriever to enable semantic search
    return store.as_retriever()

In [None]:
# NOTE: a leftover debugging cell (`import chunk` followed by a bare `chunk`)
# was removed here. It imported the unrelated stdlib `chunk` module, which no
# longer exists on Python 3.13+, so the cell failed on a fresh kernel.

In [None]:
# Load and split the default context file, then index the pieces for retrieval
chunks = data_loader()
retriever = create_retriever(chunks)