In [1]:
%pip install --upgrade --quiet  langchain langchain-openai langchain_community datasets pyarrow==15.0.2 google-search-results scipy

In [2]:
import os
from google.colab import userdata

OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
SERP_API_KEY = userdata.get('SERP_API_KEY')


In [3]:
 os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
 os.environ['SERP_API_KEY'] = SERP_API_KEY
#  os.getenv('OPENAI_API_KEY')

In [4]:
# Create the Evaluator
from langchain.evaluation import load_evaluator

eval_chain = load_evaluator("pairwise_string")

In [5]:
type(eval_chain)

In [6]:
from langchain.evaluation.loading import load_dataset

dataset = load_dataset("langchain-howto-queries")

In [7]:
len(dataset)

50

In [25]:
from langchain.agents import AgentType, Tool, initialize_agent
from langchain_community.utilities import SerpAPIWrapper
from langchain_openai import ChatOpenAI

# Initialize the language model
# You can add your own OpenAI API key by adding openai_api_key="<your_api_key>"
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo",api_key=os.getenv('OPENAI_API_KEY'))

# Initialize the SerpAPIWrapper for search functionality
# Replace <your_api_key> in openai_api_key="<your_api_key>" with your actual SerpAPI key.
search = SerpAPIWrapper(serpapi_api_key=SERP_API_KEY)

# Define a list of tools offered by the agent
tools = [
    Tool(
        name="Search",
        func=search.run,
        coroutine=search.arun,
        description="Useful when you need to answer questions about current events. You should ask targeted questions.",
    ),
]

In [26]:
functions_agent = initialize_agent(
    tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False
)
conversations_agent = initialize_agent(
    tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False
)

In [27]:
import asyncio

from tqdm.notebook import tqdm

results = []
agents = [functions_agent, conversations_agent]
concurrency_level = 6  # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.

# We will only run the first 20 examples of this dataset to speed things up
# This will lead to larger confidence intervals downstream.
batch = []
for example in tqdm(dataset[:20]):
    batch.extend([agent.acall(example["inputs"]) for agent in agents])
    if len(batch) >= concurrency_level:
        batch_results = await asyncio.gather(*batch, return_exceptions=True)
        results.extend(list(zip(*[iter(batch_results)] * 2)))
        batch = []
if batch:
    batch_results = await asyncio.gather(*batch, return_exceptions=True)
    results.extend(list(zip(*[iter(batch_results)] * 2)))

  0%|          | 0/20 [00:00<?, ?it/s]

In [28]:
results

[({'input': 'Can I use LangChain to automatically rate limit or retry failed API calls?',
   'output': 'LangChain is a framework designed to simplify the creation of applications using large language models. It provides AI developers with tools to connect language models with external data sources and is supported by an active community. However, it is not specifically designed for automatically rate limiting or retrying failed API calls. \n\nTo automatically rate limit or retry failed API calls, you may need to use other tools or libraries that are specifically designed for that purpose, such as Axios for JavaScript or requests library for Python. These tools provide features for handling rate limiting, retries, and other HTTP request-related functionalities.'},
  ValueError('An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: Final 

In [32]:
for example, (res_a, res_b) in zip(dataset, results):
  print(example["inputs"])
  print(res_a)
  print("#"*20)
  print(res_b)
  break

Can I use LangChain to automatically rate limit or retry failed API calls?
{'input': 'Can I use LangChain to automatically rate limit or retry failed API calls?', 'output': 'LangChain is a framework designed to simplify the creation of applications using large language models. It provides AI developers with tools to connect language models with external data sources and is supported by an active community. However, it is not specifically designed for automatically rate limiting or retrying failed API calls. \n\nTo automatically rate limit or retry failed API calls, you may need to use other tools or libraries that are specifically designed for that purpose, such as Axios for JavaScript or requests library for Python. These tools provide features for handling rate limiting, retries, and other HTTP request-related functionalities.'}
####################
An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True`

In [50]:
random.random()

0.879361988265433

In [58]:
import random


def predict_preferences(dataset, results) -> list:
    preferences = []

    for example, (res_a, res_b) in zip(dataset, results):
        input_ = example["inputs"]
        # Flip a coin to reduce persistent position bias
        if random.random() < 0.5:
            pred_a, pred_b = res_a, res_b
            a, b = "a", "b"
        else:
            pred_a, pred_b = res_b, res_a
            a, b = "b", "a"
        eval_res = eval_chain.evaluate_string_pairs(
            prediction=pred_a["output"] if isinstance(pred_a, dict) else str(pred_a),
            prediction_b=pred_b["output"] if isinstance(pred_b, dict) else str(pred_b),
            input=input_,
        )
        print(eval_res)
        print("pred_a: ", pred_a)
        print("*"*20)
        print("pred_b: ", pred_b)
        if eval_res["value"] == "A":
            preferences.append(a)
        elif eval_res["value"] == "B":
            preferences.append(b)
        else:
            preferences.append(None)  # No preference
    return preferences

In [59]:
preferences = predict_preferences(dataset[:1], results)

{'reasoning': "Assistant A's response is not helpful or relevant to the user's question. It seems to be an error message rather than an answer. On the other hand, Assistant B provides a clear and accurate response, explaining what LangChain is and suggesting other tools that can be used for rate limiting or retrying failed API calls. Therefore, Assistant B's response is more helpful, relevant, accurate, and demonstrates a greater depth of thought. \n\nFinal verdict: [[B]]", 'value': 'B', 'score': 0}
pred_a:  An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: Final Answer
********************
pred_b:  {'input': 'Can I use LangChain to automatically rate limit or retry failed API calls?', 'output': 'LangChain is a framework designed to simplify the creation of applications using large language models. It provides AI developers with too

In [57]:
preferences

['a']

In [38]:
preferences

['a',
 'a',
 'a',
 'a',
 'b',
 'a',
 'b',
 'a',
 'a',
 'a',
 'b',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'b',
 'a']

In [40]:
from collections import Counter

name_map = {
    "a": "OpenAI Functions Agent",
    "b": "Structured Chat Agent",
}
counts = Counter(preferences)
pref_ratios = {k: v / len(preferences) for k, v in counts.items()}
for k, v in pref_ratios.items():
    print(f"{name_map.get(k)}: {v:.2%}")

OpenAI Functions Agent: 80.00%
Structured Chat Agent: 20.00%


In [39]:
Counter(preferences)

Counter({'a': 16, 'b': 4})

In [41]:
from math import sqrt


def wilson_score_interval(
    preferences: list, which: str = "a", z: float = 1.96
) -> tuple:
    """Estimate the confidence interval using the Wilson score.

    See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
    for more details, including when to use it and when it should not be used.
    """
    total_preferences = preferences.count("a") + preferences.count("b")
    n_s = preferences.count(which)

    if total_preferences == 0:
        return (0, 0)

    p_hat = n_s / total_preferences

    denominator = 1 + (z**2) / total_preferences
    adjustment = (z / denominator) * sqrt(
        p_hat * (1 - p_hat) / total_preferences
        + (z**2) / (4 * total_preferences * total_preferences)
    )
    center = (p_hat + (z**2) / (2 * total_preferences)) / denominator
    lower_bound = min(max(center - adjustment, 0.0), 1.0)
    upper_bound = min(max(center + adjustment, 0.0), 1.0)

    return (lower_bound, upper_bound)

In [42]:
for which_, name in name_map.items():
    low, high = wilson_score_interval(preferences, which=which_)
    print(
        f'The "{name}" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).'
    )

The "OpenAI Functions Agent" would be preferred between 58.40% and 91.93% percent of the time (with 95% confidence).
The "Structured Chat Agent" would be preferred between 8.07% and 41.60% percent of the time (with 95% confidence).


In [17]:
# !pip install --upgrade scipy

In [48]:
from scipy import stats

preferred_model = max(pref_ratios, key=pref_ratios.get)
successes = preferences.count(preferred_model)
n = len(preferences) - preferences.count(None)
p_value = stats.binomtest(successes, n, p=0.5, alternative="two-sided").pvalue
print(
    f"""The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),
then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}
times out of {n} trials."""
)

The p-value is 0.01182. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),
then there is a 1.18179% chance of observing the OpenAI Functions Agent be preferred at least 16
times out of 20 trials.


In [None]:
pref_ratios