# Setup

In [1]:
!pip install datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install openai --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import datasets
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer
from tenacity import retry, wait_random_exponential, stop_after_attempt
import os
from openai import OpenAI
import random
import time
from termcolor import colored
import json
from IPython.display import display, Markdown, Latex
import ast

Testing the MMLU dataset before writing a function

In [7]:
dataset = datasets.load_dataset('cais/mmlu','all')

In [8]:
df = pd.DataFrame(dataset['test'])
df.head()

Unnamed: 0,question,subject,choices,answer
0,Find the degree for the given field extension ...,abstract_algebra,"[0, 4, 2, 6]",1
1,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...",abstract_algebra,"[8, 2, 24, 120]",2
2,Find all zeros in the indicated finite field o...,abstract_algebra,"[0, 1, 0,1, 0,4]",3
3,Statement 1 | A factor group of a non-Abelian ...,abstract_algebra,"[True, True, False, False, True, False, False,...",1
4,Find the product of the given polynomials in t...,abstract_algebra,"[2x^2 + 5, 6x^2 + 4x + 6, 0, x^2 + 1]",1


In [10]:


num_questions = len(df)
random_questions = random.sample(range(num_questions), 20)

df_selected = df.iloc[random_questions]
df_selected


Unnamed: 0,question,subject,choices,answer
13416,Butler & Stokes (1969) suggested that working ...,sociology,[socialization into working class families and...,0
11886,A landowner owned a large building in the city...,professional_law,[The non-assignability provision had no legal ...,0
11568,A man was paroled after serving five years in ...,professional_law,[He did not intend to use the gun for an unlaw...,2
5136,A piano teacher is helping a student learn a n...,high_school_psychology,"[Negative reinforcement, Negative punishment, ...",2
10151,Based on the analysis of oxygen isotope ratios...,prehistory,[It became increasingly complex due to increas...,1
6296,The paradox of well-being refers to the findin...,human_aging,"[Unhappy people, Older adults, People with ver...",1
7061,There are three main types of buying situatio...,marketing,"[Repeat purchases., Buyphases., Buyclasses., T...",2
12280,A 20-year-old female presents to the emergency...,professional_medicine,"[abdominal flat plate radiography, amylase lev...",3
1938,Inintel 8085A microprocessor ALE signal is mad...,electrical_engineering,[Enable the data bus to be used as low order a...,0
13476,Warner's study of the city of Natchez in the A...,sociology,[no longer a powerful influence upon race rela...,3


# Subset MMLU

In [11]:
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def _load_MMLU_test_frame():
  """Download the 'all' configuration of cais/mmlu and return its test split as a DataFrame.

  Only this step is retried: it is the one that goes through `datasets.load_dataset`.
  """
  dataset = datasets.load_dataset('cais/mmlu','all')
  return pd.DataFrame(dataset['test'])


def subset_MMLU_test(num=20):
  """
  Creates a random subset of the MMLU test split with the chosen number of questions.

  Args:
    num: number of questions to sample. Default is 20. Anything `int()` accepts is
      allowed, because the tool-calling agent passes the value as a string ("20").

  Returns:
    A DataFrame holding the sampled rows, keeping their original row labels.

  Raises:
    ValueError / TypeError: if `num` cannot be converted to an integer, or is
      negative or larger than the test split (raised by `random.sample`).

  Side effects:
    Writes the subset to "MMLU_random_20.csv". The name is fixed regardless of
    `num` because `create_formatted_prompts` reads that path by default.
  """
  # Validate before downloading so a bad argument fails immediately instead of
  # being retried with exponential back-off.
  num = int(num)
  df = _load_MMLU_test_frame()
  random_questions = random.sample(range(len(df)), num)
  df_selected = df.iloc[random_questions]
  df_selected.to_csv("MMLU_random_20.csv")
  return df_selected


In [None]:
df=subset_MMLU_test()
df

Unnamed: 0,question,subject,choices,answer
3806,"In Gideon v. Wainwright, the Supreme Court rul...",high_school_government_and_politics,"[to representation by an attorney, not to incr...",0
7342,"Regarding exons, which, if any, of the followi...",medical_genetics,[Some exons in protein-coding genes consist of...,0
12463,Thomas and Chess’s (1977) “goodness-of-fit” mo...,professional_psychology,[a mismatch between the child’s basic temperam...,0
3297,This question refers to the following informat...,high_school_european_history,"[the role of women in army regiments, the runn...",3
8746,For which of these two scenarios does the main...,moral_scenarios,"[Wrong, Wrong, Wrong, Not wrong, Not wrong, Wr...",3
12086,A police detective received an anonymous call ...,professional_law,"[No, because the detective had probable cause ...",2
12305,A 13-month-old child is brought to the emergen...,professional_medicine,"[Hepatitis, Influenza, Pertussis, Poliomyelitis]",1
227,Where are the vital centres located in the bra...,anatomy,"[midbrain, pons, medulla oblongata, cerebellum]",2
4731,A student eats 3 slices of pizza while studyin...,high_school_microeconomics,"[The student would not eat any more pizza., Th...",3
12614,Which of the following best describes elaborat...,professional_psychology,"[making new information meaningful, consciousl...",0


# Formatting prompts

In [40]:
def formatting_prompt(question, choices, subject):
    """Build the multiple-choice prompt sent to the model under evaluation.

    Args:
        question: the question text.
        choices: indexable collection; entries 0 to 3 are listed as answer options.
        subject: name of the subject the question comes from.

    Returns:
        The prompt string: a subject line, then the question, the four numbered
        choices and the answering instruction, each indented by four spaces.
    """
    indent = "    "
    instruction = (
        "Respond with the correct answer from the given choices. "
        "DO NOT GIVE EXPLANATIONS. "
        "Just the index of the choice and the choice itself. Nothing else."
    )
    lines = [f"this question is from {subject}", f"{indent}{question}"]
    for idx in range(4):
        lines.append(f"{indent}{idx}. {choices[idx]}")
    lines.append(f"{indent}{instruction}")
    return "\n".join(lines) + "\n"

In [41]:
def create_formatted_prompts(dataframe=None, file_path="MMLU_random_20.csv"):
    """Turn MMLU rows into single-message chat prompts.

    The CSV at `file_path` is tried first. `dataframe` is used as a fallback when
    the file is missing or lacks one of the required columns ('question',
    'choices', 'subject').

    NOTE(review): because the file wins whenever it is readable, a stale CSV
    silently overrides the `dataframe` argument. Confirm this precedence is intended.

    Args:
        dataframe: optional DataFrame with the required columns.
        file_path: path of the CSV written by `subset_MMLU_test`.

    Returns:
        A list with one entry per question; each entry is a one-element list
        `[{"role": "user", "content": <prompt>}]`. Empty if no usable source exists.
    """
    required_columns = ('question', 'choices', 'subject')

    def _build_prompts(frame):
        # Return the prompts for `frame`, or None when a required column is missing.
        if not all(column in frame.columns for column in required_columns):
            return None
        prompts = []
        for question, choices, subject in zip(frame['question'], frame['choices'], frame['subject']):
            # A CSV round trip stores the list of choices as its repr string.
            if isinstance(choices, str):
                choices = ast.literal_eval(choices)
            prompts.append(formatting_prompt(question, choices, subject))
        return prompts

    formatted_prompts = None
    try:
        formatted_prompts = _build_prompts(pd.read_csv(file_path))
        if formatted_prompts is None:
            print(f"Error: One or more required columns ('question', 'choices', 'subject') not found in {file_path}")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")

    # Fall back to the dataframe in both failure cases: file missing or unusable.
    if formatted_prompts is None and dataframe is not None:
        formatted_prompts = _build_prompts(dataframe)
        if formatted_prompts is None:
            print("Error: One or more required columns ('question', 'choices', 'subject') not found in the provided dataframe.")

    return [[{"role": "user", "content": prompt}] for prompt in (formatted_prompts or [])]

Unnamed: 0.1,Unnamed: 0,question,subject,choices,answer
0,2134,Write the prime factorization of 77. Use expon...,elementary_mathematics,"['7 • 11^2 • 13', '7^2 • 11', '7 • 11 • 13', '...",3
1,9877,Nussbaum claims that the specification of part...,philosophy,"['will always consist of a single answer.', 'm...",1
2,7393,In the film 'The Talented Mr Ripley' who plays...,miscellaneous,"['Jude Law', 'Matt Damon', 'Dustin Hoffman', '...",1
3,12951,A person who is low in self-monitoring (Snyder...,professional_psychology,"['his/her life scripts.', 'his/her own feeling...",1
4,11756,A businessman was the owner of an idyllic lake...,professional_law,"['The interest created is a reverter, and the ...",1
5,7481,What does the Latin phrase 'e pluribus unum' m...,miscellaneous,"['What a crazy life', 'In God we trust', 'from...",2
6,3108,A stock solution of 12.0 M sulfuric acid is ma...,high_school_chemistry,"['Add 33.3 mL of water to the flask, and then ...",3
7,568,Hydrogen ions are formed when:,clinical_knowledge,"['glycogen becomes depleted.', 'phosphocreatin...",3
8,11579,A law school advertised in various law journal...,professional_law,['parol evidence is admissible to show that th...,3
9,6741,Which of the following describes the fallacy o...,logical_fallacies,['attacking the character or background of som...,2


In [None]:
prompt_list=create_formatted_prompts(df)
print(prompt_list)

[[{'role': 'user', 'content': 'this question is from elementary_mathematics\n    Write the prime factorization of 77. Use exponents where possible.\n    A. 7 • 11^2 • 13\n    B. 7^2 • 11\n    C. 7 • 11 • 13\n    D. 7 • 11\n    Respond with the correct answer from the given choices.\n'}], [{'role': 'user', 'content': 'this question is from philosophy\n    Nussbaum claims that the specification of particular virtues:\n    A. will always consist of a single answer.\n    B. might sometimes turn out to be a disjunction.\n    C. is necessarily relative to a culture.\n    D. can be settled in such a way as to be no longer revisable.\n    Respond with the correct answer from the given choices.\n'}], [{'role': 'user', 'content': "this question is from miscellaneous\n    In the film 'The Talented Mr Ripley' who plays Mr Ripley?\n    A. Jude Law\n    B. Matt Damon\n    C. Dustin Hoffman\n    D. Ben Affleck\n    Respond with the correct answer from the given choices.\n"}], [{'role': 'user', 'conte

In [29]:
# Never store a key in the notebook: reuse one already present in the environment,
# otherwise ask for it interactively so it is not saved with the cell.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: not part of the notebook's import cell
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [23]:
# Create the OpenAI client
client = OpenAI()

In [None]:
model="gpt-3.5-turbo"

# Prompting the model and storing its responses

In [42]:
def prompt_model_and_store_response(prompt_list,model):
  """Send every prompt to `model` and collect the replies.

  Args:
    prompt_list: iterable of chat message lists, one per question.
    model: model name passed to the chat completions endpoint.

  Returns:
    List with the content of the first choice of each completion, in prompt order.
    Each reply is also printed as it arrives.
  """
  def _ask(chat_messages):
    completion = client.chat.completions.create(model=model, messages=chat_messages)
    reply = completion.choices[0].message.content
    print(reply)
    return reply

  return [_ask(chat_messages) for chat_messages in prompt_list]


In [None]:
# `gpt_35_turbo_responses` is used by the cells below but was not assigned in any
# remaining cell, so create it here to keep the notebook runnable top to bottom.
gpt_35_turbo_responses = prompt_model_and_store_response(prompt_list, model)
len(gpt_35_turbo_responses)

20

# Formatting prompts for the Judge LLM

In [44]:
def formatting_prompt_judge(question, choices, subject,answer):
    """Build the user message shown to the judge model for one question.

    Args:
        question: the question text.
        choices: indexable collection of answer options.
        subject: name of the subject the question comes from.
        answer: index into `choices` of the correct option.

    Returns:
        A string that starts with a newline and contains the subject, the
        question, the text of the correct choice and the grading instruction,
        each on its own line indented by four spaces.
    """
    correct_choice = choices[answer]
    body = (
        f"this question is from {subject}",
        f"{question}",
        f"The correct answer is {correct_choice}",
        "Check the LLMs response and respond with 0 and 1 for incorrect and correct answers respectively",
    )
    return "".join(f"\n    {line}" for line in body) + "\n"

In [45]:
def create_formatted_prompts_judge(df,response_list):
  """Build the chat transcripts that the judge model grades.

  Args:
    df: DataFrame with 'question', 'choices', 'subject' and 'answer' columns.
    response_list: the evaluated model's replies, aligned row by row with `df`.

  Returns:
    One list per question holding three messages: the judge's system message,
    the user message from `formatting_prompt_judge`, and the evaluated model's
    reply as the assistant message.

  Raises:
    ValueError: if `response_list` and `df` differ in length. Answers would
      otherwise be graded against the wrong question or silently dropped.
  """
  system_message="""
  You are judgeGPT. Your objective is to correctly score the responses of other assistants.
  You would be given the question, the subject it is from, and the correct answer by the user.
  Respond with 0 or 1, to denote wrong answers and correct answers respectively. Check the index of your answer and the LLMs answer.
  Do NOT respond with anything except 0 or 1.
  You should be ethical and unbiased.
  DO NOT ANSWER THE QUESTION. GRADE THE ANSWER OF THE ASSISTANT
  """
  if len(response_list) != len(df):
      raise ValueError(
          f"response_list has {len(response_list)} entries but df has {len(df)} rows"
      )

  messages = []
  rows = zip(df['question'], df['choices'], df['subject'], df['answer'], response_list)
  for question, choices, subject, answer, response in rows:
      # A frame re-read from CSV stores the choices as the repr of a list;
      # indexing that string would return a single character.
      if isinstance(choices, str):
          choices = ast.literal_eval(choices)
      prompt = formatting_prompt_judge(question, choices, subject, answer)
      messages.append([
          {"role": "system", "content": system_message},
          {"role": "user", "content": prompt},
          {"role": "assistant", "content": response},
      ])
  return messages

In [None]:
test_prompts= create_formatted_prompts_judge(df,gpt_35_turbo_responses)

In [None]:
print(test_prompts)

[[{'role': 'system', 'content': '\n  You are judgeGPT. Your objective is to correctly score the responses of other assistants. \n  You would be given the question, the subject it is from, and the correct answer by the user.\n  Respond with 0 or 1, to denote wrong answers and correct answers respectively. \n  Do NOT respond with anything except 0 or 1.\n  You should be ethical and unbiased.\n  DO NOT ANSWER THE QUESTION. GRADE THE ANSWER OF THE ASSISTANT\n  '}, {'role': 'user', 'content': '\n    this question is from professional_law\n    A brother and a sister purchased land under a deed that conveyed title to them as joint tenants with right ofsurvivorship. Common law joint tenancy is unmodified by statute in the jurisdiction. The purchase price was $50,000, of which the sister paid $10,000 and the brother paid $40,000. The sisterlater mortgaged her interest in the land. The brother then died testate, leaving his entire estate to a cousin. Thesister later paid off her mortgage debt, a

In [None]:
len(test_prompts)

20

# Asking JudgeGPT to evaluate the answers of LLM

In [46]:
def judgements_of_judgeGPT(response_list,model):
  """Ask the judge model to grade each transcript and return integer scores.

  Args:
    response_list: iterable of chat message lists as built by
      `create_formatted_prompts_judge`.
    model: name of the judge model.

  Returns:
    A list of ints, one per transcript: the first standalone 0 or 1 found in
    the judge's reply. A reply without one is reported and scored as 0, so a
    single malformed reply no longer leaves the whole list as strings.
  """
  import re  # local import: the notebook's import cell does not bring in `re`

  judgement_list = []
  for prompt in response_list:
    judgement = client.chat.completions.create(model=model, messages=prompt)
    response_content = judgement.choices[0].message.content
    print(response_content)
    # Accept replies such as " 1\n" or "1." but not digits inside a longer number.
    verdict = re.search(r"(?<!\d)[01](?!\d)", response_content or "")
    if verdict is None:
      print(f"Could not parse a 0/1 judgement from {response_content!r}; scoring it as 0.")
      judgement_list.append(0)
    else:
      judgement_list.append(int(verdict.group()))

  return judgement_list


In [None]:
model1="gpt-4-turbo"

In [None]:
gpt_35_turbo_judgments_by_gpt4_Turbo=judgements_of_judgeGPT(test_prompts,model1)

0
0
1
1
0
1
1
0
0
1
0
1
0
1
1
1
0
1
0
0


In [None]:
print(gpt_35_turbo_judgments_by_gpt4_Turbo)

[0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]


## Adding the total score

In [47]:
def evaluate_results(judgement_list):
  """Return the total score, i.e. the sum of all entries in `judgement_list` (0 if empty)."""
  return sum(judgement_list, 0)

In [None]:
final_result=evaluate_results(gpt_35_turbo_judgments_by_gpt4_Turbo)

In [None]:
print(final_result)

10


## Everything together to compare three models

This worked here. Happy to share the results.

In [19]:
#keeping gpt-4 as the evaluator and gpt-4o, gpt-3.5-turbo and gpt-4-turbo as the models to be evaluated.
model1="gpt-4-turbo"
model2="gpt-3.5-turbo"
model3="gpt-4o"
model4="gpt-4"

In [52]:
#added sleep to get away from rate limit error

def random_dataset_to_prompting_judging_and_evaluating(model1,model2,model3,model4):
  """Run the whole benchmark: sample questions, query three models, judge and score them.

  Args:
    model1, model2, model3: names of the models to evaluate.
    model4: name of the judge model.

  Returns:
    A list of `[model_name, score]` pairs in the order model1, model2, model3,
    where score is the sum of that model's judgements.
  """
  candidates = (model1, model2, model3)

  df = subset_MMLU_test()
  prompt_list = create_formatted_prompts(df)

  # Phase 1: collect every candidate's answers, pausing after each batch
  # to stay clear of the rate limit.
  all_responses = []
  for candidate in candidates:
    all_responses.append(prompt_model_and_store_response(prompt_list, candidate))
    time.sleep(5)

  # Phase 2: wrap each batch of answers into judge transcripts.
  all_judge_prompts = [create_formatted_prompts_judge(df, batch) for batch in all_responses]

  # Phase 3: let the judge grade each batch, again pausing between batches.
  all_judgements = []
  for judge_prompts in all_judge_prompts:
    all_judgements.append(judgements_of_judgeGPT(judge_prompts, model4))
    time.sleep(5)

  scores = [evaluate_results(judgements) for judgements in all_judgements]
  return [[candidate, score] for candidate, score in zip(candidates, scores)]


In [53]:
result_list=random_dataset_to_prompting_judging_and_evaluating(model1,model2,model3,model4)


0. y - 36 = 13
0. Incision and drainage
0. internal laryngeal nerve which is the afferent limb of the cough reflex.
2. (∀x)(Rx ⊃ Ax)
2. Add a separate paragraph to the accountant's report that discloses the departure from GAAP and its effects on the financial statements.
3. The importance of the issue to a particular individual or group
0. tea
1. HC2H3O2 and KC2H3O2
0. When a police officer has made a lawful custodial arrest of an individual, he may, as a contemporaneous incident of that arrest, search the person of the individual.
1. 0.656 mol/L
1. tends to be right, although this might be outweighed by other considerations.
2. the media manipulate 'the masses' as vulnerable, passive consumers
2. inadmissible as hearsay not within any recognized exception.
0. bed sheets
0. Article III of the Construction explicitly states that the Supreme Court's appellate jurisdiction is subject to such exceptions and regulations as Congress shall make.
2. not prevail, because the owner and the lands

In [55]:
print(result_list)

[['gpt-4-turbo', 18], ['gpt-3.5-turbo', 15], ['gpt-4o', 18]]


# Configuring the agent

In [57]:
@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3), reraise=True)
def chat_completion_request(messages, tools=None, tool_choice=None, model=model4):
    """Request a chat completion, optionally offering tools to the model.

    Args:
        messages: chat history to send.
        tools: tool definitions forwarded to the API (None for no tools).
        tool_choice: forwarded as-is to the API.
        model: model name; defaults to the value of `model4` at definition time.

    Returns:
        The response object returned by `client.chat.completions.create`.

    Raises:
        Whatever the client raised on the last of the three attempts.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tools,
            tool_choice=tool_choice,
        )
        return response
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        # Re-raise instead of returning the exception: returning it made @retry
        # a no-op and callers then failed on `.choices` with an unrelated error.
        raise

In [58]:
def pretty_print_conversation(messages):
    """Print a chat history with one colour per role.

    Handles system, user, assistant, function and tool messages. Messages with
    any other role are skipped. For assistant messages the requested tool or
    function call is printed when present, otherwise the text content.

    Args:
        messages: iterable of message dicts, or SDK message objects such as the
            `assistant_message` values this notebook appends to `messages`.
    """
    role_to_color = {
        "system": "red",
        "user": "green",
        "assistant": "blue",
        "function": "magenta",
        "tool": "magenta",
    }

    for message in messages:
        if not isinstance(message, dict):
            # NOTE(review): assumes SDK messages are pydantic models exposing
            # model_dump(); falls back to the instance attributes otherwise.
            message = message.model_dump() if hasattr(message, "model_dump") else vars(message)

        role = message["role"]
        if role in ("system", "user"):
            text = f"{role}: {message['content']}"
        elif role == "assistant":
            # `tool_calls` is what the `tools` API returns; `function_call` is the legacy field.
            call = message.get("tool_calls") or message.get("function_call")
            text = f"assistant: {call if call else message['content']}"
        elif role == "function":
            text = f"function ({message['name']}): {message['content']}"
        elif role == "tool":
            label = message.get("name") or message.get("tool_call_id")
            text = f"tool ({label}): {message['content']}"
        else:
            continue
        print(colored(f"{text}\n", role_to_color[role]))


# Agent function calling

These are some examples. The function calling was originally done with the deprecated `functions` parameter and was later changed to use the current `tools` parameter instead.

The old parameter kept producing a few errors that I could not diagnose, so from here on I use the new `tools` parameter.

In [59]:
# Tool (function-calling) schemas offered to the agent, one per notebook function.
# Every name in a "required" list must be a key of the matching "properties",
# and the property names must match the Python parameter names.
tools = [
    {
        "type": "function",
        "function": {
          "name": "subset_MMLU_test",
          "description": """Use this function to subset the mmlu""",
          "parameters": {
              "type": "object",
              "properties": {
                  "num": {
                      # The value is a question count, so declare it as an integer.
                      "type": "integer",
                      "description": """
                              consists of the integer value to select the number of samples from the MMLU. Assume 20 if not mentioned.
                              """,
                  }
              },
              "required": ["num"],
          },
      },
    },

    {
        "type": "function",
        "function": {
          "name": "create_formatted_prompts",
          "description": """Use this function to format the dataset prompts.
          You should NEVER call this function before subset_MMLU_test has been called in the conversation.""",
          "parameters": {
              "type": "object",
              "properties": {
                  "dataframe": {
                      "type": "string",
                      "description": """
                              Dataframe containing the subset MMLU data. Call file 'MMLU_random_20' if name not mentioned.
                              """,
                  }
              },
              "required": ["dataframe"],
          },
      },
    },

    {
        "type": "function",
        "function": {
          "name": "prompt_model_and_store_response",
          # Previously a copy of the judgeGPT description; this tool queries the model under evaluation.
          "description": """Use this function to prompt a model with the formatted prompts and store its responses.
          You should NEVER call this function before subset_MMLU_test, and create_formatted_prompts has been called in the conversation.""",
          "parameters": {
              "type": "object",
              "properties": {
                  "model": {
                      "type": "string",
                      "enum": ["gpt-3.5-turbo", "gpt-4","gpt-4o","gpt-4-turbo"],
                      "description": """
                              Name of the gpt model to call. Infer this from the user's prompt.
                              """,
                  },
                  "prompt_list": {
                      "type": "string",
                      "description": """
                        The list of formatted prompts. Infer the name of the prompt_list from the user's prompt.
                      """
                  }
              },
              "required": ["model", "prompt_list"],
          },
      },
    },

    {
        "type": "function",
        "function": {
          "name": "create_formatted_prompts_judge",
        "description": """Use this function to format the dataset prompts for judge.
        You should NEVER call this function before subset_MMLU_test has been called in the conversation.""",
        "parameters": {
            "type": "object",
            "properties": {
                "dataframe": {
                    "type": "string",
                    "description": """
                            MMLU subset dataset. Infer this information from the user.
                            """,
                },
                "response_list": {
                    "type": "string",
                    "description": """
                            Responses given by the LLM which need to be evaluated.
                            """,
                }
            },
            "required": ["dataframe","response_list"],
          },
      },
    },

    {
        "type": "function",
        "function": {
          "name": "judgements_of_judgeGPT",
          "description": """Use this function to get the judgements from judgeGPT.
          You should NEVER call this function before subset_MMLU_test, create_formatted_prompts_judge, and create_formatted_prompts has been called in the conversation.""",
          "parameters": {
              "type": "object",
              "properties": {
                  "response_list": {
                      "type": "string",
                      "description": """
                              the response of the model that needs to be judged.
                              """,
                  },
                  "model": {
                      "type": "string",
                      "enum": ["gpt-3.5-turbo", "gpt-4","gpt-4o","gpt-4-turbo"],
                      "description": """
                              Name of the gpt model to call. Infer this from the user's prompt.
                              """,
                  },
              },
              # judgements_of_judgeGPT(response_list, model) has no default for `model`.
              "required": ["response_list", "model"],
          },
      },
    },

    {
        "type": "function",
        "function": {
          "name": "evaluate_results",
        "description": """Use this function to score the judgements from judgeGPT.
        You should NEVER call this function before subset_MMLU_test, create_formatted_prompts_judge, create_formatted_prompts, and judgements_of_judgeGPT has been called in the conversation.""",
        "parameters": {
            "type": "object",
            "properties": {
                "judgement_list": {
                    "type": "string",
                    "description": """
                            This is the judgement list that needs to be scored. Infer this from the user.
                            """,
                }
            },
            "required": ["judgement_list"],
        },
      },
    },

    {
        "type": "function",
        "function": {
          "name": "random_dataset_to_prompting_judging_and_evaluating",
        # NOTE(review): the second sentence contradicts the first, since this tool
        # runs the whole pipeline itself; left unchanged pending confirmation.
        "description": """Use this function to do everything from subsetting MMLU to prompting assistants to judging responses and giving the final result.
        You should NEVER call this function before subset_MMLU_test, create_formatted_prompts_judge, create_formatted_prompts, judgements_of_judgeGPT and evaluate_results has been called in the conversation.""",
        "parameters": {
            "type": "object",
            "properties": {
                # The original declared "model1" twice (the later entry silently
                # replaced the earlier one) and labelled "model2" as the 1st model.
                "model1": {
                    "type": "string",
                    "enum": ["gpt-3.5-turbo", "gpt-4","gpt-4o","gpt-4-turbo"],
                    "description": """
                            1st model to be evaluated.
                            """,
                },
                "model2": {
                    "type": "string",
                    "enum": ["gpt-3.5-turbo", "gpt-4","gpt-4o","gpt-4-turbo"],
                    "description": """
                            2nd model to be evaluated.
                            """,
                },
                "model3": {
                    "type": "string",
                    "enum": ["gpt-3.5-turbo", "gpt-4","gpt-4o","gpt-4-turbo"],
                    "description": """
                            3rd model to be evaluated.
                            """,
                },
                "model4": {
                    "type": "string",
                    "enum": ["gpt-3.5-turbo", "gpt-4","gpt-4o","gpt-4-turbo"],
                    "description": """
                            The judge LLM.
                            """,
                }
            },
            # Was ["judgement_list"], which is not a parameter of this function.
            "required": ["model1", "model2", "model3", "model4"],
        },
      },
    },

]

## Examples of assistant understanding which function to call and their arguments

This was very interesting to me, as it showed that agentic behaviour can be elicited from the model. I could make the full loop work with access to more OpenAI credits.

In [None]:
messages = []
messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
messages.append({"role": "user", "content": "Subset the MMLU dataset"})
chat_response = chat_completion_request(
    messages, tools=tools
)
assistant_message = chat_response.choices[0].message
messages.append(assistant_message)
assistant_message


ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_fXhmifIJ3WIykAqSGd1r4ovy', function=Function(arguments='{"num":"20"}', name='subset_MMLU_test'), type='function')])

In [None]:
messages = []
messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
messages.append({"role": "user", "content": "Format the dataframe to send to model. Hint: The dataframe argument is df"})
chat_response = chat_completion_request(
    messages, tools=tools
)
assistant_message = chat_response.choices[0].message
messages.append(assistant_message)
assistant_message

ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_XJgJ87pGIgnWQEYwdhxyWwV1', function=Function(arguments='{"dataframe":"df"}', name='create_formatted_prompts'), type='function')])

In [None]:
messages = []
messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
messages.append({"role": "user", "content": "prompt gpt-3.5-turbo on list 'updated_response' and store the response"})
chat_response = chat_completion_request(
    messages, tools=tools, tool_choice={"type": "function", "function": {"name": "prompt_model_and_store_response"}}
)
assistant_message = chat_response.choices[0].message
messages.append(assistant_message)
assistant_message

ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_PeoEK0fAmSzuANb5uRAIBN9L', function=Function(arguments='{"model":"gpt-3.5-turbo","prompt_list":"updated_response"}', name='prompt_model_and_store_response'), type='function')])

In [None]:
messages = []
messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
messages.append({"role": "user", "content": "Format the prompts of the judge. Use the dataframe df and 'gpt_4_responses' as the response_list "})
chat_response = chat_completion_request(
    messages, tools=tools,
)
assistant_message = chat_response.choices[0].message
messages.append(assistant_message)
assistant_message

ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_hj4FpQqURLYpbOohDooN7gWF', function=Function(arguments='{"dataframe":"df","response_list":"gpt_4_responses"}', name='create_formatted_prompts_judge'), type='function')])

In [None]:
messages = []
messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
messages.append({"role": "user", "content": "use gpt-4o to evaluate the responses present in 'list_of_responses'."})
chat_response = chat_completion_request(
    messages, tools=tools,
)
assistant_message = chat_response.choices[0].message
messages.append(assistant_message)
assistant_message

ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_e4WiQW3piJMQ5FzlVaiUaUsc', function=Function(arguments='{"response_list":"list_of_responses","model":"gpt-4o"}', name='judgements_of_judgeGPT'), type='function')])

In [None]:
messages = []
messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
messages.append({"role": "user", "content": "Score the judgements based on judgement_list 'score_gpt4'."})
chat_response = chat_completion_request(
    messages, tools=tools,
)
assistant_message = chat_response.choices[0].message
messages.append(assistant_message)
assistant_message

ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0PcDvwO3O04FgjhpXeQuUmXF', function=Function(arguments='{"judgement_list":"score_gpt4"}', name='evaluate_results'), type='function')])

# Final_results

In [60]:
messages = []
messages.append({"role": "system", "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."})
messages.append({"role": "user", "content": "subset 20 questions and evaluate the performance of gpt-4, gpt-4-turbo, and gpt-3.5 using gpt-4o as the judge."})
chat_response = chat_completion_request(
    messages, tools=tools,
)
assistant_message = chat_response.choices[0].message
messages.append(assistant_message)
assistant_message

ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_NqDEQV7Alp0eLE3NSdU5t4K8', function=Function(arguments='{\n"model1": "gpt-4",\n"model2": "gpt-4-turbo",\n"model3": "gpt-3.5-turbo",\n"model4": "gpt-4o"\n}', name='random_dataset_to_prompting_judging_and_evaluating'), type='function')])

In [65]:
tool_calls = assistant_message.tool_calls
if tool_calls:
    # If true the model will return the name of the tool / function to call and the argument(s)
    tool_call = tool_calls[0]
    tool_call_id = tool_call.id
    tool_function_name = tool_call.function.name
    # The arguments are model-generated JSON text: parse them as data.
    # eval() would execute whatever expression the model produced.
    tool_query_string = json.loads(tool_call.function.arguments)

    # Step 3: Call the function and retrieve results. Append the results to the messages list.
    if tool_function_name == 'random_dataset_to_prompting_judging_and_evaluating':
        model1 = tool_query_string.get('model1')
        model2 = tool_query_string.get('model2')
        model3 = tool_query_string.get('model3')
        model4 = tool_query_string.get('model4')

        results = random_dataset_to_prompting_judging_and_evaluating(model1, model2, model3, model4)
        # A reply to a `tool_calls` request uses the `tool` role and carries the id
        # of the call it answers; the `function` role belongs to the deprecated API.
        messages.append({
            'role': 'tool',
            'tool_call_id': tool_call_id,
            'name': tool_function_name,
            'content': str(results)
        })

1. By MMR vaccine
1. public issues
1. 3%
3. do nothing because the newspaper is self-authenticating.
3. the soul
3. Ownership of processed data and costs of data migrations.
1. Psychologists are prohibited from having sexual intimacies with former therapy clients for at least one year following the termination of therapy.
1. proportional
0. the general standard of living was high.
1. Wrong, Not wrong
3. Not wrong, Not wrong
3. Not wrong, Not wrong
3. A higher rate of inflation relative to other nations.
2. magic carpet
2. not prevail, because the carpenter was not engaged in the sale of the mirror.
3. Alcohol
3. Does the author use personal connections and stories to appeal to the audience's emotions?
1. Timing the pulsation with the radial pulse
0. Oration on the Dignity of Man
0. temporary dipoles created by the position of electrons around the nuclei in a molecule
1. By MMR vaccine
1. public issues
1. 3%
3. do nothing because the newspaper is self-authenticating.
3. the soul
3. Owne

In [66]:
print(results)

[['gpt-4', 15], ['gpt-4-turbo', 16], ['gpt-3.5-turbo', 11]]


In [63]:
# Parse the model-generated JSON arguments as data rather than executing them with eval().
print(json.loads(tool_calls[0].function.arguments))
{'model1': 'gpt-4', 'model2': 'gpt-4-turbo', 'model3': 'gpt-3.5-turbo', 'model4': 'gpt-4o'}

{'model1': 'gpt-4', 'model2': 'gpt-4-turbo', 'model3': 'gpt-3.5-turbo', 'model4': 'gpt-4o'}


### That's all that I could do in 7-8 hours. Thank you so much for seeing my project. I had a lot of fun.