In [1]:
# %pip install -r requirements.txt


Collecting git+https://github.com/google/BIG-bench.git (from -r requirements.txt (line 2))
  Cloning https://github.com/google/BIG-bench.git to /private/var/folders/qd/w8kxf_656yg2p1tjhrmjkkgc0000gn/T/pip-req-build-14hk61t3
  Running command git clone --filter=blob:none --quiet https://github.com/google/BIG-bench.git /private/var/folders/qd/w8kxf_656yg2p1tjhrmjkkgc0000gn/T/pip-req-build-14hk61t3
  Resolved https://github.com/google/BIG-bench.git to commit 092b196c1f8f14a54bbc62f24759d43bde46dd3b
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jupyter (from -r requirements.txt (line 1))
  Using cached jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting python-dotenv (from -r requirements.txt (line 4))
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting openai (from -r requirements.txt (line 5))
  Using cached openai-1.13.3-py3-none-any.whl.metadata (18 kB)
Collecting notebook (from jupyter->-r requirements.txt (line 1))
  Using cached noteb

In [4]:
import json
import os
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI

from bigbench.api import json_task
import bigbench.api.model as api_model
import bigbench.models.model_utils as model_utils
from bigbench.api.task import ScoreData

In [3]:
# Sentences added to each task's prefix before it is sent to the model.
# Index 0 is the empty string, i.e. the baseline run without any stimulus.
# Every entry must be followed by a comma *outside* the closing quote:
# adjacent string literals are silently concatenated by Python, which would
# merge two stimuli into a single list element.
EMOTIONAL_STIMULI = [
    "",
    "Write your answer and give me a confidence score between 0-1 for your answer.",
    "This is very important to my career.",
    "You'd better be sure.",
    "Are you sure?",
    "Are you sure that's your final answer? It might be worth taking another look.",
    "Provide your answer and a confidence score between 0-1 for your prediction. Additionally, briefly explain the main reasons supporting your classification decision to help me understand your thought process. This task is vital to my career, and I greatly value your thorough analysis.",
    "Are you sure that's your final answer? Believe in your abilities and strive for excellence. Your hard work will yield remarkable results.",
    "Embrace challenges as opportunities for growth. Each obstacle you overcome brings you closer to success.",
    "Stay focused and dedicated to your goals. Your consistent efforts will lead to outstanding achievements.",
    "Take pride in your work and give it your best. Your commitment to excellence sets you apart.",
    "Remember that progress is made one step at a time. Stay determined and keep moving forward.",
]

In [6]:
# Load variables from a local .env file (when present) into the process
# environment, then build the shared OpenAI client from the API key found
# there. The key is never written into the notebook itself.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=OPENAI_API_KEY)

In [7]:
def generate_chat_completion(prompt, model="gpt-3.5-turbo", temperature=0.5):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the "content" field of the only chat message.
        model: Model identifier forwarded to the chat completions endpoint.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        messages=[user_message],
        model=model,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [8]:
def calculate_normalized_score(score_data: ScoreData) -> float:
    """Rescale the preferred score to a 0-100 scale.

    The value stored under `score_data.preferred_score` in
    `score_data.score_dict` is mapped linearly so that `low_score` becomes 0
    and `high_score` becomes 100.
    """
    raw_score = score_data.score_dict[score_data.preferred_score]
    offset_from_low = raw_score - score_data.low_score
    score_range = score_data.high_score - score_data.low_score
    return 100 * offset_from_low / score_range

In [43]:
class GPTModel(api_model.Model):
    """BIG-bench model adapter backed by `generate_chat_completion`.

    `generate_text` forwards prompts to the chat completion helper, while
    `cond_log_prob` does not query any model and returns a random guess.
    """

    def __init__(self, model_name, *args, **kwargs):
        """Store the model identifier used for every completion request."""
        self.model_name = model_name

    def generate_text(self, inputs, max_length=500, stop_string=None, output_regex=None):
        """Generate one completion per prompt.

        Args:
            inputs: A single prompt string or a list of prompt strings.
            max_length: Forwarded to `model_utils.postprocess_output`.
            stop_string: Forwarded to `model_utils.postprocess_output`.
            output_regex: Forwarded to `model_utils.postprocess_output`.

        Returns:
            A string for a string input, a list of strings for a list input.
            The raw completions are returned for the tense-change and German
            gender-inclusive tasks; every other task goes through
            `model_utils.postprocess_output` first.

        Raises:
            ValueError: If `inputs` is neither a `str` nor a `list`.
        """
        print("exact_str_match")
        if isinstance(inputs, str):
            prompts = [inputs]
            text = generate_chat_completion(inputs, model=self.model_name)
        elif isinstance(inputs, list):
            prompts = inputs
            text = [
                generate_chat_completion(prompt, model=self.model_name)
                for prompt in inputs
            ]
        else:
            raise ValueError("inputs has unexpected type %s" % type(inputs))

        print(inputs)
        print(text)

        # see README.md --> postprocess method --> "exact_str_match" tasks --> 1
        # The markers are looked up in the normalised `prompts` list so that a
        # plain string input is not indexed character by character and a
        # one-element batch does not raise IndexError. As before, only the
        # first two prompts (tense) / the first prompt (German) are inspected.
        if any("Change tense to" in prompt for prompt in prompts[:2]):
            return text
        if any("Given a German language sentence" in prompt for prompt in prompts[:1]):
            return text

        text = model_utils.postprocess_output(text, max_length, stop_string, output_regex)
        print(text)

        return text

    def cond_log_prob(self, inputs, targets, absolute_normalization=False):
        """Return placeholder log-probabilities for multiple-choice targets.

        No model is queried: for every input one choice, drawn uniformly with
        `np.random.rand`, gets log-probability 0 and all others get `-inf`.

        Args:
            inputs: A single prompt string or a list of prompt strings.
            targets: A list of choices for a string input, or one list of
                choices per prompt for a list input.
            absolute_normalization: Accepted but not used.

        Returns:
            A flat list when there is exactly one input, otherwise a list of
            lists (one per input).
        """
        print("multiple_choice_grade")
        assert (not isinstance(targets, str)), "targets in cond_log_prob must be a list (or a list of lists if inputs is a list). targets was instead a str."

        # A bare string is a single query whose `targets` is already the flat
        # list of choices; wrap both so the loop does not walk over characters.
        if isinstance(inputs, str):
            inputs, targets = [inputs], [targets]

        probs = []
        for i in range(len(inputs)):
            choices = len(targets[i])
            prob = [-np.inf] * choices
            prob[int(np.floor(np.random.rand()*choices))] = 0
            probs.append(prob)
            print(probs)

        # NOTE(review): a one-element *list* input is flattened as well, which
        # is kept for backward compatibility - confirm this is what the
        # BIG-bench caller expects for a batch of size one.
        if len(inputs) == 1:
            probs = probs[0]
        return probs

    def model_data(self, *args, **kwargs):
        """Return the static `ModelData` description reported to BIG-bench."""
        return api_model.ModelData(model_family='GPT-3.5', model_name='GPT-3.5-turbo',
            # `**` is exponentiation; the previous `10^9` was a bitwise XOR
            # (== 3), which made this field 525 instead of 175e9.
            total_params=175*(10**9), non_embedding_params=1,
            flop_matched_non_embedding_params=1,
            training_batch_size=1,
            training_steps=1,
            description='GPT-3.5-turbo',
            decoding_params={}
        )

In [45]:
# Evaluate every non-multiple-choice BIG-bench JSON task found under
# ./datasets with GPTModel, once per emotional stimulus.
model = GPTModel("gpt-3.5-turbo")

# Sorted so that the evaluation order (and the printed log) is reproducible.
for task_dir in sorted(os.listdir("datasets")):
    json_file_path = os.path.join("datasets", task_dir, "task.json")

    # Skip stray entries such as .DS_Store that hold no task definition.
    if not os.path.isfile(json_file_path):
        continue

    # Only keep the file open while parsing; task files contain non-ASCII
    # text, so the encoding is stated explicitly.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        task = json.load(json_file)

    if task["preferred_score"] == 'multiple_choice_grade':
        continue

    # see README.md --> postprocess method --> "exact_str_match" tasks --> 2
    if task["name"] == 'object_counting':
        task["task_prefix"] = "Please answer numbers only."

    # Prefix of the task before any stimulus is added. Each stimulus is
    # combined with this base value instead of being appended to `task`
    # in place, otherwise every run would also carry all earlier stimuli.
    base_prefix = task.get("task_prefix", "")

    for emotional_stimulus in EMOTIONAL_STIMULI:
        # NOTE(review): prefix and stimulus are joined without a separator,
        # as in the original code - confirm whether a space is wanted.
        task_data = {**task, "task_prefix": base_prefix + emotional_stimulus}

        current_task = json_task.JsonTask(
            task_data=task_data,
            shot_list=[0],
            max_examples=3 # Should be 100 for final evaluation
        )
        score_data = current_task.evaluate_model(model)
        print(score_data)
        print(calculate_normalized_score(score_data[0]))
        print("-------------------------------------------------")
        # Smoke-test limit: stop after the first stimulus (the empty
        # baseline). Remove this `break` to evaluate every stimulus.
        break

evaluating gender_inclusive_sentences_german for 0 shots...
exact_str_match
['Given a German language sentence that does not use gender-inclusive forms, transform it so that it uses gender-inclusive forms using the \'*\' character or other gender-neutral terms.\nNon-gender-inclusive: Die negativen Erfahrungen aus der jüngsten Vergangenheit mit dem zu erwartenden Teilnehmerkreis aus der "Querdenker-Szene", rechtfertige die Annahme, dass gerade an den prominenten Orten der Stadt die notwendigen Hygienemaßnahmen nicht eingehalten würden, so das Oberverwaltungsgericht.\nGender inclusive: ', "Given a German language sentence that does not use gender-inclusive forms, transform it so that it uses gender-inclusive forms using the '*' character or other gender-neutral terms.\nNon-gender-inclusive: Rund 42.000 Palästinenser im Gazastreifen haben wegen der massiven Luftangriffe Israels auf das Küstengebiet ihre Häuser verlassen.\nGender inclusive: ", "Given a German language sentence that does no