In [170]:
# %pip install -r requirements.txt

In [171]:
import copy
import json
import os
import re

import numpy as np
from dotenv import load_dotenv
from openai import OpenAI

import bigbench.api.model as api_model
import bigbench.models.model_utils as model_utils
from bigbench.api import json_task
from bigbench.api.task import ScoreData

In [172]:
# Candidate stimulus sentences. The evaluation loop appends exactly one of
# these to a task's "task_prefix" per run and scores the task with it.
# The leading empty string adds nothing to the prefix (presumably the
# no-stimulus baseline -- confirm).
EMOTIONAL_STIMULI = [
    "",
    "This is very important to my career.",
    "This would appear in my interview tomorrow.",
    # Sentinel, not a real stimulus: the loops that walk this list `break` on
    # the first entry containing "test breakpoint", so every entry from here
    # down is skipped. Move this line to choose which stimuli are evaluated.
    "test breakpoint -------------------",
    "Write your answer and give me a confidence score between 0-1 for your answer.",
    "You'd better be sure.",
    "Are you sure?",
    "Are you sure that's your final answer? It might be worth taking another look.",
    "Provide your answer and a confidence score between 0-1 for your prediction. Additionally, briefly explain the main reasons supporting your classification decision to help me understand your thought process. This task is vital to my career, and I greatly value your thorough analysis.",
    "Are you sure that's your final answer? Believe in your abilities and strive for excellence. Your hard work will yield remarkable results.",
    "Embrace challenges as opportunities for growth. Each obstacle you overcome brings you closer to success.",
    "Stay focused and dedicated to your goals. Your consistent efforts will lead to outstanding achievements.",
    "Take pride in your work and give it your best. Your commitment to excellence sets you apart.",
    "Remember that progress is made one step at a time. Stay determined and keep moving forward.",
]

In [173]:
# Let python-dotenv populate the process environment before the key lookup.
load_dotenv()

# The key is taken from the environment only; it is never written in the notebook.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Module-level client shared by every chat completion request below.
client = OpenAI(api_key=OPENAI_API_KEY)

In [174]:
def generate_chat_completion(prompt, model="gpt-3.5-turbo", temperature=0.5):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one ``user`` message of the request.
        model: Model identifier forwarded to the chat completions call.
        temperature: Sampling temperature forwarded to the call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[user_message],
    )
    # Only the first returned choice is ever used.
    first_choice = completion.choices[0]
    return first_choice.message.content



In [175]:
def calculate_normalized_score(score_data: ScoreData) -> float:
    """Rescale the preferred metric linearly so low_score -> 0 and high_score -> 100.

    Args:
        score_data: Result object providing ``score_dict``, ``preferred_score``,
            ``low_score`` and ``high_score``.

    Returns:
        ``100 * (score - low_score) / (high_score - low_score)``. The value is
        not clamped, so scores outside [low_score, high_score] map outside
        [0, 100].
    """
    raw_score = score_data.score_dict[score_data.preferred_score]
    floor = score_data.low_score
    span = score_data.high_score - floor
    return 100 * (raw_score - floor) / span

In [176]:
# Number the verifier LLM is told to return when an answer matches no choice.
OUT_OF_DOMAIN = 1000


class GPTModel(api_model.Model):
    """BIG-bench ``Model`` adapter that answers through ``generate_chat_completion``.

    Only generated text is available here, so ``cond_log_prob`` does not return
    real log-probabilities: it maps the model's textual answer onto one target
    and reports ``0`` for that target and ``-inf`` for every other one.
    """

    def __init__(self, model_name, *args, **kwargs):
        """Remember which chat model to query; extra arguments are accepted and ignored."""
        self.model_name = model_name

    def _complete(self, inputs):
        """Return one completion per prompt, mirroring the str/list shape of ``inputs``.

        Raises:
            ValueError: if ``inputs`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(inputs, str):
            return generate_chat_completion(inputs, model=self.model_name)
        if isinstance(inputs, list):
            return [
                generate_chat_completion(prompt, model=self.model_name)
                for prompt in inputs
            ]
        raise ValueError("inputs has unexpected type %s" % type(inputs))

    @staticmethod
    def _match_exact(answer, options):
        """Return the index of the option equal to ``answer`` (case-insensitive), else None."""
        wanted = answer.strip().lower()
        for index, option in enumerate(options):
            if option.strip().lower() == wanted:
                return index
        return None

    @staticmethod
    def _parse_choice_index(reply, n_choices):
        """Extract the first integer in ``reply``; None unless it lies in [0, n_choices)."""
        found = re.search(r"-?\d+", reply or "")
        if found is None:
            return None
        number = int(found.group())
        return number if 0 <= number < n_choices else None

    def _ask_verifier(self, answer, choices):
        """Ask a second LLM call which entry of ``choices`` the free-form ``answer`` means.

        Returns:
            The index into ``choices``, or None when the verifier answers
            ``OUT_OF_DOMAIN``, an out-of-range number, or nothing parseable.
        """
        n_choices = len(choices)
        # we need chaGPT4 to fix this part !!! (or other methods to fix it)
        # Using chatGPT3.5 for budget concern first
        check_ans_prompt = 'Please help me verify the answer return by another LLM model.\n'
        check_ans_prompt += 'The answer list is ' + str(choices) + '.\n'
        check_ans_prompt += 'The first anwser in the list is labeled as the number ' + str(0) + '. '
        check_ans_prompt += 'The last anwser in the list is labeled as the number ' + str(n_choices - 1) + '.\n'
        check_ans_prompt += 'The answer return by the LLM model is "' + answer + '".\n'
        check_ans_prompt += 'Please choose the number that best represent the answer return by the LLM model. '
        check_ans_prompt += 'If the answer is not in the list, please choose the number ' + str(OUT_OF_DOMAIN) + '.\n'
        check_ans_prompt += 'You are requested to give me an integer only!'
        print(check_ans_prompt)
        check_result = generate_chat_completion(check_ans_prompt)
        print(check_result)
        return self._parse_choice_index(check_result, n_choices)

    def generate_text(self, inputs, max_length=500, stop_string=None, output_regex=None):
        """Generate a completion for one prompt (str) or for each prompt in a list.

        Args:
            inputs: A prompt string or a list of prompt strings.
            max_length, stop_string, output_regex: Forwarded to
                ``model_utils.postprocess_output``.

        Returns:
            A string for a string input, a list of strings for a list input.
            Post-processing is skipped for the prompt families named below.

        Raises:
            ValueError: if ``inputs`` is neither a ``str`` nor a ``list``.
        """
        print("exact_str_match")
        text = self._complete(inputs)

        # Inspect whole prompts: a bare string is one prompt, not a sequence of
        # characters, and a list may hold fewer than two prompts.
        probes = [inputs] if isinstance(inputs, str) else inputs[:2]

        # see README.md --> postprocess method --> "exact_str_match" tasks --> 1
        if any("Change tense to" in prompt for prompt in probes):
            return text
        if probes and "Given a German language sentence" in probes[0]:
            return text

        return model_utils.postprocess_output(text, max_length, stop_string, output_regex)

    def cond_log_prob(self, inputs, targets, absolute_normalization=False):
        """Score each target as chosen (``0``) or not chosen (``-inf``).

        The model's answer is matched against the targets exactly; if that
        fails, a verifier LLM call is asked to pick among all targets except
        the last. The last target is treated as the "give up" option (the
        driver cell appends a 0-score quit choice to every multiple-choice
        example) and is selected whenever the answer cannot be resolved.

        Args:
            inputs: A prompt string or a list of prompt strings.
            targets: A list of target strings for a string input, or a list of
                such lists for a list input.
            absolute_normalization: Accepted for interface compatibility; unused.

        Returns:
            One list per input with exactly one entry per target; a single flat
            list when ``inputs`` is a string.

        Raises:
            ValueError: if ``inputs`` has an unsupported type or the number of
                target lists differs from the number of inputs.
        """
        print("multiple_choice_grade")
        assert (not isinstance(targets, str)), "targets in cond_log_prob must be a list (or a list of lists if inputs is a list). targets was instead a str."

        single_input = isinstance(inputs, str)
        answers = self._complete(inputs)
        if single_input:
            # Normalise to the batched shape so one code path handles both.
            answers, targets = [answers], [targets]
        if len(answers) != len(targets):
            raise ValueError(
                "got %d inputs but %d target lists" % (len(answers), len(targets))
            )

        probs = []
        for answer, options in zip(answers, targets):
            options = list(options)
            if not options:
                probs.append([])
                continue

            # The completion text can be missing; treat that as an empty answer.
            answer = answer or ""
            prob = [-np.inf] * len(options)
            real_choices = options[:-1]

            chosen = self._match_exact(answer, options)
            if chosen is None and real_choices:
                chosen = self._ask_verifier(answer, real_choices)
            if chosen is None:
                # Unresolvable answer: log it and fall back to the last target.
                print(options)
                print(answer)
                chosen = len(options) - 1

            prob[chosen] = 0
            probs.append(prob)

        return probs[0] if single_input else probs

    def model_data(self, *args, **kwargs):
        """Return static metadata describing the backing model.

        All values are hard-coded; the ``1`` entries look like placeholders
        rather than measured figures.
        """
        return api_model.ModelData(model_family='GPT-3.5', model_name='GPT-3.5-turbo',
            # `**` is exponentiation; `^` is bitwise XOR and made this 175 * 3.
            total_params=175 * 10**9, non_embedding_params=1,
            flop_matched_non_embedding_params=1,
            training_batch_size=1,
            training_steps=1,
            description='GPT-3.5-turbo',
            decoding_params={}
        )

In [177]:
# Start one running total at zero for every stimulus that will be evaluated.
score_for_diff_prompts = []
for emotional_prompt_index, emotional_stimulus in enumerate(EMOTIONAL_STIMULI):
    # The sentinel entry ends the list of stimuli under test.
    if "test breakpoint" in emotional_stimulus:
        break
    score_for_diff_prompts.append(0)

In [178]:
# --- Evaluation driver -------------------------------------------------------
DATASETS_DIR = "datasets"
# Debug filter: only these task directories are evaluated. Set to None to
# evaluate every task directory found under DATASETS_DIR.
ONLY_TASKS = {"snarks"}
MAX_EXAMPLES = 20  # Should be 100 for final evaluation
# Extra 0-score choice added to every multiple-choice example.
QUIT_CHOICE = "I quit. This problem is nonsense."
CHOICE_ONLY_INSTRUCTION = (
    "Please only answer with the choice provided, any extra explanation is "
    "forbiddened. Remember the worst choice will be quiting!"
)

# Stimuli listed from the "test breakpoint" sentinel onwards are not evaluated.
active_stimuli = []
for emotional_stimulus in EMOTIONAL_STIMULI:
    if "test breakpoint" in emotional_stimulus:
        break
    active_stimuli.append(emotional_stimulus)

# Totals are rebuilt here so re-running this cell never adds onto a previous run.
score_totals = [0] * len(active_stimuli)
tasks_evaluated = 0
model = GPTModel("gpt-3.5-turbo")

# Sorted for a reproducible evaluation order (os.listdir order is arbitrary).
for task_dir in sorted(os.listdir(DATASETS_DIR)):
    # Stray files such as .DS_Store are not tasks.
    if not os.path.isdir(os.path.join(DATASETS_DIR, task_dir)):
        continue
    if ONLY_TASKS is not None and task_dir not in ONLY_TASKS:
        continue

    json_file_path = os.path.join(DATASETS_DIR, task_dir, "task.json")
    # Parse and close the file immediately instead of holding it open for the
    # whole (slow, network-bound) evaluation.
    with open(json_file_path, 'r') as json_file:
        base_task = json.load(json_file)

    # see README.md --> postprocess method --> "multiple_choice_grade" tasks
    if base_task["preferred_score"] == 'multiple_choice_grade':
        for example in base_task["examples"]:
            example["target_scores"][QUIT_CHOICE] = 0
        base_task['append_choices_to_input'] = True
        base_task["task_prefix"] = base_task.get("task_prefix", "") + CHOICE_ONLY_INSTRUCTION

    # see README.md --> postprocess method --> "exact_str_match" tasks --> 2
    if base_task["name"] == 'object_counting':
        base_task["task_prefix"] = "Please answer numbers only."

    for emotional_prompt_index, emotional_stimulus in enumerate(active_stimuli):
        # Work on a fresh copy for every stimulus: appending to a shared
        # task_prefix would stack each stimulus on top of all earlier ones,
        # and the task object may also rewrite its examples in place.
        task = copy.deepcopy(base_task)
        task["task_prefix"] = task.get("task_prefix", "") + emotional_stimulus

        current_task = json_task.JsonTask(
            task_data=task,
            shot_list=[0],
            max_examples=MAX_EXAMPLES,
        )
        score_data = current_task.evaluate_model(model)
        score_totals[emotional_prompt_index] += calculate_normalized_score(score_data[0])

    tasks_evaluated += 1

print(score_totals)
print('Total ' + str(tasks_evaluated) + ' tasks are done for each prompt.')
# final result: mean normalized score per stimulus over the tasks actually run
score_for_diff_prompts = [
    (total / tasks_evaluated) if tasks_evaluated else 0
    for total in score_totals
]
print(score_for_diff_prompts)

evaluating snarks for 0 shots...
multiple_choice_grade
Please help me verify the answer return by another LLM model.
The answer list is ['(a)', '(b)'].
The first anwser in the list is labeled as the number 0. The last anwser in the list is labeled as the number 1.
The answer return by the LLM model is "(a) We all know diamonds are worthless.".
Please choose the number that best represent the answer return by the LLM model. If the answer is not in the list, please choose the number 1000.
You are requested to give me an integer only!
0
Please help me verify the answer return by another LLM model.
The answer list is ['(a)', '(b)'].
The first anwser in the list is labeled as the number 0. The last anwser in the list is labeled as the number 1.
The answer return by the LLM model is "(b) People never foment wars to conquer territory.".
Please choose the number that best represent the answer return by the LLM model. If the answer is not in the list, please choose the number 1000.
You are requ