In [1]:
import functools
import multiprocessing.pool

import numpy as np
import openai
import pandas as pd
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random
)
from tqdm import tqdm

In [None]:
from oa_completions import get_completion

In [2]:
# Credentials are read from the environment instead of being embedded in the
# notebook: the client falls back to OPENAI_API_KEY for the key and
# OPENAI_ORG_ID for the organization. Export both before starting the kernel.
# Any key that has ever been stored in plain text in a notebook should be
# treated as leaked and rotated.
client = openai.OpenAI()

### API Call Functions

In [3]:
def timeout(max_timeout):
    """Build a decorator that bounds how long a call may block its caller.

    The decorated function runs on the worker thread of a single-use
    ``ThreadPool`` while the caller waits at most ``max_timeout`` seconds.

    Args:
        max_timeout: Maximum number of seconds to wait for the result.

    Returns:
        A decorator. A decorated function returns its normal result,
        re-raises any exception the original function raised, or raises
        ``multiprocessing.TimeoutError`` (not the builtin ``TimeoutError``)
        when the result is not ready in time.

    Note:
        Threads cannot be killed, so a call that times out keeps running in
        the background until it returns on its own; only the caller is
        unblocked.
    """
    def timeout_decorator(item):
        """Wrap ``item`` so that waiting for it is bounded by ``max_timeout``."""
        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            """Run ``item`` on a worker thread and wait for its result."""
            # Leaving the ``with`` block terminates the pool, so its handler
            # threads are shut down after every call instead of lingering
            # (and accumulating across retries) when a call times out.
            with multiprocessing.pool.ThreadPool(processes=1) as pool:
                async_result = pool.apply_async(item, args, kwargs)
                # raises multiprocessing.TimeoutError if execution exceeds max_timeout
                return async_result.get(max_timeout)
        return func_wrapper
    return timeout_decorator

In [4]:
# Retry failed calls after a random 10-20 s pause, but give up after a bounded
# number of attempts: without a stop condition a permanent failure (bad
# credentials, invalid request) would be retried forever and `reraise=True`
# could never surface the underlying error.
@retry(
    wait=wait_random(min=10, max=20),
    stop=stop_after_attempt(10),
    reraise=True,
)
@timeout(600)
def get_completion(messages, max_tokens, model="gpt-4"):
    """Request a chat completion from the OpenAI API.

    Sampling is pinned with ``temperature=0`` and ``top_p=0`` and no
    frequency/presence penalties. Each attempt may wait up to 600 seconds
    (see the ``timeout`` decorator) before it counts as failed.

    Args:
        messages: Chat messages (dicts with ``role`` and ``content``) passed
            through to the API unchanged.
        max_tokens: Upper bound on the number of generated tokens.
        model: Name of the chat model to query.

    Returns:
        The response object returned by ``client.chat.completions.create``.

    Raises:
        The exception of the final attempt once all retries are exhausted.
    """
    print(f"Calling API with {model}")
    return client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=max_tokens,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
    )

### Evaluation

In [6]:
def evaluate_model(data, model="gpt-4"):
    """Classify every row of ``data`` with ``model`` and count exact hits.

    Each row's ``prompt`` is sent as the user message after a fixed system
    prompt, and the single-token reply is compared with the row's ``label``:
    a reply of exactly ``"True"`` matches a truthy label and exactly
    ``"False"`` matches a falsy one. Any other reply counts as a miss.

    Args:
        data: DataFrame with the columns ``prompt``, ``label``, ``type``,
            ``word_length`` and ``corruptions``.
        model: Model name forwarded to ``get_completion``.

    Returns:
        A ``(results, acc)`` tuple: ``results`` is a list with one dict per
        row (input, correct label, model label and the row's metadata) and
        ``acc`` is the number of correctly classified rows.
    """
    # The system prompt never changes between rows, so define it once.
    system_prompt = """Your task is to label inputs as true or false according to whether they satisfy a classification rule. You will be given example inputs with correct labels. Inputs are pairs of strings with only alphabetic characters. Example inputs are "banana car", "abcd jhgf", and "house ifif".  They are all labeled according to the same rule."""

    n_correct = 0
    records = []
    for _, example in tqdm(data.iterrows(), total=data.shape[0]):
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": example["prompt"]},
        ]
        response = get_completion(conversation, max_tokens=1, model=model)
        predicted = response.choices[0].message.content

        # Only the literal strings "True"/"False" can score a hit.
        if (predicted == "True" and example["label"]) or (
            predicted == "False" and not example["label"]
        ):
            n_correct += 1

        records.append(
            {
                "input": example["prompt"],
                "correct_label": example["label"],
                "model_label": predicted,
                "type": example["type"],
                "word_length": example["word_length"],
                "corruptions": example["corruptions"],
            }
        )

    return records, n_correct

##### GPT-4 evaluation

In [None]:
# Run GPT-4 on the three length-8 anagram evaluation files (suffixes 01-03).
# For each file, record the number of correct predictions returned by
# evaluate_model and save the per-example results as a CSV under data/.
# NOTE: every row triggers an API call, so re-running this cell is slow and billable.
gpt4_accs = []  # correct-prediction counts, one entry per dataset
for n_dataset in range(1,4):
    eval_data = pd.read_csv(f"data/anagram_eval_data_len_8_0{n_dataset}.csv")
    results, acc = evaluate_model(eval_data, model="gpt-4")
    gpt4_accs.append(acc)
    results_df = pd.DataFrame(results)
    results_df.to_csv(f"data/anagram_gpt-4_results_0{n_dataset}.csv", index=False)

In [21]:
# Correct-prediction counts for GPT-4 on datasets 01-03, in order.
gpt4_accs

[189, 190, 185]

In [18]:
# Mean GPT-4 accuracy over the three runs. The divisor 200 is presumably the
# number of rows in each evaluation file - TODO confirm against the CSVs.
np.mean(gpt4_accs) / 200

0.94

In [20]:
# Spread of GPT-4 accuracy across the three runs. np.std defaults to ddof=0,
# i.e. the population standard deviation. The divisor 200 is presumably the
# number of rows in each evaluation file - TODO confirm.
np.std(gpt4_accs) / 200

0.010801234497346435

##### GPT-3.5-Turbo evaluation

In [None]:
# Run GPT-3.5-Turbo on the same three length-8 anagram evaluation files
# (suffixes 01-03). For each file, record the number of correct predictions
# returned by evaluate_model and save the per-example results as a CSV under data/.
# NOTE: every row triggers an API call, so re-running this cell is slow and billable.
turbo_accs = []  # correct-prediction counts, one entry per dataset
for n_dataset in range(1,4):
    eval_data = pd.read_csv(f"data/anagram_eval_data_len_8_0{n_dataset}.csv")
    results, acc = evaluate_model(eval_data, model="gpt-3.5-turbo")
    turbo_accs.append(acc)
    results_df = pd.DataFrame(results)
    results_df.to_csv(f"data/anagram_turbo_results_0{n_dataset}.csv", index=False)

In [25]:
# Mean GPT-3.5-Turbo accuracy over the three runs. The divisor 200 is
# presumably the number of rows in each evaluation file - TODO confirm.
np.mean(turbo_accs) / 200

0.695

In [26]:
# Spread of GPT-3.5-Turbo accuracy across the three runs. np.std defaults to
# ddof=0, i.e. the population standard deviation. The divisor 200 is
# presumably the number of rows in each evaluation file - TODO confirm.
np.std(turbo_accs) / 200

0.03188521078284832

### Experiments with biased labels (GPT-4 only)

In [None]:
# Evaluate GPT-4 on the label-biased variants: 3 datasets x 5 bias levels.
# Correct-prediction counts are kept in `biased_accs`, keyed by
# (n_dataset, bias_lvl), so they stay available after the loop instead of
# existing only in the printed output. Per-example results are written to one
# CSV per (bias level, dataset) pair under data/.
biased_accs = {}
for n_dataset in range(1, 4):
    for bias_lvl in range(1, 6):
        eval_data = pd.read_csv(f"data/anagram_eval_biased_{bias_lvl}_0{n_dataset}.csv")
        results, acc = evaluate_model(eval_data, model="gpt-4")
        biased_accs[(n_dataset, bias_lvl)] = acc
        # Name the dataset too; otherwise every bias level prints three
        # indistinguishable lines.
        print(f"Accuracy with bias lvl {bias_lvl} (dataset 0{n_dataset}): {acc}")
        results_df = pd.DataFrame(results)
        results_df.to_csv(f"data/anagram_gpt-4_results_biased_{bias_lvl}_0{n_dataset}.csv", index=False)