In [2]:
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerTuple
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker

import numpy as np
import pandas as pd

from tqdm import tqdm

In [3]:
from oa_completions import get_completion

In [4]:
def evaluate_model(data, model="gpt-3.5-turbo", label="fruit"):
    """Ask a chat model to label every prompt in ``data`` and score the answers.

    For each row, a fixed system message plus the row's ``prompt`` (as the user
    message) is sent through ``get_completion`` with ``max_tokens=1``. The reply
    text is read from ``completion.choices[0].message.content`` and compared
    with the ground-truth column named by ``label``.

    Args:
        data: DataFrame providing the columns ``prompt``, ``rhymes``, ``fruit``
            and the column named by ``label``.
        model: Model name forwarded to ``get_completion``.
        label: Name of the ground-truth column; its values are interpreted by
            truthiness.

    Returns:
        A ``(results, acc)`` tuple. ``results`` is a list with one dict per row
        (keys ``input``, ``correct_label``, ``model_label``, ``rhymes``,
        ``fruit``), where ``model_label`` is the reply exactly as returned.
        ``acc`` is the *number* of correctly labeled rows, not a fraction.
    """
    # The instructions are the same for every row, so define the text once.
    system_prompt = """Your task is to label inputs as true or false according to whether they satisfy a classification rule. You will be given example inputs with correct labels. The inputs are all labeled according to the same rule."""

    acc = 0
    results = []
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": row["prompt"]},
        ]
        completion = get_completion(messages, max_tokens=1, model=model)
        classification = completion.choices[0].message.content

        # Score on a normalized copy so that stray whitespace or casing (or a
        # missing reply) is not silently counted as a wrong answer. Any reply
        # other than true/false is still scored as incorrect.
        answer = (classification or "").strip().lower()
        expected = bool(row[label])
        if (answer == "true" and expected) or (answer == "false" and not expected):
            acc += 1

        results.append({
            "input": row["prompt"],
            "correct_label": row[label],
            # Keep the raw reply so unexpected outputs stay visible downstream.
            "model_label": classification,
            "rhymes": row["rhymes"],
            "fruit": row["fruit"],
        })

    return results, acc

### Evaluate GPT-4

In [None]:
# Evaluate GPT-4 on the three evaluation files. Each run's per-input results are
# written to CSV; the result frames and correct-answer counts are collected.
all_gpt4_results, gpt4_accs = [], []
for n in (1, 2, 3):
    eval_data = pd.read_csv(f"data/rhyme_eval_data_0{n}.csv")
    # frac=1. keeps every row but returns them in shuffled order
    eval_data = eval_data.sample(frac=1.)
    result, acc = evaluate_model(eval_data, model="gpt-4")
    result_df = pd.DataFrame(result)
    result_df.to_csv(f"data/rhyme_result_gpt4_0{n}.csv", index=False)
    all_gpt4_results.append(result_df)
    gpt4_accs.append(acc)

In [56]:
# Mean number of correctly labeled inputs over the GPT-4 runs
np.asarray(gpt4_accs).mean()

95.0

In [57]:
# Population standard deviation (ddof=0) of the per-run correct counts
np.asarray(gpt4_accs).std()

0.0

#### Break down results by model classification and input type

In [52]:
# First GPT-4 run: number of inputs per (model_label, fruit, rhymes) combination
breakdown = (
    all_gpt4_results[0]
    .groupby(['model_label', 'fruit', 'rhymes'])
    .size()
    .reset_index(name='count')
)

# Show the table of counts
print(breakdown)

  model_label  fruit  rhymes  count
0       False      0       0     24
1       False      0       1     24
2       False      1       0      3
3        True      0       0      1
4        True      0       1      1
5        True      1       0     22
6        True      1       1     25


In [53]:
# Second GPT-4 run: number of inputs per (model_label, fruit, rhymes) combination
breakdown = (
    all_gpt4_results[1]
    .groupby(['model_label', 'fruit', 'rhymes'])
    .size()
    .reset_index(name='count')
)

# Show the table of counts
print(breakdown)

  model_label  fruit  rhymes  count
0       False      0       0     24
1       False      0       1     24
2       False      1       1      3
3        True      0       0      1
4        True      0       1      1
5        True      1       0     25
6        True      1       1     22


In [54]:
# Third GPT-4 run: number of inputs per (model_label, fruit, rhymes) combination
breakdown = (
    all_gpt4_results[2]
    .groupby(['model_label', 'fruit', 'rhymes'])
    .size()
    .reset_index(name='count')
)

# Show the table of counts
print(breakdown)

  model_label  fruit  rhymes  count
0       False      0       0     24
1       False      0       1     24
2       False      1       1      3
3        True      0       0      1
4        True      0       1      1
5        True      1       0     25
6        True      1       1     22


In [17]:
# Standard deviation of three hand-entered per-run counts.
# NOTE(review): these look like the (model_label=False, fruit=1, rhymes=0)
# counts from the three GPT-4 breakdown tables - confirm.
np.array([3, 0, 0]).std()

1.4142135623730951

### Evaluate GPT-4 with a biased few-shot prompt

In [39]:
# Load the first biased evaluation file, then shuffle its rows (frac=1. keeps all of them)
eval_data_biased = pd.read_csv("data/rhyme_eval_data_biased_4rhymes_01.csv")
eval_data_biased = eval_data_biased.sample(frac=1.)

In [None]:
# Evaluate GPT-4 on the three biased evaluation files. Each run's per-input
# results are written to CSV; the frames and correct-answer counts are collected.
all_gpt4_biased_results, gpt4_biased_accs = [], []
for n in (1, 2, 3):
    eval_data = pd.read_csv(f"data/rhyme_eval_data_biased_4rhymes_0{n}.csv")
    # frac=1. keeps every row but returns them in shuffled order
    eval_data = eval_data.sample(frac=1.)
    result, acc = evaluate_model(eval_data, model="gpt-4")
    result_df = pd.DataFrame(result)
    result_df.to_csv(f"data/rhyme_result_gpt4_biased_4rhymes_0{n}.csv", index=False)
    all_gpt4_biased_results.append(result_df)
    gpt4_biased_accs.append(acc)

In [61]:
# Mean number of correctly labeled inputs over the biased-prompt GPT-4 runs
np.asarray(gpt4_biased_accs).mean()

94.0

In [62]:
# Population standard deviation (ddof=0) of the per-run correct counts
np.asarray(gpt4_biased_accs).std()

0.816496580927726

#### Breakdown of results

In [63]:
# First biased-prompt run: number of inputs per (model_label, fruit, rhymes) combination
breakdown = (
    all_gpt4_biased_results[0]
    .groupby(['model_label', 'fruit', 'rhymes'])
    .size()
    .reset_index(name='count')
)

# Show the table of counts
print(breakdown)

  model_label  fruit  rhymes  count
0       False      0       0     24
1       False      0       1     23
2       False      1       0      1
3       False      1       1      3
4        True      0       0      1
5        True      0       1      2
6        True      1       0     24
7        True      1       1     22


In [64]:
# Second biased-prompt run: number of inputs per (model_label, fruit, rhymes) combination
breakdown = (
    all_gpt4_biased_results[1]
    .groupby(['model_label', 'fruit', 'rhymes'])
    .size()
    .reset_index(name='count')
)

# Show the table of counts
print(breakdown)

  model_label  fruit  rhymes  count
0       False      0       0     24
1       False      0       1     23
2       False      1       0      1
3       False      1       1      1
4        True      0       0      1
5        True      0       1      2
6        True      1       0     24
7        True      1       1     24


In [65]:
# Third biased-prompt run: number of inputs per (model_label, fruit, rhymes) combination
breakdown = (
    all_gpt4_biased_results[2]
    .groupby(['model_label', 'fruit', 'rhymes'])
    .size()
    .reset_index(name='count')
)

# Show the table of counts
print(breakdown)

  model_label  fruit  rhymes  count
0       False      0       0     24
1       False      0       1     24
2       False      1       0      3
3       False      1       1      1
4        True      0       0      1
5        True      0       1      1
6        True      1       0     22
7        True      1       1     24


In [18]:
# Standard deviation of three hand-entered per-run counts.
# NOTE(review): these look like the (model_label=False, fruit=1, rhymes=0)
# counts from the three biased-prompt breakdown tables - confirm.
np.array([1, 1, 3]).std()

0.9428090415820634

### Evaluate GPT-3.5-Turbo

In [None]:
# Evaluate GPT-3.5-Turbo on the three evaluation files. Each run's per-input
# results are written to CSV; the frames and correct-answer counts are collected.
all_turbo_results, turbo_accs = [], []
for n in (1, 2, 3):
    eval_data = pd.read_csv(f"data/rhyme_eval_data_0{n}.csv")
    # frac=1. keeps every row but returns them in shuffled order
    eval_data = eval_data.sample(frac=1.)
    result, acc = evaluate_model(eval_data, model="gpt-3.5-turbo")
    result_df = pd.DataFrame(result)
    result_df.to_csv(f"data/rhyme_result_turbo_0{n}.csv", index=False)
    all_turbo_results.append(result_df)
    turbo_accs.append(acc)

In [14]:
# Mean number of correctly labeled inputs over the GPT-3.5-Turbo runs
np.asarray(turbo_accs).mean()

75.66666666666667

In [13]:
# Population standard deviation (ddof=0) of the per-run correct counts
np.asarray(turbo_accs).std()

1.699673171197595

In [22]:
# Count the inputs per (model_label, fruit, rhymes) combination for the first
# GPT-3.5-Turbo run. This reads from all_turbo_results, which the evaluation
# loop fills, so the cell works on a fresh kernel.
# NOTE(review): this cell previously read an undefined `results_df`; index 0 is
# chosen to match the other breakdown cells - confirm which run was intended.
breakdown = all_turbo_results[0].groupby(['model_label', 'fruit', 'rhymes']).size().reset_index(name='count')

# Output the breakdown DataFrame
print(breakdown)

  model_label  fruit  rhymes  count
0       False      0       0     14
1       False      0       1     12
2       False      1       0      2
3        True      0       0     11
4        True      0       1     13
5        True      1       0     23
6        True      1       1     25
