In [1]:
import pandas as pd
import json
import altair as alt
import vl_convert as vlc
from collections import defaultdict
from pprint import pprint

In [2]:
def get_test_acc(string):
    """Return the test accuracy found at the end of a log line.

    The line looks like "Test accuracy of best hypothesis:  0.82\n"; the
    last whitespace-separated token is converted to a float.
    """
    tokens = string.split()
    return float(tokens[-1])

def get_max_train_acc(string):
    """Return the best-hypothesis training accuracy found at the end of a log line.

    The line looks like "Training accuracy of best hypothesis:  0.96\n"; only
    the final whitespace-separated token is parsed.
    """
    # Split once from the right: the trailing token is all that is needed.
    last_token = string.rsplit(None, 1)[-1]
    return float(last_token)

def get_train_acc_list(string, num_hypothesis):
    """Parse the per-hypothesis training accuracies from a log line.

    The line looks like "Training accuracies:  [0.64, 0.84, 0.76, 0.48, 0.96]\n".
    The bracketed list is parsed directly, so the result does not depend on
    how the list is spaced (e.g. "[0.64,0.84]" works as well).

    Args:
        string: the log line containing a bracketed, comma-separated list.
        num_hypothesis: how many accuracies to return; the last
            `num_hypothesis` entries of the list are used.

    Returns:
        A list of `num_hypothesis` floats (empty when `num_hypothesis` is 0).

    Raises:
        ValueError: if the line has no bracketed list, an entry is not a
            number, `num_hypothesis` is negative, or the list holds fewer
            than `num_hypothesis` values.
    """
    if num_hypothesis < 0:
        raise ValueError(f"num_hypothesis must be non-negative, got {num_hypothesis}")

    open_idx = string.rfind('[')
    close_idx = string.rfind(']')
    if open_idx == -1 or close_idx < open_idx:
        raise ValueError(f"no bracketed accuracy list found in line: {string!r}")

    # Skip empty pieces so "[]" and a trailing comma are tolerated.
    pieces = string[open_idx + 1:close_idx].split(',')
    accuracies = [float(piece) for piece in pieces if piece.strip()]

    if len(accuracies) < num_hypothesis:
        raise ValueError(
            f"expected at least {num_hypothesis} accuracies, found {len(accuracies)}"
        )
    if num_hypothesis == 0:
        # A plain [-0:] slice would return the whole list instead of nothing.
        return []
    return accuracies[-num_hypothesis:]

In [3]:
# Collect the data for the plots.
# Each run log is read for its last three lines, which are parsed in this order:
#   [0] test accuracy of the best hypothesis
#   [1] training accuracy of the best hypothesis
#   [2] the list of training accuracies of all hypotheses
results_directory = "/data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/binary_original_sst/select_and_evaluate_best_hypothesis/"
models = ['claude_2', 'llama_2_7b', 'turbo35_0613']
train_size = [1, 3, 6, 12, 25, 50, 100]
seeds = [49, 50, 51]
num_hypothesis = 5

# Placeholder recorded when a (model, size) configuration cannot be loaded.
# It keeps every series the same length as `train_size` for the plotting cells.
MISSING_ACC = 0

# data[model][metric] is a list of (train size, accuracy averaged over seeds)
data = {model: defaultdict(list) for model in models}

for model in models:
    for size in train_size:
        test_acc_list = []
        max_train_acc_list = []
        mean_train_acc_list = []
        try:
            for seed in seeds:
                file_path = f"{results_directory}{model}_train{size}_seed{seed}_hypothesis{num_hypothesis}.txt"

                # get the last three lines of the file
                with open(file_path, 'r') as f:
                    last_lines = f.readlines()[-3:]

                # get accuracy numbers
                test_acc = get_test_acc(last_lines[0])
                max_train_acc = get_max_train_acc(last_lines[1])
                train_acc_list = get_train_acc_list(last_lines[2], num_hypothesis)
                mean_train_acc = sum(train_acc_list) / len(train_acc_list)

                # add to lists
                test_acc_list.append(test_acc)
                max_train_acc_list.append(max_train_acc)
                mean_train_acc_list.append(mean_train_acc)

            # compute the average acc across seeds
            avg_test_acc = sum(test_acc_list) / len(test_acc_list)
            avg_max_train_acc = sum(max_train_acc_list) / len(max_train_acc_list)
            avg_mean_train_acc = sum(mean_train_acc_list) / len(mean_train_acc_list)
        except (OSError, IndexError, ValueError, ZeroDivisionError) as err:
            # Missing or malformed logs are expected for unfinished runs; report
            # them instead of silently plotting a fake accuracy. Anything else
            # (e.g. KeyboardInterrupt, programming errors) is allowed to propagate.
            print(
                f"WARNING: could not load results for model={model}, train size={size} "
                f"({type(err).__name__}: {err}); recording {MISSING_ACC} as placeholder"
            )
            avg_test_acc = avg_max_train_acc = avg_mean_train_acc = MISSING_ACC

        # add to data
        data[model]['avg_test_acc'].append((size, avg_test_acc))
        data[model]['avg_max_train_acc'].append((size, avg_max_train_acc))
        data[model]['avg_mean_train_acc'].append((size, avg_mean_train_acc))

In [4]:
# Plot with Altair: average test accuracy (y-axis) against the number of
# training examples (x-axis), one line per model, distinguished by color.

def _test_acc_frame(model_name):
    """Return a long-format frame of average test accuracy per train size for one model."""
    return pd.DataFrame({
        'training examples': train_size,
        'accuracy': [t[1] for t in data[model_name]['avg_test_acc']],
        'type': 'avg_test_acc',
        'model': model_name,
    })

claude_test_acc = _test_acc_frame('claude_2')
llama_test_acc = _test_acc_frame('llama_2_7b')
turbo_test_acc = _test_acc_frame('turbo35_0613')

# Concatenate the data into a single DataFrame
plot_data = pd.concat(
    [llama_test_acc, claude_test_acc, turbo_test_acc],
    ignore_index=True,
)

# Create Altair chart with color encoding.
# configure_title() returns a new chart rather than modifying in place, so it
# must be part of the chain (or re-assigned) for the title styling to apply.
chart = alt.Chart(plot_data).mark_line().encode(
        x='training examples',
        y='accuracy',
        color=alt.Color('model:N', scale=alt.Scale(
            domain=['claude_2', 'llama_2_7b', 'turbo35_0613'],
            range=['#648FFF', '#DC267F', '#FFB000']
        )),
    ).properties(
        title='Accuracy in Low Data Regime (Batched Learning, SST-2)',
    ).configure_title(
        fontSize=20,
        font='Courier',
        anchor='start',
        color='gray'
    )

# Display the chart
chart

### Compare different hypotheses at inference time (inference model = ChatGPT)

In [5]:
# Average test accuracy per train size for the 'turbo35_0613' entries of `data`,
# labelled by the hypothesis source for the comparison plot.
chatgpt_hypothesis_test_acc = pd.DataFrame({
    'training examples': train_size,
    'accuracy': [acc for _, acc in data['turbo35_0613']['avg_test_acc']],
    'hypothesis': 'turbo35_0613',
})

In [11]:
# Baseline: test accuracy without any hypothesis, averaged over seeds.
# The accuracy is parsed from the final line of each seed's log.
results_folder = "/data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/binary_original_sst/no_hypothesis_zero_shot/"
test_acc = []
for seed in seeds:
    log_path = f"{results_folder}/inference_with_turbo35_0613_seed{seed}.txt"
    with open(log_path, 'r') as f:
        lines = f.readlines()
    test_acc.append(get_test_acc(lines[-1]))
average_test_acc = sum(test_acc) / len(test_acc)
print('no hypothesis test acc: ', average_test_acc)

# The baseline does not depend on the train size, so the same value is
# repeated for every size to draw a flat reference line.
no_hypothesis_test_acc = pd.DataFrame({
    'training examples': train_size,
    'accuracy': [average_test_acc] * len(train_size),
    'hypothesis': 'none',
})

no hypothesis test acc:  0.8866666666666667


In [12]:
# Test accuracy for the claude_2 hypotheses, read from the
# "inference_with_turbo35_0613" results folder and averaged over seeds.
results_folder = "/data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/binary_original_sst/select_and_evaluate_best_hypothesis/inference_with_turbo35_0613"
model = 'claude_2'
num_hypothesis=5
average_test_acc = []
for size in train_size:
    test_acc = []
    for seed in seeds:
        log_path = f"{results_folder}/{model}_train{size}_seed{seed}_hypothesis{num_hypothesis}.txt"
        with open(log_path, 'r') as f:
            lines = f.readlines()
        # the test accuracy is parsed from the third line from the end
        test_acc.append(get_test_acc(lines[-3]))

    seed_mean = sum(test_acc) / len(test_acc)
    average_test_acc.append((size, seed_mean))

claude_hypothesis_test_acc = pd.DataFrame({
    'training examples': train_size,
    'accuracy': [acc for _, acc in average_test_acc],
    'hypothesis': 'claude_2',
})

In [13]:
# Print each hypothesis-comparison frame as plain text, prefixed with its name.
for label, frame in (
    ('chatgpt_hypothesis_test_acc: ', chatgpt_hypothesis_test_acc),
    ('no_hypothesis_test_acc: ', no_hypothesis_test_acc),
    ('claude_hypothesis_test_acc: ', claude_hypothesis_test_acc),
):
    print(label, frame)

chatgpt_hypothesis_test_acc:     training examples  accuracy    hypothesis
0                  1  0.813333  turbo35_0613
1                  3  0.863333  turbo35_0613
2                  6  0.886667  turbo35_0613
3                 12  0.840000  turbo35_0613
4                 25  0.903333  turbo35_0613
5                 50  0.863333  turbo35_0613
6                100  0.893333  turbo35_0613
no_hypothesis_test_acc:     training examples  accuracy hypothesis
0                  1  0.886667       none
1                  3  0.886667       none
2                  6  0.886667       none
3                 12  0.886667       none
4                 25  0.886667       none
5                 50  0.886667       none
6                100  0.886667       none
claude_hypothesis_test_acc:     training examples  accuracy hypothesis
0                  1  0.810000   claude_2
1                  3  0.900000   claude_2
2                  6  0.903333   claude_2
3                 12  0.886667   claude_2
4         

In [21]:
# make the plot
plot_data = pd.concat([
                    chatgpt_hypothesis_test_acc,
                    no_hypothesis_test_acc,
                    claude_hypothesis_test_acc
                  ], 
                  ignore_index=True)

# Create Altair chart with color encoding
chart = alt.Chart(plot_data).mark_line().encode(
        x='training examples',
        y='accuracy',
        color=alt.Color('hypothesis:N', scale=alt.Scale(
            domain=['turbo35_0613', 'none', 'claude_2'],
            range=['#648FFF', '#DC267F', '#FFB000']
        )),
    ).properties(
        title='Batched Learning, SST-2, Chatgpt Inference with Different Hypotheses',
    )

chart.configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
)

# Display the chart
chart

In [20]:
# Scale the plot to show only the relevant part.
# Expects `plot_data` to be the hypothesis-comparison frame (it must contain
# the 'training examples', 'accuracy' and 'hypothesis' columns).
# configure_title() returns a new chart rather than modifying in place, so it
# must be part of the chain (or re-assigned) for the title styling to apply.
chart = alt.Chart(plot_data).mark_line().encode(
        x=alt.X('training examples', scale=alt.Scale(domain=[1, 100])),
        y=alt.Y('accuracy', scale=alt.Scale(domain=[0.78, 0.94])),
        color=alt.Color('hypothesis:N', scale=alt.Scale(
            domain=['turbo35_0613', 'none', 'claude_2'],
            range=['#648FFF', '#DC267F', '#FFB000']
        )),
    ).properties(
        title='Batched Learning, SST-2, Chatgpt Inference with Different Hypotheses',
    ).configure_title(
        fontSize=20,
        font='Courier',
        anchor='start',
        color='gray'
    )

# Display the chart
chart