In [1]:
import pandas as pd
import json
import altair as alt
import vl_convert as vlc
from collections import defaultdict
from pprint import pprint

### results (batched learning)

In [2]:
def get_test_acc(string):
    """Parse the accuracy value that ends a result-log line.

    The expected input looks like "Test accuracy of best hypothesis:  0.82\n";
    the last whitespace-separated token is converted to a float.

    Raises IndexError when the line contains no tokens and ValueError when
    the last token is not a valid float literal.
    """
    # Split once from the right: only the trailing token is needed.
    trailing_token = string.rsplit(maxsplit=1)[-1]
    return float(trailing_token)

In [3]:
# Both log directories live under the same retweet debug-log root.
retweet_log_root = '/data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/retweet'

# Logs of the inference runs that use no hypothesis (baseline).
baseline_dir = f'{retweet_log_root}/no_hypothesis_inference'

# Logs of the batched inference runs.
batched_dir = f'{retweet_log_root}/batched_inference'

In [7]:
# Collect, for every generation model and training size, the test accuracy
# averaged over the seeds whose result log could be read.
results_directory = batched_dir
models = ['claude_2', 'Llama-2-7b-chat', 'turbo35_0613']
train_size = [1, 3, 6, 12, 25, 50, 100]
seeds = [49, 50, 51]
num_hypothesis = 5

# data[model]['avg_test_acc'] is a list of (train size, mean test accuracy)
# tuples, one per entry of train_size and in the same order.
data = {model: defaultdict(list) for model in models}

for model in models:
    for size in train_size:
        test_acc_list = []
        for seed in seeds:
            file_path = f"{results_directory}/inference_model_turbo35_0613_generation_model_{model}_train{size}_seed{seed}_hypothesis{num_hypothesis}.txt"
            try:
                with open(file_path, 'r') as f:
                    lines = f.readlines()

                # The test accuracy is read from the first of the last three
                # lines of the log.
                last_lines = lines[-3:]
                test_acc_list.append(get_test_acc(last_lines[0]))
            except (OSError, IndexError, ValueError) as err:
                # Missing/unreadable file, empty file, or a line that does not
                # end in a number. The run is skipped rather than recorded as
                # an accuracy of 0, which would silently drag the mean down.
                print('failed to read test acc: ', file_path, f'({type(err).__name__})')

        # Average over the seeds that were actually read. NaN marks a training
        # size with no usable run, so it shows up as a gap instead of a fake 0.
        if test_acc_list:
            avg_test_acc = sum(test_acc_list) / len(test_acc_list)
        else:
            avg_test_acc = float('nan')

        # Always append exactly one entry per size so downstream cells can rely
        # on the list being aligned with train_size.
        data[model]['avg_test_acc'].append((size, avg_test_acc))

print('claude 2: ', data['claude_2'])
print('Llama-2-7b-chat: ', data['Llama-2-7b-chat'])
print('turbo35 0613: ', data['turbo35_0613'])

failed to read test acc:  /data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/retweet/batched_inference/inference_model_turbo35_0613_generation_model_Llama-2-7b-chat_train100_seed49_hypothesis5.txt
failed to read test acc:  /data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/retweet/batched_inference/inference_model_turbo35_0613_generation_model_Llama-2-7b-chat_train100_seed50_hypothesis5.txt
failed to read test acc:  /data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/retweet/batched_inference/inference_model_turbo35_0613_generation_model_Llama-2-7b-chat_train100_seed51_hypothesis5.txt
failed to read test acc:  /data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/retweet/batched_inference/inference_model_turbo35_0613_generation_model_turbo35_0613_train50_seed49_hypothesis5.txt
failed to read test acc:  /data/rosa/work_in_progress/past-interaction-learning/print_log_tmp_debug/retweet/batched_inferenc

In [None]:
# # get no hypothesis baseline
# baseline_data = {}

# for model in models:
#     acc_list = []
#     for seed in seeds:
#         file_path = f"{baseline_dir}/inference_with_{model}_seed{seed}.txt"
#         with open(file_path, 'r') as f:
#             lines = f.readlines()
#             last_line = lines[-1]
#             test_acc = float(last_line.split()[-1])
#             acc_list.append(test_acc)
#             print('test_acc: ', test_acc)
#     avg_test_acc = sum(acc_list) / len(acc_list)
#     baseline_data[model] = avg_test_acc

# print('baseline_data: ', baseline_data)

In [9]:
# Plot the seed-averaged test accuracy against the number of training examples.
# Colour identifies the model; the stroke dash separates the 'avg_test_acc'
# series (dash pattern [0, 0]) from the 'finetuning' baseline (dash pattern [4, 4]).


def _avg_test_acc_frame(model):
    """Build the plotting frame for one model.

    Reads the (train size, accuracy) tuples stored in
    data[model]['avg_test_acc'] and tags every row with the series type and
    the model name used by the colour / stroke-dash encodings below.
    """
    points = data[model]['avg_test_acc']
    return pd.DataFrame({
        'training examples': [point[0] for point in points],
        'accuracy': [point[1] for point in points],
        'type': 'avg_test_acc',
        'model': model,
    })


claude_test_acc = _avg_test_acc_frame('claude_2')
llama_test_acc = _avg_test_acc_frame('Llama-2-7b-chat')
turbo_test_acc = _avg_test_acc_frame('turbo35_0613')

# Hard-coded fine-tuning baseline, one value per entry of train_size.
roberta_baseline = pd.DataFrame({
    'training examples': train_size,
    'accuracy': [
        0.4533333333333333,
        0.5033333333333333,
        0.5033333333333333,
        0.52,
        0.5,
        0.5066666666666667,
        0.49,
    ],
    'type': 'finetuning',
    'model': 'roberta',
})

# Concatenate the data into a single long-format DataFrame
plot_data = pd.concat(
    [claude_test_acc, llama_test_acc, turbo_test_acc, roberta_baseline],
    ignore_index=True,
)

# Create Altair chart with color encoding
# NOTE(review): the title says "Shoe Dataset" while the logs are read from a
# 'retweet' directory -- confirm which dataset this figure actually shows.
chart = alt.Chart(plot_data).mark_line().encode(
        x='training examples',
        y='accuracy',
        color=alt.Color('model:N', scale=alt.Scale(
            domain = ['claude_2', 'Llama-2-7b-chat', 'turbo35_0613', 'roberta'],
            range = ['#88CCEE', '#44AA99', '#CC6677', '#DDCC77']
        )),
        strokeDash=alt.StrokeDash('type:N', scale=alt.Scale(
            domain = ['avg_test_acc', 'finetuning'],
            range = [[0,0], [4, 4]]
        )),
    ).properties(
        title='Accuracy in Low Data Regime (Batched Learning, Shoe Dataset)',
    )

# configure_title returns a new chart instead of modifying the existing one,
# so the result has to be kept for the styling to reach the displayed chart.
chart = chart.configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
)

# Display the chart
chart

In [None]:
# # I want to add a data point for another model at (100, 0.913)
# dot_chart = alt.Chart(pd.DataFrame({'training examples': [100], 'accuracy': [0.913]})).mark_point(color='black').encode(
#     x='training examples',
#     y='accuracy',
#     color = '')

# dot_chart

### Plot the test accuracy of the supervised linear model, LinUCB, and our method on the shoe dataset (up to 1k training examples)

In [None]:
# lin_ucb = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [26.4, 34.2, 39.2, 40.8],
#     'method': 'Lin UCB'
# })

# supervised = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [56.0, 62.0, 82.0, 97.0],
#     'method': 'Supervised Learning'
# })

# inductive_reasoning_greedy = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [76.1, 76.8, 73.4, 80.8],
#     'method': 'Old algorithm + 0613 (greedy)'
# })

# inductive_reasoning_relevant = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [77.4, 82.2, 67.0, 81.1],
#     'method': 'Old algorithm + 0613 (relevant)'
# })

# inductive_reasoning_bv = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [83.5, 83.2, 80.5, 82.8],
#     'method': 'Old algorithm + 0613 (binary vote)'
# })

# new_greedy = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [81.0, 71.0, 69.3, 56.3],
#     'method': 'New algorithm + 1106 (greedy)'
# })

# new_relevant = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [79.3, 69.3, 72.3, 70.7],
#     'method': 'New algorithm + 1106 (relevant)'
# })

# new_bv = pd.DataFrame({
#     'training examples': [100, 200, 500, 1000],
#     'accuracy on unseen examples (%)': [81.7, 81.7, 77.3, 80.0],
#     'method': 'New algorithm + 1106 (binary vote)'
# })

# # Concatenate the data into a single DataFrame
# data = pd.concat([
#                     # lin_ucb, 
#                     # supervised, 
#                     inductive_reasoning_greedy,
#                     inductive_reasoning_relevant,
#                     inductive_reasoning_bv,
#                     new_greedy,
#                     new_relevant,
#                     new_bv,
#                   ], 
#                   ignore_index=True)

# # Create Altair chart with color encoding
# chart = alt.Chart(data).mark_line().encode(
#         x='training examples',
#         y='accuracy on unseen examples (%)',
#         color='method'
#     ).properties(
#         title='Avg Acc over 3 Seeds vs Train Size',
#     )

# chart.configure_title(
#     fontSize=20,
#     font='Courier',
#     anchor='start',
#     color='gray'
# )

# # Display the chart
# chart
