# Create linear prompts and evaluate models

In [1]:
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import csv
import os.path

from LinearGraphGenerator import *
from ModelFunctions import *

# Increase resolution of plots
plt.rcParams['figure.dpi'] = 150
plt.rcParams['figure.figsize'] = [15, 7]

%load_ext autoreload
%autoreload 2

In [2]:
# Model selection: one active choice per provider. The alternatives are kept
# as commented-out lines so they can be switched on by (un)commenting.

# OpenAI
GPT_model = "gpt-4o-mini"
# GPT_model = "gpt-4o"
# GPT_model = "o1-preview"

# Anthropic
anthropic_model = "claude-3-5-haiku-latest"
# anthropic_model = "claude-3-5-sonnet-latest"

# Google
google_model = "gemini-1.5-flash"
# google_model = "gemini-1.5-pro"

# Every model identifier covered by the evaluation, across all providers
all_models = [
    "gpt-4o-mini",
    "gpt-4o",
    "o1-preview",
    "claude-3-5-sonnet-latest",
    "claude-3-5-haiku-latest",
    "gemini-1.5-flash",
    "gemini-1.5-pro",
]

In [3]:
# Smoke test for CaseGenerator: build one small case (arguments left
# unshuffled) and display it as the cell output for a visual check.
example_num_args = 2
gen = CaseGenerator(shuffle_arguments=False)
c = gen.generate_case(example_num_args)
c

Case with 2 arguments and the following prompt:
###
The following is a reasoning puzzle. Witnesses should be believed unless there is testimony that they are lying. Now consider the following facts:
Witness Etta says that the plane is landing.
Witness Zuri says that witness Etta is lying.
Question: should it be believed that the plane is landing?
End your answer with: "Answer: yes or no"
###

Answer: False

### Create a temporary static benchmark for testing purposes

In [4]:
# Build a static benchmark: for every variation, generate one case per
# argument count in [min_args, max_args] and write all cases to a single CSV.
num_variations = 100 # Number of ontological variations (passes over the argument range)
min_args = 1 # Minimum number of arguments in graph
max_args = 25 # Maximum number of arguments in graph
shuffle = False # Whether to shuffle the arguments
run = '' # Identifier appended to the output filename

# DataFrame.to_csv does not create missing directories; make sure the target
# folder exists so the cell also works on a fresh checkout.
# (`os` is bound by the notebook's `import os.path`.)
benchmark_dir = 'static_benchmarks'
os.makedirs(benchmark_dir, exist_ok=True)

df = []  # list of per-case records; converted to a DataFrame once at the end
gen = CaseGenerator(shuffle_arguments=shuffle)
for iter_x in range(0, num_variations):
    for num_args in range(min_args, max_args+1):
        c = gen.generate_case(num_args)
        case = {
            # '<variation index>_<argument count>'; the evaluation loop later
            # recovers the variation index by splitting on '_'
            'id': str(iter_x) + '_' + str(c.num_arguments),
            'num_args' : c.num_arguments,
            'prompt' : c.prompt,
            'answer' : c.answer,
         }
        df.append(case)

# NOTE: the loading cell further down reads the benchmark by a literal
# filename that matches these defaults; keep the two in sync when changing
# num_variations, max_args, shuffle or run.
pd.DataFrame(df).to_csv(benchmark_dir+'/static_linear_'+str(num_variations)+'_'+str(max_args)+'_shuffle_'+str(shuffle)+run+'.csv', index=False)

## Call the LLMs and collect the results

In [5]:
# Read the static benchmark back from disk and show it as the cell output
benchmark_csv = 'static_benchmarks/static_linear_100_25_shuffle_False.csv'
case_df = pd.read_csv(benchmark_csv)
case_df

Unnamed: 0,id,num_args,prompt,answer
0,0_1,1,The following is a reasoning puzzle. Witnesses...,1
1,0_2,2,The following is a reasoning puzzle. Witnesses...,0
2,0_3,3,The following is a reasoning puzzle. Witnesses...,1
3,0_4,4,The following is a reasoning puzzle. Witnesses...,0
4,0_5,5,The following is a reasoning puzzle. Witnesses...,1
...,...,...,...,...
2495,99_21,21,The following is a reasoning puzzle. Witnesses...,1
2496,99_22,22,The following is a reasoning puzzle. Witnesses...,0
2497,99_23,23,The following is a reasoning puzzle. Witnesses...,1
2498,99_24,24,The following is a reasoning puzzle. Witnesses...,0


In [6]:
# Run every benchmark case through each selected model and append one row per
# case to a per-model results file.
run = '_linear'  # suffix of the results filename
result_df = []

all_models = ['gpt-4o-mini', 'gpt-4o', 'o1-preview', 'claude-3-5-sonnet-latest', 'claude-3-5-haiku-latest', "gemini-1.5-flash", "gemini-1.5-pro"]
model = 'dummy' # Replace this
# Override: only the listed model(s) are evaluated. Remove or edit this line to
# run the full list above.
all_models = ['dummy']

total_cases = len(case_df)

for model in all_models:
    print('Model:', model)
    # Dots are stripped from the model name when building the filename
    filename = 'results/'+model.replace('.', '') + run + '.csv'
    # Positional offset of the first case to process (0 = start from the top);
    # the slice below skips all cases before it.
    current_id = 0

    start_time = monotonic()
    for idx, case in case_df.iloc[current_id:].iterrows():
        # The id has the form '<iteration>_<num_args>'; keep the iteration part
        iteration = int(case['id'].split('_')[0])
        print_status(idx, total_cases, current_id, start_time)
        c = Case(case['num_args'], case['prompt'], case['answer'])
        result = make_llm_call(c.prompt, model=model)
        write_csv(filename, {
            'iteration': iteration,
            'num_args': c.num_arguments,
            'prompt': c.prompt,
            'full_response': result,
            'processed_response': process_answer(result),
            'correct_answer': c.answer,
        })
    # Final status line. Report the total directly instead of `idx+1`: `idx`
    # is unbound (or stale from an earlier run) when the slice above is empty,
    # e.g. when current_id >= len(case_df). After a full pass over the default
    # RangeIndex both values are identical.
    print_status(total_cases, total_cases, current_id, start_time)
    print('\n')

Model: dummy
(2500/2500) 100.0% ETA:0:0:0

