# Create non-linear prompts and evaluate models

In [1]:
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from NonLinearGraphGenerator import *
from ModelFunctions import *

# Increase resolution of plots
plt.rcParams['figure.dpi'] = 150
plt.rcParams['figure.figsize'] = [15, 7]

%load_ext autoreload
%autoreload 2

In [2]:
# OpenAI models
GPT_model = 'gpt-4o-mini'
# GPT_model = 'gpt-4o'
# GPT_model = 'o1-preview'

#Anthropic models
anthropic_model = 'claude-3-5-haiku-latest'
# anthropic_model = 'claude-3-5-sonnet-latest'

#Google models
google_model = "gemini-1.5-flash"
# google_model = "gemini-1.5-pro"

all_models = ['gpt-4o-mini', 'gpt-4o', 'o1-preview', 'claude-3-5-sonnet-latest', 'claude-3-5-haiku-latest', "gemini-1.5-flash", "gemini-1.5-pro"]

In [3]:
gen = CaseGenerator(shuffle_arguments=False)
c = gen.generate_case((1,2,))
c

Case with 4 arguments and the following prompt:
###
The following is a reasoning puzzle. Witnesses should be believed unless there is testimony that they are lying. Now consider the following facts:
Witness Mallory says that the book is interesting.
Witness Sloane says that witness Mallory is lying.
Witness Imani says that witness Mallory is lying.
Witness Nina says that witness Imani is lying.
Question: should it be believed that is the book interesting?
End your answer with: "Answer: yes or no"
###

Answer: False

### Create a temporary static benchmark for testing purposes

In [4]:
num_variations = 5 # Num ontological variations
min_args = 1 # Minimum number of arguments in graph
max_args = 15 # Maxmimum number of arguments in graph
shuffle = False # whether to shuffle the arguments
run = '' #Identifier


df = []
gen = CaseGenerator(shuffle_arguments=shuffle)
for iter_x in range(0, num_variations):
    for num_args in range(min_args, max_args+1):
        all_cases = gen.generate_all_cases(num_args)
        for c in all_cases:
            case = {
                'id': str(iter_x) + '_' + '_'.join(str(n) for n in c.format),
                'num_args' : c.num_arguments,
                'format' : c.format,
                'prompt' : c.prompt,
                'answer' : c.answer,
             }
            df.append(case)
pd.DataFrame(df).to_csv('static_benchmarks/static_non_linear_'+str(num_variations)+'_'+str(num_args)+'_shuffle_'+str(shuffle)+run+'.csv', index=False)

In [5]:
len(df)

2540

In [6]:
# Create a shuffled version
def shuffle_statements_prompt(prompt):
    split_prompt = prompt.split('\n')
    statements = split_prompt[1:-2]
    random.shuffle(statements)
    new_prompt = '\n'.join([split_prompt[0]] + statements + split_prompt[-2:])
    return new_prompt
    
shuffle_df = pd.read_csv('static_benchmarks/static_non_linear_'+str(num_variations)+'_'+str(max_args)+'_shuffle_'+str(shuffle)+run+'.csv')
shuffle_df['prompt'] = shuffle_df.apply(lambda x: shuffle_statements_prompt(x['prompt']), axis=1)
shuffle_df.to_csv('static_benchmarks/static_non_linear_'+str(num_variations)+'_'+str(max_args)+'_shuffle_True.csv')
# print(df['prompt'][50], '\n')
# print(shuffle_df['prompt'][50])

## Make calls to LLM and get results

In [7]:
case_df = pd.read_csv('static_benchmarks/static_non_linear_5_15_Shuffle_False' + run + '.csv')
print(len(case_df))
case_df.head()

2540


Unnamed: 0,id,num_args,format,prompt,answer
0,0_0,1,"(0,)",The following is a reasoning puzzle. Witnesses...,1
1,0_1,2,"(1,)",The following is a reasoning puzzle. Witnesses...,0
2,0_2,3,"(2,)",The following is a reasoning puzzle. Witnesses...,1
3,0_1_1,3,"(1, 1)",The following is a reasoning puzzle. Witnesses...,0
4,0_3,4,"(3,)",The following is a reasoning puzzle. Witnesses...,0


In [8]:
run = '_nonlinear'
result_df = []

all_models = ['gpt-4o-mini', 'gpt-4o',  'claude-3-5-haiku-latest', 'claude-3-5-sonnet-latest', "gemini-1.5-flash", "gemini-1.5-pro"] 
all_models = ['dummy'] # Change this

for model in all_models:
    print('Model:', model)
    filename = 'results/'+model.replace('.', '') + run + '.csv'
    current_id = 0
    
    start_time = monotonic()
    for idx, case in case_df.iloc[current_id:].iterrows():
        iteration = int(case['id'].split('_')[0])
        print_status(idx, len(case_df), current_id, start_time)
        c = Case(case['num_args'], case['format'], case['prompt'], case['answer'])
        result = make_llm_call(c.prompt, model=model)
        write_csv(filename, {
            'iteration': iteration,
            'num_args': c.num_arguments,
            'format': c.format,
            'prompt': c.prompt,
            'full_response': result,
            'processed_response': process_answer(result),
            'correct_answer': c.answer,
        })
    print_status(idx+1, len(case_df), current_id, start_time)
    print('\n')

Model: dummy
(2540/2540) 100.0% ETA:0:0:0



## Shuffled: Make calls to LLM and get results

In [9]:
case_df = pd.read_csv('static_benchmarks/static_non_linear_5_15_Shuffle_True.csv')
print(len(case_df))
case_df.head()

2540


Unnamed: 0.1,Unnamed: 0,id,num_args,format,prompt,answer
0,0,0_0,1,"(0,)",The following is a reasoning puzzle. Witnesses...,1
1,1,0_1,2,"(1,)",The following is a reasoning puzzle. Witnesses...,0
2,2,0_2,3,"(2,)",The following is a reasoning puzzle. Witnesses...,1
3,3,0_1_1,3,"(1, 1)",The following is a reasoning puzzle. Witnesses...,0
4,4,0_3,4,"(3,)",The following is a reasoning puzzle. Witnesses...,0


In [10]:
run = '_nonlinear_shuffled'
result_df = []

all_models = ['gpt-4o-mini', 'gpt-4o',  'claude-3-5-haiku-latest', 'claude-3-5-sonnet-latest', "gemini-1.5-flash", "gemini-1.5-pro"] 
all_models = ['dummy'] # Change this

for model in all_models:
    print('Model:', model)
    filename = 'results/'+model.replace('.', '') + run + '.csv'
    current_id = 0
    
    start_time = monotonic()
    for idx, case in case_df.iloc[current_id:].iterrows():
        iteration = int(case['id'].split('_')[0])
        print_status(idx, len(case_df), current_id, start_time)
        c = Case(case['num_args'], case['format'], case['prompt'], case['answer'])
        result = make_llm_call(c.prompt, model=model)
        write_csv(filename, {
            'iteration': iteration,
            'num_args': c.num_arguments,
            'format': c.format,
            'prompt': c.prompt,
            'full_response': result,
            'processed_response': process_answer(result),
            'correct_answer': c.answer,
        })
    print_status(idx+1, len(case_df), current_id, start_time)
    print('\n')

Model: dummy
(2540/2540) 100.0% ETA:0:0:0

