In [1]:
import pandas as pd
import numpy as np 

from SelfRefine import SelfRefine
import random
from tqdm import tqdm


In [2]:
gpqa = pd.read_csv("gpqa_main.csv")

In [3]:
gpqa_QA = gpqa[[ 'Question', 'Pre-Revision Incorrect Answer 1', 'Pre-Revision Incorrect Answer 2',
       'Pre-Revision Incorrect Answer 3', 'Correct Answer',
       ]]
gpqa_QA.head()

Unnamed: 0,Question,Pre-Revision Incorrect Answer 1,Pre-Revision Incorrect Answer 2,Pre-Revision Incorrect Answer 3,Correct Answer
0,"A large gene has dozens of exons, of which the...",lariat,poly(A) tail,antisense,R-loops
1,Two quantum states with energies E1 and E2 hav...,10^-8 ev,10^-9 ev,10^-11 ev,10^-4 eV
2,trans-cinnamaldehyde was treated with methylma...,10,12,14,11
3,how many of the following compounds exhibit op...,3,5,6,4
4,A coating is applied to a substrate resulting ...,129°,134°,139°,124°


In [4]:
gpqa_QA_sample = gpqa_QA[:25]
gpqa_QA_sample

Unnamed: 0,Question,Pre-Revision Incorrect Answer 1,Pre-Revision Incorrect Answer 2,Pre-Revision Incorrect Answer 3,Correct Answer
0,"A large gene has dozens of exons, of which the...",lariat,poly(A) tail,antisense,R-loops
1,Two quantum states with energies E1 and E2 hav...,10^-8 ev,10^-9 ev,10^-11 ev,10^-4 eV
2,trans-cinnamaldehyde was treated with methylma...,10,12,14,11
3,how many of the following compounds exhibit op...,3,5,6,4
4,A coating is applied to a substrate resulting ...,129°,134°,139°,124°
5,Consider the following metric:\n\nds^{2}=\frac...,0,4\pi\left(x^{2}+y^{2}\right),4\pi\left(x^{2}-y^{2}\right),+\infty
6,"aniline is heated with sulfuric acid, forming ...",6,7,9,8
7,A spin-half particle is in a linear superposit...,7.2,8.6,9.1,-0.7
8,In a parallel universe where a magnet can have...,The ones related to the divergence and the cur...,The one related to the divergence of the magne...,The one related to the circulation of the magn...,The ones related to the circulation of the ele...
9,"In a cycloaddition reaction, two π systems com...","A = 5-methylcyclohex-3-ene-1-carbonitrile, B =...","A = cyclohexa-2,4-diene-1-carbonitrile, B = me...","A = cyclohexa-2,4-diene-1-carbonitrile, B = me...","A = 5-methylcyclohex-3-ene-1-carbonitrile, B =..."


In [5]:
def create_test_examples(dataframe):
    test_examples = []
    
    for index, row in dataframe.iterrows():
        question = row['Question']
        incorrect_answers = [
            row['Pre-Revision Incorrect Answer 1'],
            row['Pre-Revision Incorrect Answer 2'],
            row['Pre-Revision Incorrect Answer 3']
        ]
        correct_answer = row['Correct Answer']
        
        all_answers = incorrect_answers + [correct_answer]
        random.shuffle(all_answers)
        
        formatted_answers = ',\n'.join(all_answers)
        
        prompt = f"""
        {question}
        \n
        Your possible answers are:
        {formatted_answers}
        \n 
        Select the best possible answer. 
        """
        
        example = {
            "prompt": prompt,
            "iterations": 3
        }
        
        test_examples.append(example)
    
    return test_examples

# Generate the test examples
test_examples = create_test_examples(gpqa_QA_sample)
test_examples

[{'prompt': "\n        A large gene has dozens of exons, of which the central ones code for folded triple helical repeats that connect the cytoskeleton with sarcolemma and extracellular space. Each exon usually codes for one folded triple alpha helix. The most common mutations of the gene are central exon deletions that create out-of-frame peptides and progressive degenerative organ waste. A solution is to deliver a Morpholino that recognizes the 5' end of the out-of-frame exon in pre-mRNA. The molecule prevents binding of the spliceosome and creates exon skipping and in-frame joining. Several missing exons are well tolerated by an organism. Which structure below is not involved in the proposed therapy?\n        \n\n        Your possible answers are:\n        lariat,\nR-loops,\npoly(A) tail,\nantisense\n        \n \n        Select the best possible answer. \n        ",
  'iterations': 5},
 {'prompt': '\n        Two quantum states with energies E1 and E2 have a lifetime of 10^-9 sec and

In [6]:
refiner = SelfRefine(model="gpt-4o-mini")

results = []

for example in tqdm(test_examples, desc="Testing GPQA examples"):
    initial_text = refiner.generate_initial_text(example['prompt'])
    
    final_text = refiner.iterate_refinement(iterations=example['iterations'])
    
    results.append(final_text)

print(f"Completed testing {len(results)} examples")


Testing GPQA examples:   4%|▍         | 1/25 [00:38<15:30, 38.78s/it]

Model indicates no further refinements are necessary.


Testing GPQA examples:   4%|▍         | 1/25 [01:23<33:25, 83.58s/it]


KeyboardInterrupt: 

In [9]:
gpqa_QA_sample['SelfRefine_answers'] = results
gpqa_QA_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gpqa_QA_sample['SelfRefine_answers'] = results


Unnamed: 0,Question,Pre-Revision Incorrect Answer 1,Pre-Revision Incorrect Answer 2,Pre-Revision Incorrect Answer 3,Correct Answer,SelfRefine_answers
0,"A large gene has dozens of exons, of which the...",lariat,poly(A) tail,antisense,R-loops,The structure that is not involved in the prop...
1,Two quantum states with energies E1 and E2 hav...,10^-8 ev,10^-9 ev,10^-11 ev,10^-4 eV,To clearly distinguish between two quantum sta...
2,trans-cinnamaldehyde was treated with methylma...,10,12,14,11,To determine the number of carbon atoms in pro...
3,how many of the following compounds exhibit op...,3,5,6,4,To determine how many of the following compoun...
4,A coating is applied to a substrate resulting ...,129°,134°,139°,124°,To estimate the contact angle of a droplet of ...
5,Consider the following metric:\n\nds^{2}=\frac...,0,4\pi\left(x^{2}+y^{2}\right),4\pi\left(x^{2}-y^{2}\right),+\infty,To determine the area of the pseudosphere with...
6,"aniline is heated with sulfuric acid, forming ...",6,7,9,8,To determine the number of distinct non-exchan...
7,A spin-half particle is in a linear superposit...,7.2,8.6,9.1,-0.7,To determine the expectation value of the oper...
8,In a parallel universe where a magnet can have...,The ones related to the divergence and the cur...,The one related to the divergence of the magne...,The one related to the circulation of the magn...,The ones related to the circulation of the ele...,In a parallel universe where magnets can exist...
9,"In a cycloaddition reaction, two π systems com...","A = 5-methylcyclohex-3-ene-1-carbonitrile, B =...","A = cyclohexa-2,4-diene-1-carbonitrile, B = me...","A = cyclohexa-2,4-diene-1-carbonitrile, B = me...","A = 5-methylcyclohex-3-ene-1-carbonitrile, B =...",To thoroughly analyze the cycloaddition reacti...


In [10]:
from dotenv import load_dotenv
import os 
from openai import OpenAI

load_dotenv() 
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [13]:
gpt_4o_results = []
for example in tqdm(test_examples, desc="Testing GPQA examples"):

    response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages= [{"role": "system", "content": "You are a helpful assistant that generates high-quality text based on prompts."},
                            {"role": "user", "content": example['prompt']}],
                max_tokens=4096,
                temperature=0.7
            )
    ans = response.choices[0].message.content.strip()
    gpt_4o_results.append(ans)
gpt_4o_results

Testing GPQA examples: 100%|██████████| 25/25 [02:50<00:00,  6.80s/it]


['The structure that is not involved in the proposed therapy is the **poly(A) tail**. \n\nIn the context of the Morpholino therapy described, the Morpholino acts as an antisense oligonucleotide that binds to the pre-mRNA, preventing the spliceosome from recognizing the out-of-frame exon. This leads to exon skipping and in-frame joining of the remaining exons. R-loops may form as a result of the binding of the Morpholino and other interactions during transcription. The lariat structure is involved in the splicing process itself. However, the poly(A) tail is primarily associated with the stability and transport of mature mRNA and does not play a direct role in the mechanism of exon skipping described in the therapy.',
 "To determine the energy difference that allows for the clear distinction between two quantum states with given lifetimes, we can use the energy-time uncertainty principle, which states:\n\n\\[\n\\Delta E \\cdot \\Delta t \\geq \\frac{\\hbar}{2}\n\\]\n\nwhere \\(\\Delta E\

In [14]:
gpqa_QA_sample['4o_mini_answers'] = gpt_4o_results
gpqa_QA_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gpqa_QA_sample['4o_mini_answers'] = gpt_4o_results


Unnamed: 0,Question,Pre-Revision Incorrect Answer 1,Pre-Revision Incorrect Answer 2,Pre-Revision Incorrect Answer 3,Correct Answer,SelfRefine_answers,4o_mini_answers
0,"A large gene has dozens of exons, of which the...",lariat,poly(A) tail,antisense,R-loops,The structure that is not involved in the prop...,The structure that is not involved in the prop...
1,Two quantum states with energies E1 and E2 hav...,10^-8 ev,10^-9 ev,10^-11 ev,10^-4 eV,To clearly distinguish between two quantum sta...,To determine the energy difference that allows...
2,trans-cinnamaldehyde was treated with methylma...,10,12,14,11,To determine the number of carbon atoms in pro...,To determine the number of carbon atoms in pro...
3,how many of the following compounds exhibit op...,3,5,6,4,To determine how many of the following compoun...,To determine how many of the listed compounds ...
4,A coating is applied to a substrate resulting ...,129°,134°,139°,124°,To estimate the contact angle of a droplet of ...,To estimate the contact angle of a droplet of ...
5,Consider the following metric:\n\nds^{2}=\frac...,0,4\pi\left(x^{2}+y^{2}\right),4\pi\left(x^{2}-y^{2}\right),+\infty,To determine the area of the pseudosphere with...,To find the area of the pseudosphere described...
6,"aniline is heated with sulfuric acid, forming ...",6,7,9,8,To determine the number of distinct non-exchan...,To determine the number of distinct non-exchan...
7,A spin-half particle is in a linear superposit...,7.2,8.6,9.1,-0.7,To determine the expectation value of the oper...,To find the expectation value of the operator ...
8,In a parallel universe where a magnet can have...,The ones related to the divergence and the cur...,The one related to the divergence of the magne...,The one related to the circulation of the magn...,The ones related to the circulation of the ele...,In a parallel universe where magnets can exist...,In a parallel universe where magnets can have ...
9,"In a cycloaddition reaction, two π systems com...","A = 5-methylcyclohex-3-ene-1-carbonitrile, B =...","A = cyclohexa-2,4-diene-1-carbonitrile, B = me...","A = cyclohexa-2,4-diene-1-carbonitrile, B = me...","A = 5-methylcyclohex-3-ene-1-carbonitrile, B =...",To thoroughly analyze the cycloaddition reacti...,To determine the correct products for the give...


In [15]:
gpqa_QA_sample.to_csv("gpqa_QA_sample.csv")

In [27]:
with open('qa_comparison_results.md', 'w') as file:
    for index, row in gpqa_QA_sample[[
        'Correct Answer', 'SelfRefine_answers', '4o_mini_answers'
    ]].iterrows():
        file.write(f"\n## Question #{index}\n\n")
        file.write(f"### Correct Answer:\n{row['Correct Answer']}\n\n")
        file.write(f"### SelfRefine Answer:\n{row['SelfRefine_answers']}\n\n")
        file.write(f"### 4o-mini Answer:\n{row['4o_mini_answers']}\n\n")
        file.write("---\n")