## JEE Bench using Chain of Thought.

In [None]:
import json
import pickle
from tqdm import trange
import time
import pandas as pd
import os
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff
import openai

# Azure OpenAI connection settings.
# The endpoint, API version and key are read from the environment so that
# credentials never have to be typed into (or committed with) this notebook.
# When a variable is unset the value falls back to the original blank
# placeholder, so filling the strings in by hand still works as before.
openai.api_type = "azure"
openai.api_base = os.environ.get("OPENAI_API_BASE", "")
openai.api_version = os.environ.get("OPENAI_API_VERSION", "")
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Load the benchmark questions. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of using the locale default.
with open("./data/dataset.json", 'r', encoding="utf-8") as f:
    l = json.load(f)
    

# Instruction placed at the start of the prompt, keyed by the question's
# 'type' field.
prompts_dic = {
    "MCQ": "In this problem, only one option will be correct. Give a detailed solution and end the solution with the final answer.",
    "MCQ(multiple)": "In this problem, multiple options can be correct. Give a detailed solution and end the solution with the final answer.",
    "Integer": "In this problem, the final answer will be a non-negative integer. Give a detailed solution and end the solution with the final answer.",
    # Fixed wording: this entry previously read "the final will be a numeric
    # value" (the word "answer" was missing, unlike the "Integer" entry).
    "Numeric": "In this problem, the final answer will be a numeric value. Give the numerical answer correct upto the 2nd decimal digit. Give a detailed solution and end the solution with the final answer.",
}

# Description of the expected final-answer format for each question type;
# the matching text is inserted after "Final_answer:" in the prompt.
ans_dic = {
    "MCQ": "A or B or C or D.",
    "MCQ(multiple)": "if correct options are A, B and D give ABD alphabetically.",
    "Integer": "Int number only.",
    "Numeric": "Float number only.",
}


def get_prompt(question: dict):
    """Build the chain-of-thought prompt for one dataset entry.

    Args:
        question: Dataset entry; its 'type', 'question' and 'gold' keys are read.

    Returns:
        A ``(prompt, gold)`` tuple: the full prompt text and the entry's
        'gold' value.

    Raises:
        KeyError: If a required key is missing or the type is not present in
            ``prompts_dic`` / ``ans_dic``.
    """
    kind = question['type']

    instruction = prompts_dic[kind]
    answer_format = f"""\n\nGive final answer as Final_answer: {ans_dic[kind]}"""

    pieces = [
        instruction,
        "\n\nProblem: ",
        question['question'],
        answer_format,
        # Chain-of-thought trigger phrase closes the prompt.
        "\n\nLet’s think step by step",
    ]
    return "".join(pieces), question['gold']


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def output(text):
    """Send ``text`` as a single user message and return the model's reply.

    The decorator retries a failing call with randomised exponential backoff
    (wait bounds 1 and 60) and stops after 6 attempts.

    Args:
        text: The complete prompt to send.

    Returns:
        The message content of the first returned choice.
    """
    request = dict(
        engine="gpt-4-32k",
        messages=[{"role": "user", "content": text}],
        temperature=0,  # temperature 0 so that generations are reproducible
        max_tokens=3048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content


# Empty results table: the dataset fields plus the model prediction ('pred').
df = pd.DataFrame({
    column: []
    for column in ('description', 'index', 'subject', 'type', 'question', 'gold', 'pred')
})


# Ask the model every question in turn and append one result row per question
# that was answered successfully.
CHECKPOINT_EVERY = 10  # write a partial spreadsheet every N-th question index
ROW_FIELDS = ('description', 'index', 'subject', 'type', 'question', 'gold', 'pred')
failed_indices = []    # positions in `l` whose processing raised

for i in trange(len(l)):
    item = l[i]

    try:
        # get_prompt also returns the gold answer; it is not needed here
        # because the row below reads it straight from the dataset entry.
        prompt, _gold = get_prompt(item)
        item['pred'] = output(prompt)

        # Values are listed in the same order as the DataFrame's columns.
        df.loc[len(df)] = tuple(item[field] for field in ROW_FIELDS)

        if i % CHECKPOINT_EVERY == 0:
            df.to_excel('Jee_COT.xlsx', index=False)

    except Exception as e:
        # Best effort: a failing question must not abort the whole run, but it
        # is recorded so it can be reported (and re-run) afterwards.
        print(f"Exception {e} at {i}")
        failed_indices.append(i)

if failed_indices:
    print(f"{len(failed_indices)} question(s) failed and are missing from the output: {failed_indices}")

df.to_excel('./data/Jee_COT_output.xlsx', index=False)

## JEEBench using COT.

1. Run COT.py to generate the GPT-4 output for all questions; it writes periodic checkpoints to **Jee_COT.xlsx** and saves the final results to **Jee_COT_output.xlsx**.
2. Run COT_Scoring.py to manually score the GPT-4 generations against the gold answers and generate **Jee_COT_Scores.xlsx**.
3. Run this notebook to see the scores for each Subject and Category.

In [1]:
import pandas as pd

# Model generations, one spreadsheet row per question.
# NOTE(review): the generation code writes './data/Jee_COT_output.xlsx' while
# this cell reads from './output/' — confirm the file is copied there.
Jee_cot_generations = pd.read_excel("./output/Jee_COT_output.xlsx")
# Scores table; the cells below aggregate its 'Score' column by 'subject' and 'type'.
Scores = pd.read_excel("./output/Jee_COT_Scores.xlsx")

In [2]:
# Mean score per subject.
Scores.groupby(['subject'])['Score'].mean()

subject
chem    0.420290
math    0.211823
phy     0.252336
Name: Score, dtype: float64

In [3]:
# Mean score per question type.
Scores.groupby(['type'])['Score'].mean()

type
Integer          0.172414
MCQ              0.373494
MCQ(multiple)    0.329545
Numeric          0.221374
Name: Score, dtype: float64

In [4]:
# Overall mean score across all rows of the scores table.
Scores['Score'].mean()

0.2857142857142857