In [2]:
import pandas as pd
import json
import os
import re
from openai import OpenAI #estamos la clase concreta OpenAI del módulo openai
from dotenv import load_dotenv #importamos una función concreta del módulo
load_dotenv("template.env")

True

In [3]:
# Read the OpenAI API key from the process environment
api_key = os.environ.get("OPENAI_API_KEY")

# Stop right away when the key is missing instead of failing on the first request
if api_key is None:
    raise ValueError('La clave de API no está configurada en el archivo .env')

# Client object used by every API call in this notebook
client = OpenAI()

In [4]:
# Locate the dataset workbook inside the folder configured through DATASET_FOLDER
dataset_folder = os.getenv('DATASET_FOLDER')
if dataset_folder is None:
    # Without this check str(None) would silently produce the path "NoneMMLU_pro.xlsx"
    raise ValueError('DATASET_FOLDER is not set in the environment / .env file')

# os.path.join inserts the separator only when the folder does not already end with one
dataset_path = os.path.join(dataset_folder, "MMLU_pro.xlsx")

df = pd.read_excel(dataset_path)
# Uncomment to work on a small random sample while developing
#df = df.sample(30)
df.head()

Unnamed: 0,question_id,question,options,answer,answer_index,cot_content,category,src
0,70,"Typical advertising regulatory bodies suggest,...","['Safe practices, Fear, Jealousy, Trivial'\n '...",I,,8.0,business,ori_mmlu-business_ethics
1,71,Managers are entrusted to run the company in t...,"['Shareholders, Diligence, Self-interest'\n 'S...",F,5.0,,business,ori_mmlu-business_ethics
2,72,There are two main issues associated with ____...,"['Down, Autonomy, Remuneration, Benefit'\n 'Do...",J,9.0,,business,ori_mmlu-business_ethics
3,73,_______ locate morality beyond the sphere of r...,['Ethical egoism' 'Ethics of duty' 'Postmodern...,C,2.0,,business,ori_mmlu-business_ethics
4,74,Some of key differences between Islamic finan...,"['Interest, Certain, Assured, Both tangible an...",G,6.0,,business,ori_mmlu-business_ethics


In [5]:
# FUNCTION DECLARATIONS

# Model name placed in the "model" field of every request built by generate_task.
# Alternative kept for reference:
#MODEL = "gpt-4o-mini"
MODEL = "gpt-4.1-nano"

# Build the JSON description of one multiple-choice question
def generate_description(instr,op_a,op_b,op_c,op_d,op_e,op_f,op_g,op_h,op_i,op_j):
    """Serialize a question and its ten options as a JSON string.

    The JSON object has an "instruction" entry holding ``instr`` and an
    "options" entry that maps the letters "A" through "J" to ``op_a``
    through ``op_j`` in that order.
    """
    letters = "ABCDEFGHIJ"
    choices = (op_a, op_b, op_c, op_d, op_e, op_f, op_g, op_h, op_i, op_j)
    payload = {
        "instruction": instr,
        "options": dict(zip(letters, choices)),
    }
    return json.dumps(payload)

# Build one request line for the Batch API
def generate_task(index,prompt,desc,model=None):
    """Build the request dict for one chat-completion batch task.

    Args:
        index: value embedded in the task's ``custom_id`` as ``task-{index}``.
        prompt: text sent as the system message.
        desc: text sent as the user message.
        model: model name for this request; when omitted the module-level
            ``MODEL`` constant is used.

    Returns:
        A dict with the ``custom_id``, ``method``, ``url`` and ``body`` keys.
        The body requests temperature 0 and a ``json_object`` response format.
    """
    return {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # Same fields a direct Chat Completions call would take
            "model": MODEL if model is None else model,
            "temperature": 0,
            "response_format": {"type": "json_object"},
            "messages": [
                {"role": "system", "content": prompt},
                {"role": "user", "content": desc},
            ],
        },
    }

def extract_basic(new_line):
    """Return the parsed JSON content of the second message of a request line.

    ``new_line`` is a request dict shaped like the ones generate_task builds;
    the content of ``body.messages[1]`` is decoded with ``json.loads``.
    """
    user_message = new_line["body"]["messages"][1]
    return json.loads(user_message["content"])

def create_file_from_tasks(tasks,file_name):
    """Write ``tasks`` to ``file_name`` as JSON Lines, one serialized task per line.

    The file is opened in write mode, so any previous content is replaced.
    """
    with open(file_name, 'w') as out:
        out.writelines(json.dumps(task) + '\n' for task in tasks)

def create_batch(file_name):
    """Upload a task file and create a batch job that processes it.

    Args:
        file_name: path of the JSON Lines task file to upload.

    Returns:
        The batch job object returned by ``client.batches.create``.
    """
    # The context manager closes the upload handle even if the upload fails
    with open(file_name, "rb") as task_file:
        batch_file = client.files.create(
            file = task_file,
            purpose = "batch"
        )
    return client.batches.create(
        input_file_id = batch_file.id,
        endpoint = "/v1/chat/completions",
        completion_window = "24h"
    )

def clean_text(text):
    """Remove control characters and tabs from a string and trim its ends.

    Line feeds and carriage returns are not matched by the pattern, although
    ``strip`` still removes them at the ends. Values that are not strings are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    without_controls = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\t]", "", text)
    return without_controls.strip()

def del_ch(string,array):
    """Return ``str(string)`` with every substring listed in ``array`` removed.

    The substrings are removed one after another in the order given.
    """
    cleaned = str(string)
    for unwanted in array:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def df_array_divide(json_object,array_name):
    """Flatten one dataframe row into a dict with a separate entry per option.

    Every field except ``array_name`` is copied as a string with quotes,
    brackets, backslashes and line feeds removed. The ``array_name`` field is
    read as the text form of a list of quoted options; it is split into its
    items, which are stored under the letters "A" to "J". Letters without a
    matching item receive the placeholder "not the answer".

    Args:
        json_object: row exposing ``keys()`` and ``values``; the caller in this
            file passes rows produced by ``DataFrame.iterrows``.
        array_name: name of the field that holds the options.

    Returns:
        The parsed dict, or the unparsed JSON text (after printing it) when
        that text cannot be decoded.
    """
    option_let = ["A","B","C","D","E","F","G","H","I","J"]
    ch_del = ["'","\n","[","]","\\","\""]
    ch_del2 = ["\n","[","]","\\"]

    res = "{"
    array = ""
    for key, raw_value in zip(json_object.keys(), json_object.values):
        if key == array_name:
            array = raw_value
        else:
            value = del_ch(raw_value,ch_del)
            res += f'"{key}":"{value}",'

    # Normalize double-quoted items to single quotes so one separator splits them all
    array = del_ch(array,ch_del2).replace("\"","'").split("' '")
    array = [del_ch(item,ch_del) for item in array]

    for position, letter in enumerate(option_let):
        # Rows with fewer than ten options are padded with a placeholder
        option = array[position] if position < len(array) else "not the answer"
        res += f'"{letter}":"{option}",'

    # Drop the trailing comma, close the object and strip control characters
    res = clean_text(res[:-1] + "}")
    try:
        res = json.loads(res)
    except ValueError:
        # Best effort: show the text that could not be decoded and return it as is
        print(res)
    return res

def create_task_from_json(json_object,index,prompt,rename_func=None):
    """Build a batch task from a question given as a dict or a JSON string.

    Args:
        json_object: dict (or JSON text decoding to one) holding the question.
        index: task index forwarded to generate_task.
        prompt: system prompt forwarded to generate_task.
        rename_func: list of eleven key names, in order: the question text and
            options A to J. Defaults to 'instruction', 'option_a' ... 'option_j'.

    Returns:
        The task dict built by generate_task.

    Raises:
        KeyError, IndexError, TypeError: when the question text or one of the
            options A to D cannot be read; the offending object and index are
            printed before the error is re-raised. Options E to J are optional
            and fall back to "not the answer".
    """
    if isinstance(json_object, str):
        json_object = json.loads(json_object)
    if rename_func is None:
        rename_func = ['instruction',
                       'option_a','option_b','option_c','option_d','option_e',
                       'option_f','option_g','option_h','option_i','option_j']

    def optional_field(position):
        # Options beyond D may be absent; use the placeholder the prompts know about
        try:
            return json_object[rename_func[position]]
        except (KeyError, IndexError, TypeError):
            return "not the answer"

    try:
        mandatory = [json_object[rename_func[position]] for position in range(5)]
    except (KeyError, IndexError, TypeError):
        # Show what was received, then fail here instead of later on an unbound variable
        print(json_object)
        print(index)
        raise

    optional = [optional_field(position) for position in range(5, 11)]
    description = generate_description(*mandatory, *optional)
    return generate_task(index, prompt, description)

def create_task_array_from_dataframe(df,prompt,rename_func = None):
    """Build one batch task per dataframe row.

    Each row is flattened with df_array_divide (using the "options" column)
    and turned into a task whose index is the row's dataframe index.
    """
    return [
        create_task_from_json(df_array_divide(row,"options"),index,prompt,rename_func)
        for index, row in df.iterrows()
    ]

def extract_data_paraphrase(new_line):
    """Return the parsed JSON content of the first choice of a batch result line.

    The message content found at ``response.body.choices[0].message.content``
    is decoded with ``json.loads``.
    """
    first_choice = new_line["response"]["body"]["choices"][0]
    return json.loads(first_choice["message"]["content"])

def get_line_file(file_name,line,extract_func):
    """Read one line of a JSON Lines file and return its extracted content.

    Args:
        file_name: path of the file to read.
        line: zero-based number of the wanted line.
        extract_func: callable applied to the decoded line.

    Returns:
        ``extract_func`` applied to the JSON-decoded line.

    Raises:
        IndexError: if the file has no line with that number.
    """
    with open(file_name, 'r') as f:
        for line_number, theline in enumerate(f):
            if line_number == line:
                return extract_func(json.loads(theline))
    # Reached only when the loop never found the requested line
    raise IndexError(f"{file_name} has no line {line}")

def extract_none(line):
    """Identity extractor: return the decoded line unchanged."""
    return line

def empty_task(index,prompt):
    """Build a task whose question text and ten options are all empty strings."""
    blank_fields = [""] * 11
    blank_description = generate_description(*blank_fields)
    return generate_task(index, prompt, blank_description)

def create_task_array_from_filename(file_name,prompt,extract_func,rename_func,error_file):
    """Build one task per line of a batch result file.

    Each line is decoded, passed through ``extract_func`` and turned into a
    task with create_task_from_json. When a line cannot be converted, the line
    with the same number from the request file (``file_name`` without
    "_result") is appended to ``error_file`` and an empty task takes its place,
    so the returned list has one task per result line.

    NOTE(review): line i of the result file is assumed to belong to request i.
    The Batch API documentation says output order may differ from input order;
    confirm, or match lines on custom_id.

    Args:
        file_name: path of the batch result file.
        prompt: system prompt for the new tasks.
        extract_func: callable applied to each decoded result line.
        rename_func: key names forwarded to create_task_from_json.
        error_file: path of the file that collects the failed requests.

    Returns:
        The list of tasks, in result-file order.
    """
    # Read the file once instead of re-scanning it for every line
    with open(file_name, 'r') as f:
        result_lines = f.readlines()

    source_name = file_name.replace("_result","")
    source_lines = None  # request file content, loaded on the first failure only

    tasks = []
    error_lines = []
    for i, raw_line in enumerate(result_lines):
        try:
            line_new = extract_func(json.loads(raw_line))
            tasks.append(create_task_from_json(line_new,i,prompt,rename_func))
        except Exception:
            if source_lines is None:
                with open(source_name, 'r') as f:
                    source_lines = f.readlines()
            line_new = json.loads(source_lines[i])
            with open(error_file, 'a') as f:
                f.write(json.dumps(line_new) + '\n')
            error_lines.append(i)
            tasks.append(empty_task(i,prompt))
    print(f"number of errors = {len(error_lines)}")
    print(error_lines)

    return tasks



In [6]:
# PROMPTS

# PARAPHRASING PROMPT
# System prompt for the paraphrasing requests: asks the model to reword the
# question and its options and to reply with a JSON object whose keys are
# paraphrased_question and option_a ... option_j. Options equal to
# 'not the answer' are to be left as they are.
categorize_system_prompt_paraphrase ='''
Your goal is to paraphrase multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question and you will output a JSON object with the reworded question.

For each row paraphrase the question while maintaining the original meaning and paraphrase the options as well. Ensure the paraphrase is at least 500 characters long between options and question.

Do not paraphrase options that are 'not the answer'

The output JSON objects should be in the following format:

{paraphrased_question: string, option_a: string, option_b: string, option_c: string, option_d: string, option_e: string, option_f: string, option_g: string, option_h: string, option_i: string, option_j: string,}

Ensure that the question is presented differently but conveys the same idea. Keep the JSON format in the output with '{' and '}'.
'''

# ANSWER PROMPT
# System prompt for the answering requests: asks the model to reply with a
# JSON object holding a single "answer" key whose value is one of the option
# letters 'A' to 'J'.
categorize_system_prompt_answer = '''
Your goal is to evaluate multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question and you will output a JSON object with the evaluation of the question.

For each row evaluate the question.

The output JSON objects should be in the following format:

{answer: string, // The selected option key for the question, limited to 'A', 'B', 'C', 'D' , 'E', 'F', 'G', 'H' , 'I' or 'J'}

Keep the JSON format in the answer with '{' and '}'.
'''

In [7]:
# Folder that receives every generated file, configured through OUTPUT_FOLDER
output_folder = os.getenv("OUTPUT_FOLDER")

def out_file(file_name):
    """Return the path of ``file_name`` inside the configured output folder.

    Raises:
        ValueError: if OUTPUT_FOLDER was not set when this cell ran; without
            this check the path would silently start with the text "None".
    """
    if output_folder is None:
        raise ValueError("OUTPUT_FOLDER is not set in the environment / .env file")
    # Adds the separator only when the folder does not already end with one
    return os.path.join(output_folder, file_name)

In [8]:
# PARAPHRASING TASK
# Keys, in order, of the question text and options A-J in each flattened row
r_func = [
    "question",
    "A","B","C","D","E",
    "F","G","H","I","J",
]
tasks_array = [
    create_task_array_from_dataframe(df,categorize_system_prompt_paraphrase,r_func),
]
file_array = [
    out_file("batch_job_mmlu_nano_pro_paraphrase.jsonl"),
]

In [8]:
# ANSWERING TASK
# Keys of the flattened dataframe rows (question text, then options A-J)
r_func = [
    "question",
    "A","B","C","D","E",
    "F","G","H","I","J",
]
# Keys of the paraphrased questions found in the paraphrase results
rename_func = [
    'paraphrased_question',
    'option_a','option_b','option_c','option_d','option_e',
    'option_f','option_g','option_h','option_i','option_j',
]

# Paraphrase results, the requests that failed, and the results of their retry
file_name = out_file("batch_job_mmlu_nano_pro_paraphrase_result.jsonl")
error_file_name = out_file("batch_job_mmlu_nano_pro_paraphrase_error.jsonl")
error_res_file = error_file_name.replace(".json","_result.json")

In [9]:
# Build the answering tasks from the paraphrase results; unusable lines go to the error file
paraphrased_task = create_task_array_from_filename(
    file_name=file_name,
    prompt=categorize_system_prompt_answer,
    extract_func=extract_data_paraphrase,
    rename_func=rename_func,
    error_file=error_file_name,
)

{'paraphrased_question': 'In a binomial model that spans two periods, each lasting half a year, suppose you are analyzing the current valuation of a stock that does not pay dividends, which is priced at $70.00 today. The model parameters include an upward movement factor, u, equal to 1.181, indicating that if the stock price increases, it does so by a factor of 1.181 per period, reflecting a certain percentage gain. Conversely, the downward movement factor, d, is 0.890, meaning that if the stock price decreases, it is scaled down by this factor, representing a percentage loss per period. The risk-free interest rate, compounded continuously, is given as 5% annually. Given these parameters, what is the current fair value of an American put option with a strike price of $80.00 that expires in one year? The options available for this valuation are: A) $5.95, B) $8.40, C) $17.30, D) $12.50, E) $20.25, F) $25.00, G) $13.80, H) $22.15, I) $10.75, J) $15.50.'}
64
{'paraphrased_question': "A re

In [10]:
# Resubmit the paraphrase requests collected in the error file as a new batch
ba_jo_error = create_batch(error_file_name)

In [15]:
# Refresh the retry batch and show its current state
batch = client.batches.retrieve(ba_jo_error.id)
print(batch)
result_file_id = batch.output_file_id
print(batch.status)

Batch(id='batch_680a5b33bb54819087d490437795bdc1', completion_window='24h', created_at=1745509171, endpoint='/v1/chat/completions', input_file_id='file-N5AmTPUisMxUT5asy34Hfu', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1745509495, error_file_id=None, errors=None, expired_at=None, expires_at=1745595571, failed_at=None, finalizing_at=1745509489, in_progress_at=1745509172, metadata=None, output_file_id='file-W9DDyfraeZLN3LeBAkUTTf', request_counts=BatchRequestCounts(completed=80, failed=0, total=80))
completed


In [16]:
# Download the output of the retry batch into error_res_file
batch = client.batches.retrieve(ba_jo_error.id)
result_file_id = batch.output_file_id
if result_file_id is None:
    # Fail with the batch status instead of passing None to the download call
    raise RuntimeError(f"Batch {batch.id} has no output file (status: {batch.status})")

result = client.files.content(result_file_id).content

with open(error_res_file, 'wb') as file:
    file.write(result)

In [17]:
# Merge the retried paraphrases into paraphrased_task and list the requests that failed again
phr_error = []

# Index the retry results by custom_id so each request is paired with its own
# result whatever the order of the lines in the output file
retry_results = {}
with open(error_res_file,"r") as f:
    for raw_line in f:
        try:
            record = json.loads(raw_line)
            retry_results[record["custom_id"]] = record
        except (ValueError, KeyError, TypeError):
            # An unreadable result line counts as a missing result
            continue

with open(error_file_name,"r") as f:
    failed_requests = [json.loads(raw_line) for raw_line in f]
lines_in_error_file = len(failed_requests)

for request in failed_requests:
    try:
        error_line = retry_results[request["custom_id"]]
        # custom_id has the form "task-N"; N is the position in paraphrased_task
        task_n = int(error_line["custom_id"].replace("task-",""))
        new_task = create_task_from_json(extract_data_paraphrase(error_line),task_n,categorize_system_prompt_answer,rename_func)
        paraphrased_task[task_n] = new_task
    except Exception:
        # Still unusable after the retry: keep the request for inspection
        phr_error.append(request)

for request in phr_error:
    print(request)

{'paraphrased_question': 'Given the density of water vapor measured at a pressure of 327.6 atmospheres and a temperature of 776.4 Kelvin, which is 133.2 kilograms per cubic meter, and considering the critical temperature and pressure of water as 647.4 Kelvin and 218.3 atmospheres respectively, along with the van der Waals constants a = 5.464 dm^6 atm mol^-2, b = 0.03049 dm^3 mol^-1, and molar mass M = 18.02 g mol^-1, determine the value of the compression factor derived from the virial expansion of the van der Waals equation. The options provided are: A: 0.9283, B: 0.6889, C: 0.5594, D: 0.8457, E: 0.7671, F: 0.7846, G: 0.6523, H: 0.8312, I: 0.7158, J: 0.6017.'}
4149
{'custom_id': 'task-1013', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'gpt-4.1-nano', 'temperature': 0, 'response_format': {'type': 'json_object'}, 'messages': [{'role': 'system', 'content': "\nYour goal is to paraphrase multiple choice questions from JSON objects. You will be provided with a JSON ob

In [18]:
# Answering tasks for the original questions and for their paraphrases,
# each paired by position with the file it will be written to
answer_original_tasks = create_task_array_from_dataframe(df,categorize_system_prompt_answer,r_func)
tasks_array = [answer_original_tasks, paraphrased_task]
file_array = [
    out_file("batch_job_mmlu_nano_pro_answer_original.jsonl"),
    out_file("batch_job_mmlu_nano_pro_answer_paraphrase.jsonl"),
]

In [19]:
# GENERATE TASK FILES
# Write each task list to the file that shares its position in file_array
for i, task_list in enumerate(tasks_array):
    create_file_from_tasks(task_list,file_array[i])

In [20]:
# GENERATE BATCH
# Submit one batch job per task file and keep the job objects for later polling
batch_jobs = []
for i in range(len(tasks_array)):
    ba_jo = create_batch(file_array[i])
    batch_jobs += [ba_jo]

In [25]:
# COMPLETION CHECK
# Refresh every submitted batch and print its details followed by its status
for i, job in enumerate(batch_jobs):
    batch = client.batches.retrieve(job.id)
    print(batch)
    result_file_id = batch.output_file_id
    print(batch.status)

Batch(id='batch_680a5e3a15e081909e0ecfd7da658d28', completion_window='24h', created_at=1745509946, endpoint='/v1/chat/completions', input_file_id='file-AxJKCDKYcucjhtqzCwkWuv', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1745513768, error_file_id=None, errors=None, expired_at=None, expires_at=1745596346, failed_at=None, finalizing_at=1745512312, in_progress_at=1745509951, metadata=None, output_file_id='file-G25ET4uv5BQJLq9cPiDfaz', request_counts=BatchRequestCounts(completed=12102, failed=0, total=12102))
completed
Batch(id='batch_680a5e70697081909e913dc0e7bd16b5', completion_window='24h', created_at=1745510000, endpoint='/v1/chat/completions', input_file_id='file-4Cox46P2xW6jtykd66QjRg', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1745513082, error_file_id=None, errors=None, expired_at=None, expires_at=1745596400, failed_at=None, finalizing_at=1745511925, in_progress_at=1745510006, metadata=None, o

In [26]:
# OUTPUT FILES GENERATOR
# Download the output of every batch next to its task file, with a "_result" suffix
for i in range(0,len(batch_jobs)):
    batch = client.batches.retrieve(batch_jobs[i].id)
    result_file_id = batch.output_file_id
    if result_file_id is None:
        # Fail with the batch status instead of passing None to the download call
        raise RuntimeError(f"Batch {batch.id} has no output file (status: {batch.status})")

    result = client.files.content(result_file_id).content

    result_file_name = file_array[i].replace(".json","_result.json")

    with open(result_file_name, 'wb') as file:
        file.write(result)

In [31]:
# Cleaning function
def clean():
    """Merge questions, paraphrases and model answers into one table plus accuracy stats.

    Row i combines request i of the paraphrase task file (the original
    question), request i of the paraphrase-answering task file (the
    paraphrased question), the expected answer from ``df.iloc[i]`` and the
    two model answers. Each model answer is looked up by the custom_id of
    its request, so the pairing does not depend on the order of the lines
    in the result files.

    Returns:
        A tuple ``(rows_frame, stats_frame)``. ``rows_frame`` has 25 unnamed
        columns: question, options A-J, expected answer, model answer, then
        paraphrased question, paraphrased options A-J and the model answer
        for the paraphrase. ``stats_frame`` has one row with the fraction of
        matching answers for the original and the paraphrased questions.

    Raises:
        KeyError: if a request has no result with the same custom_id, or a
            result lacks the "answer" key.
    """
    def load_jsonl(path):
        # Decode every non-blank line of a JSON Lines file, reading the file once
        with open(path, 'r') as f:
            return [json.loads(raw_line) for raw_line in f if raw_line.strip()]

    def by_custom_id(records):
        # Index result records by the custom_id of the request they answer
        return {record["custom_id"]: record for record in records}

    requests_o = load_jsonl(out_file("batch_job_mmlu_nano_pro_paraphrase.jsonl"))
    requests_ph = load_jsonl(out_file("batch_job_mmlu_nano_pro_answer_paraphrase.jsonl"))
    answers_o = by_custom_id(load_jsonl(out_file("batch_job_mmlu_nano_pro_answer_original_result.jsonl")))
    answers_ph = by_custom_id(load_jsonl(out_file("batch_job_mmlu_nano_pro_answer_paraphrase_result.jsonl")))

    option_let = ["A","B","C","D","E","F","G","H","I","J"]
    lines = len(requests_o)
    rows = []
    stats = [0,0]

    for i in range(0,lines):
        request_o = requests_o[i]
        request_ph = requests_ph[i]

        dt_o = extract_basic(request_o)
        dt_ph = extract_basic(request_ph)
        dt_a = extract_data_paraphrase(answers_o[request_o["custom_id"]])
        dt_ph_a = extract_data_paraphrase(answers_ph[request_ph["custom_id"]])

        row = df.iloc[i]
        rows.append(
            [dt_o['instruction']]
            + [dt_o['options'][letter] for letter in option_let]
            + [row['answer'], clean_text(dt_a["answer"]), clean_text(dt_ph['instruction'])]
            + [clean_text(dt_ph['options'][letter]) for letter in option_let]
            + [clean_text(dt_ph_a["answer"])]
        )

        if row['answer'] == dt_a["answer"]:
            stats[0] += 1
        if row['answer'] == dt_ph_a["answer"]:
            stats[1] += 1

    # With no rows there is nothing to average; report 0.0 instead of dividing by zero
    stats = [count/lines if lines else 0.0 for count in stats]

    # Column names are kept exactly as exported so far
    stats = pd.DataFrame([{'acuracy': stats[0],'acuracy_ph': stats[1],}])
    return pd.DataFrame(rows),stats


clean_dtset,stat = clean()

In [32]:
# Export the merged table and the accuracy summary to one workbook, one sheet each
file_name = out_file('FinalResults_pro_nano.xlsx')
sheets = (
    (clean_dtset, {"sheet_name": 'Results', "index": False}),
    (stat, {"sheet_name": 'Stats', "engine": 'xlsxwriter', "index": False}),
)
with pd.ExcelWriter(file_name) as writer:
    for frame, options in sheets:
        frame.to_excel(writer, **options)