In [2]:
import pandas as pd
import json
import os
from openai import OpenAI #estamos la clase concreta OpenAI del módulo openai
from dotenv import load_dotenv #importamos una función concreta del módulo
load_dotenv("template.env")

True

In [3]:
# Acceder a la clave de API de OpenAI
api_key = os.getenv("OPENAI_API_KEY")

# Asegurarte de que la clave de API se haya cargado correctamente
if api_key is None:
    raise ValueError("La clave de API no está configurada en el archivo .env")
    
client = OpenAI() #creando un objeto de la clase

In [4]:
dataset_folder = os.getenv("DATASET_FOLDER")
dataset_path = str(dataset_folder) + "MMLU_completo.xlsx"

df = pd.read_excel(dataset_path)
#df = df.sample(20)
df.head()

Unnamed: 0,instruction,option_a,option_b,option_c,option_d,answer,id
0,For which of these two scenarios does the main...,"Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",D,moral_scenarios_test
1,For which of these two scenarios does the main...,"Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",A,moral_scenarios_test
2,For which of these two scenarios does the main...,"Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",D,moral_scenarios_test
3,For which of these two scenarios does the main...,"Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",B,moral_scenarios_test
4,For which of these two scenarios does the main...,"Wrong, Wrong","Wrong, Not wrong","Not wrong, Wrong","Not wrong, Not wrong",D,moral_scenarios_test


In [5]:
#FUNCTION DECLARATION

#MODEL = "gpt-4o-mini"
#MODEL = "gpt-4o"
MODEL = "gpt-4.1-nano"

#Generate description of multiple question
def generate_description(instr,op_a,op_b,op_c,op_d):
	description = json.dumps({
        "instruction": instr,
        "options": {
            "A": op_a,
            "B": op_b,
            "C": op_c,
            "D": op_d
        }
    })
	return description

#Generate one task
def generate_task(index,prompt,desc):
	task = {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # Esto es lo que tendrías en tu llamada a la API de Chat Completions
            "model": MODEL,
            "temperature": 0,
            "response_format": { 
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": prompt
                },
                {
                    "role": "user",
                    "content": desc
                }
            ],
        }
    }
	return task

def extract_data_paraphrase(new_line):
	res = new_line["response"]["body"]["choices"][0]["message"]["content"]
	res = json.loads(res)
	return res

def get_keys(json_object):
	res = []
	for i in json_object:
		res.append(i)
	return res

def create_task_from_json(json_object,index,prompt,rename_func=None):
	if (rename_func == None):
		rename_func = ['instruction','option_a','option_b','option_c','option_d']
	try:
		instruction = json_object[rename_func[0]]
		option_a = json_object[rename_func[1]]
		option_b = json_object[rename_func[2]]
		option_c = json_object[rename_func[3]]
		option_d = json_object[rename_func[4]]
	except:
		#print(json_object)
		#print(index)
		pass

	description = generate_description(
		instruction, option_a, option_b, option_c, option_d,
	)
    
	task = generate_task(
		index, prompt,description,
	)
	return task

def create_task_array_from_dataframe(df,prompt):
	tasks = []
	for index, row in df.iterrows():
		task = create_task_from_json(row,index,prompt)
		tasks.append(task)
	return tasks


def create_file_from_tasks(tasks,file_name):
	with open(file_name, 'w') as file:
		for obj in tasks:
			file.write(json.dumps(obj) + '\n')


def create_batch(file_name):
	batch_file = client.files.create(
		file = open(file_name, "rb"),
		purpose = "batch"
	)
	batch_job = client.batches.create(
		input_file_id = batch_file.id,
		endpoint = "/v1/chat/completions",
		completion_window = "24h"
	)
	return batch_job

def get_line_file(file_name,line,extract_func):
	with open(file_name, 'r') as f:
		for line_number, theline in enumerate(f):
			if line_number == line:
				res = theline
				break
	res = json.loads(res)
	return extract_func(res)


def extract_none(line):return line

def empty_task(index,prompt):
	return generate_task(
		index,prompt,generate_description("","","","",""),
	)

def create_task_array_from_filename(file_name,prompt,extract_func,rename_func,error_file):
	tasks = []
	error_lines = []
	with open(file_name, 'r') as f:
		lines = len(f.readlines())

	for i in range(0,lines):
		try:
			line_new = get_line_file(file_name,i,extract_func)
			task = create_task_from_json(line_new,i,prompt,rename_func)
			tasks.append(task)
		except:
			line_new = get_line_file(file_name.replace("_result",""),i,extract_none)
			with open(error_file, 'a') as f:
				f.write(json.dumps(line_new) + '\n')
			error_lines.append(i)
			tasks.append(empty_task(i,prompt))
			#print(f"error in line[{i}]:\n{line_new}")
	print(f"number of errors = {len(error_lines)}")
	print(error_lines)
	return tasks


In [6]:
#PROMPTS

#PARAPHRASING PROMPT
categorize_system_prompt_paraphrase ='''
Your goal is to paraphrase multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question and you will output a JSON object with the reworded question.

For each row paraphrase the question while maintaining the original meaning and paraphrase the options as well. Ensure the paraphrase is at least 500 characters long between options and question.

The output JSON objects should be in the following format:

{paraphrased_question: string, option_a: string, option_b: string, option_c: string, option_d: string,}

Ensure that the question is presented differently but conveys the same idea. Keep the JSON format in the output with '{' and '}'.
'''

#ANSWER PROMPT
categorize_system_prompt_answer = '''
Your goal is to evaluate multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question and you will output a JSON object with the evaluation of the question.

For each row evaluate the question.

The output JSON objects should be in the following format:

{answer: string, // The selected option key for the question, limited to 'A', 'B', 'C', or 'D'}

Keep the JSON format in the answer with '{' and '}'.
'''

In [7]:
#Set output folder
output_folder = os.getenv("OUTPUT_FOLDER")
def out_file(file_name): return (str(output_folder) + file_name)

In [None]:
#PARAPHRASING TASK
tasks_array = [create_task_array_from_dataframe(df,categorize_system_prompt_paraphrase)]
file_array = [out_file("batch_job_mmlu_nano_paraphrase.jsonl")]

In [9]:
file_name = out_file("batch_job_mmlu_nano_paraphrase_result.jsonl")
rename_func = ['paraphrased_question','option_a','option_b','option_c','option_d']

error_file_name = out_file("batch_job_mmlu_answer_paraphrase_error.jsonl")

error_res_file = error_file_name.replace(".json","_result.json")

In [8]:
paraphrased_task = create_task_array_from_filename(file_name,categorize_system_prompt_answer,extract_data_paraphrase,rename_func,error_file_name)

{'paraphrased_question': "Create a comprehensive truth table to analyze the logical structure of the given argument. After constructing the table, assess whether the argument holds as valid or invalid based on the truth values. If the argument is found to be invalid, select the option that provides a specific counterexample demonstrating its invalidity. Keep in mind that multiple counterexamples might exist, so consider different combinations of truth values for the involved propositions.\n\nThe argument involves the following premises: 'If P then Q' and 'Q and R', leading to the conclusion '~P and R'. Using the truth table, determine the validity of this argument and identify any counterexamples if it is invalid.\n\nOptions:\nA. The argument is valid.\nB. The argument is invalid; a counterexample occurs when P, R, and Q are all true.\nC. The argument is invalid; a counterexample occurs when P and Q are true, but R is false.\nD. The argument is invalid; a counterexample occurs when R a

In [None]:
ba_jo_error= create_batch(error_file_name)

In [None]:
batch = ba_jo_error
batch = client.batches.retrieve(batch.id)
print(batch)
result_file_id = batch.output_file_id
print(batch.status)

In [None]:
batch = ba_jo_error
batch = client.batches.retrieve(batch.id)
result_file_id = batch.output_file_id

result = client.files.content(result_file_id).content

with open(error_res_file, 'wb') as file:
	file.write(result)

In [17]:
phr_error = []

with open(error_file_name,"r") as f:
	lines_in_error_file = len(f.readlines())

for i in range(0,lines_in_error_file):
	try:
		error_line = get_line_file(error_res_file,i,extract_none)
		task_n = int(error_line["custom_id"].replace("task-",""))
		new_task = create_task_from_json(extract_data_paraphrase(error_line),task_n,categorize_system_prompt_answer,rename_func)
		paraphrased_task[task_n] = new_task
	except:
		error_line = get_line_file(error_file_name,i,extract_none)
		phr_error.append(error_line)

for i in range(0,len(phr_error)):
	print(phr_error[i])

{'custom_id': 'task-3973', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'gpt-4.1-nano', 'temperature': 0, 'response_format': {'type': 'json_object'}, 'messages': [{'role': 'system', 'content': "\nYour goal is to paraphrase multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question and you will output a JSON object with the reworded question.\n\nFor each row paraphrase the question while maintaining the original meaning and paraphrase the options as well. Ensure the paraphrase is at least 500 characters long between options and question.\n\nThe output JSON objects should be in the following format:\n\n{paraphrased_question: string, option_a: string, option_b: string, option_c: string, option_d: string,}\n\nEnsure that the question is presented differently but conveys the same idea. Keep the JSON format in the output with '{' and '}'.\n"}, {'role': 'user', 'content': '{"instruction": "Find the degree fo

In [32]:
paraphrased_task
print("")




In [None]:
replace_file_name = out_file("batch_job_mmlu_nano_paraphrase_result.jsonl")
create_file_from_tasks(paraphrased_task,replace_file_name)

In [19]:
#ANSWERING TASK
tasks_array = [
	create_task_array_from_dataframe(df,categorize_system_prompt_answer),
	paraphrased_task
	]
file_array = [out_file("batch_job_mmlu_nano_answer_original.jsonl"),out_file("batch_job_mmlu_nano_answer_paraphrase.jsonl")]

In [20]:
#GENERATE TASK FILES
for i in range(0,len(tasks_array)):
	create_file_from_tasks(tasks_array[i],file_array[i])

In [21]:
#GENERATE BATCH
batch_jobs = []
for i in range(0,len(tasks_array)):
	ba_jo= create_batch(file_array[i])
	batch_jobs.append(ba_jo)

In [27]:
#COMPLETION_CHECK
for i in range(0,len(batch_jobs)):
	batch = batch_jobs[i]
	batch = client.batches.retrieve(batch.id)
	print(batch)
	result_file_id = batch.output_file_id
	print(batch.status)

Batch(id='batch_6803b38bb1f881909e935ac89af798ce', completion_window='24h', created_at=1745073035, endpoint='/v1/chat/completions', input_file_id='file-1muzSfjBcraVB5A54UEUoc', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1745077458, error_file_id=None, errors=None, expired_at=None, expires_at=1745159435, failed_at=None, finalizing_at=1745075901, in_progress_at=1745073039, metadata=None, output_file_id='file-7oZou4ZxZqimLNRBgXsJuY', request_counts=BatchRequestCounts(completed=14042, failed=0, total=14042))
completed
Batch(id='batch_6803b3ed28848190bdbdf70f28bf3fa8', completion_window='24h', created_at=1745073133, endpoint='/v1/chat/completions', input_file_id='file-Der2LfDvDjckBvA9JfrLAM', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1745076738, error_file_id=None, errors=None, expired_at=None, expires_at=1745159533, failed_at=None, finalizing_at=1745075576, in_progress_at=1745073137, metadata=None, o

In [28]:
#OUTPUT FILES GENERATOR
for i in range(0,len(batch_jobs)):
	batch = batch_jobs[i]
	batch = client.batches.retrieve(batch.id)
	result_file_id = batch.output_file_id

	result = client.files.content(result_file_id).content

	result_file_name = file_array[i].replace(".json","_result.json")

	with open(result_file_name, 'wb') as file:
		file.write(result)

In [None]:
import re

def clean_text(text):
    if isinstance(text, str):
        # Eliminar caracteres de control ASCII (excepto saltos de línea/tabulación)
        text = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", "", text)
        return text.strip()
    return text

#Cleaning function
def clean():
	with open(out_file("batch_job_mmlu_nano_paraphrase.jsonl"), 'r') as f:
		lines = len(f.readlines())
	
	rows = []
	stats = [0,0]

	f_ph = out_file("batch_job_mmlu_nano_paraphrase_result.jsonl")
	f_a = out_file("batch_job_mmlu_nano_answer_original_result.jsonl")
	f_ph_a = out_file("batch_job_mmlu_nano_answer_paraphrase_result.jsonl")

	for i in range(0,lines):
		try:
			dt_ph = get_line_file(f_ph,i,extract_data_paraphrase)
			dt_a = get_line_file(f_a,i,extract_data_paraphrase)
			dt_ph_a = get_line_file(f_ph_a,i,extract_data_paraphrase)

			row = df.iloc[i]
			rows.append([
     	  	    row['instruction'],
     	    	row['option_a'],
     	    	row['option_b'],
     	    	row['option_c'],
    	     	row['option_d'],
    	     	row['answer'],
				clean_text(dt_a["answer"]),
				clean_text(dt_ph["paraphrased_question"]),
				clean_text(dt_ph["option_a"]),
				clean_text(dt_ph["option_b"]),
				clean_text(dt_ph["option_c"]),
				clean_text(dt_ph["option_d"]),
				clean_text(dt_ph_a["answer"]),
			])

			if(row['answer'] == dt_a["answer"]): stats[0]+=1
			if(row['answer'] == dt_ph_a["answer"]): stats[1]+=1
		except:
			print(i)
	
	
	stats[0] = stats[0]/lines
	stats[1] = stats[1]/lines
	
	stats = pd.DataFrame([{'acuracy': stats[0],'acuracy_ph': stats[1],}])
	return pd.DataFrame(rows),stats

file_name = out_file('FinalResults_gpt41nano.xlsx')
clean_dtset,stat = clean()


with pd.ExcelWriter(file_name) as writer:
	clean_dtset.to_excel(writer, sheet_name='Results')
	stat.to_excel(writer, sheet_name='Stats',engine='xlsxwriter')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

new_row = {
       	    'question': row['instruction'],
         	'A': row['option_a'],
         	'B': row['option_b'],
         	'C': row['option_c'],
         	'D': row['option_d'],
         	'correct_answer': row['answer'],
			'llm_answer': dt_a["answer"],
			'question_prphr' : dt_ph["paraphrased_question"],
			'A_prphr' : dt_ph["option_a"],
			'B_prphr' : dt_ph["option_b"],
			'C_prphr' : dt_ph["option_c"],
			'D_prphr' : dt_ph["option_d"],
			'llm_answer_prphr': dt_ph_a["answer"],
        }