In [None]:
import openpyxl
import pandas as pd
import json
import os
import time
import re
import openai
from concurrent.futures import ThreadPoolExecutor

In [None]:
from openai import OpenAI #estamos la clase concreta OpenAI del módulo openai
from dotenv import load_dotenv #importamos una función concreta del módulo
import os

# Load environment variables (OPENAI_API_KEY among them) from the local env file.
load_dotenv("template.env")

# Read the OpenAI API key from the process environment.
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast when the key is missing; an empty value is treated the same as an
# absent one so the problem surfaces here rather than at the first request.
if not api_key:
    raise ValueError("La clave de API no está configurada en el archivo .env")

# Hand the validated key to the client explicitly instead of relying on the
# client looking it up in the environment on its own.
client = OpenAI(api_key=api_key)

In [None]:
dataset_path = "MMLU_completo.xlsx"

# Number of rows to sample for a quick run; set to None to process the whole dataset.
N_SAMPLES = 5
# Fixed seed so that re-running the notebook selects the same rows.
SAMPLE_SEED = 42

df = pd.read_excel(dataset_path)
if N_SAMPLES is not None:
    # Cap at the dataset size: sample() raises when asked for more rows than exist.
    df = df.sample(min(N_SAMPLES, len(df)), random_state=SAMPLE_SEED)
df.head()

In [None]:
def generate_description(instr,op_a,op_b,op_c,op_d):
    """Serialize one multiple-choice question as a JSON string.

    Args:
        instr: Question text, stored under the "instruction" key.
        op_a, op_b, op_c, op_d: Answer options, stored under "options"
            with the keys "A", "B", "C" and "D" respectively.

    Returns:
        The JSON text produced by json.dumps for that structure.
    """
    labelled_options = dict(zip(("A", "B", "C", "D"), (op_a, op_b, op_c, op_d)))
    payload = {"instruction": instr, "options": labelled_options}
    return json.dumps(payload)

def generate_task(index,prompt,desc):
    """Build one request entry for a batch input file.

    Args:
        index: Identifier embedded in the entry's "custom_id" as "task-<index>".
        prompt: Text used as the system message content.
        desc: Text used as the user message content.

    Returns:
        A dict with the keys "custom_id", "method", "url" and "body".
    """
    system_message = {"role": "system", "content": prompt}
    user_message = {"role": "user", "content": desc}

    # The body is what would otherwise go into a direct Chat Completions call.
    request_body = {
        "model": "gpt-4o-mini",
        "temperature": 0,
        "response_format": {"type": "json_object"},
        "messages": [system_message, user_message],
    }

    return {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

In [None]:
def jsonfile_to_dataframe(jsonl_file):
    """Load paraphrased questions from a batch result file into a DataFrame.

    Each non-empty line must be a JSON object whose
    ["response"]["body"]["choices"][0]["message"]["content"] is itself a JSON
    string with the keys "paraphrased_question", "option_a", "option_b",
    "option_c" and "option_d".

    Args:
        jsonl_file: Path of the JSON Lines file to read.

    Returns:
        A DataFrame with the columns "instruction", "option_a", "option_b",
        "option_c" and "option_d", one row per non-empty line, in file order.
        An empty file yields an empty DataFrame with those columns.

    Raises:
        json.JSONDecodeError: If a line or its embedded content is not valid JSON.
        KeyError: If an expected key is missing.
    """
    columns = ["instruction", "option_a", "option_b", "option_c", "option_d"]
    records = []

    # Read as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            envelope = json.loads(line)
            content = envelope["response"]["body"]["choices"][0]["message"]["content"]
            # The message content is a JSON document serialized as a string.
            paraphrase = json.loads(content)

            # NOTE(review): "custom_id" is not carried over, so rows can only be
            # matched to the input by position — confirm the result file keeps
            # the input order before relying on that.
            records.append({
                "instruction": paraphrase["paraphrased_question"],
                "option_a": paraphrase["option_a"],
                "option_b": paraphrase["option_b"],
                "option_c": paraphrase["option_c"],
                "option_d": paraphrase["option_d"],
            })

    # Passing the columns explicitly keeps the schema stable for an empty file.
    return pd.DataFrame(records, columns=columns)

In [None]:
def create_task_array(df,prompt):
    """Turn every row of a question DataFrame into a batch task.

    Args:
        df: DataFrame providing the columns 'instruction', 'option_a',
            'option_b', 'option_c' and 'option_d'.
        prompt: System prompt forwarded to generate_task for every row.

    Returns:
        A list with one task per row, in row order. Each task is built by
        generate_task from the row's index label, the prompt, and the
        description generate_description produces for the row.
    """
    option_columns = ('option_a', 'option_b', 'option_c', 'option_d')
    tasks = []

    for index, row in df.iterrows():
        # Pull the question and its four options out of the current row.
        question = row['instruction']
        choices = [row[column] for column in option_columns]

        description = generate_description(question, *choices)
        tasks.append(generate_task(index, prompt, description))

    return tasks

In [None]:
# System prompt for the paraphrasing batch. It asks the model to answer with a
# JSON object holding the keys paraphrased_question, option_a, option_b,
# option_c and option_d — the keys jsonfile_to_dataframe reads from each result.
categorize_system_prompt_paraphrase ='''
Your goal is to paraphrase multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question and you will output a JSON object with the reworded question.

For each row paraphrase the question while maintaining the original meaning and paraphrase the options as well. Ensure the paraphrase is at least 500 characters long between options and question.

The output JSON objects should be in the following format:

{paraphrased_question: string, option_a: string, option_b: string, option_c: string, option_d: string,}

Ensure that the question is presented differently but conveys the same idea. Keep the JSON format in the output with '{' and '}'.
'''

In [None]:
# System prompt for answering (rather than paraphrasing) a multiple-choice
# question: it asks for a JSON object with a single "answer" key limited to
# 'A', 'B', 'C' or 'D'. Not referenced by any of the cells visible here.
categorize_system_prompt_answer = '''
Your goal is to evaluate multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question and you will output a JSON object with the evaluation of the question.

For each row evaluate the question.

The output JSON objects should be in the following format:

{answer: string, // The selected option key for the question, limited to 'A', 'B', 'C', or 'D'}

Keep the JSON format in the answer with '{' and '}'.
'''

In [None]:
def create_file_from_tasks(tasks,file_name):
    """Write tasks to a file in JSON Lines form.

    Args:
        tasks: Iterable of JSON-serializable objects.
        file_name: Path of the file to create or overwrite.

    Each task is serialized with json.dumps and written on its own line,
    followed by a newline character.
    """
    with open(file_name, 'w') as out:
        out.writelines(json.dumps(task) + '\n' for task in tasks)
	

In [None]:
def create_batch(file_name):
    """Upload a batch input file and start a batch job for it.

    Args:
        file_name: Path of the file to upload with purpose "batch".

    Returns:
        The object returned by client.batches.create for the uploaded file.
    """
    # Open the file in a context manager so the handle is closed as soon as
    # the upload call returns (or fails) instead of being leaked.
    with open(file_name, "rb") as input_file:
        batch_file = client.files.create(
            file=input_file,
            purpose="batch"
        )

    # The job refers to the uploaded file by id; the local handle is no longer needed.
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )
    return batch_job

In [None]:
# One task list per batch, paired by position with the file it is written to.
tasks_array = [create_task_array(df,categorize_system_prompt_paraphrase)]
file_array = ["batch_job_mmlu_paraphrase.jsonl"]

# Write each task list to its file, submit the file, and keep the returned job.
batch_jobs = []
for batch_tasks, batch_file_name in zip(tasks_array, file_array):
    create_file_from_tasks(batch_tasks, batch_file_name)
    batch_jobs.append(create_batch(batch_file_name))

In [None]:
# Fetch the current state of every submitted job and show it with its status.
for submitted_job in batch_jobs:
    batch = client.batches.retrieve(submitted_job.id)
    print(batch)
    result_file_id = batch.output_file_id
    print(batch.status)

In [None]:
# Download the output of every submitted job next to its input file.
for submitted_job, input_file_name in zip(batch_jobs, file_array):
    # Re-fetch the job so its current status and output file id are used.
    batch = client.batches.retrieve(submitted_job.id)
    result_file_id = batch.output_file_id

    # Without an output file id there is nothing to download; report the job's
    # status and move on instead of passing an empty id to the files API.
    if not result_file_id:
        print(f"Batch {batch.id} has no output file yet (status: {batch.status}); skipping download.")
        continue

    result = client.files.content(result_file_id).content

    # "name.jsonl" becomes "name_result.jsonl": ".json" is replaced and the
    # trailing "l" of the extension is kept.
    result_file_name = input_file_name.replace(".json","_result.json")

    with open(result_file_name, 'wb') as file:
        file.write(result)