In [None]:
import openpyxl
import pandas as pd
import json
#import requests
import os
import time
import re
import openai
from concurrent.futures import ThreadPoolExecutor

In [None]:
from openai import OpenAI  # import the concrete OpenAI client class from the openai module
from dotenv import load_dotenv  # import a single function from the dotenv module
import os

# Load environment variables (including the OpenAI key) from the template.env file.
load_dotenv("template.env")

# Read the OpenAI API key from the environment.
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast when the key is missing OR blank: an unfilled `OPENAI_API_KEY=` entry
# yields "" rather than None and would slip past an `is None` check.
if not api_key:
    raise ValueError("La clave de API no está configurada en el archivo .env")

# Create the API client, passing the validated key explicitly.
client = OpenAI(api_key=api_key)

In [None]:
dataset_path = "MMLU_completo.xlsx"

# Number of questions drawn for a trial run; set to None to process the whole dataset.
SAMPLE_SIZE = 5
# Fixed seed so that Restart & Run All selects the same rows every time.
SAMPLE_SEED = 42

df = pd.read_excel(dataset_path)
if SAMPLE_SIZE is not None:
    # Cap n at the row count: sampling without replacement raises when n > len(df).
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
df.head()

In [None]:
# System prompt sent with every batch request. It tells the model to (1) answer the
# original multiple-choice question, (2) paraphrase the question, and (3) answer the
# paraphrase independently, reporting each answer as one of 'A', 'B', 'C' or 'D'.
# NOTE(review): the prompt asks for two JSON objects, while each request body sets
# response_format to {"type": "json_object"} — confirm the shape of the model output
# matches what the result-parsing code expects.
categorize_system_prompt = '''
Your goal is to evaluate and paraphrase multiple choice questions from JSON objects. You will be provided with a JSON object containing a multiple choice question, and you will output two JSON objects: one with the evaluation of the original question and one with the paraphrased question and its evaluation.

For each row, follow these steps:
1. Evaluate the original question without paraphrasing.
2. Paraphrase the question while maintaining the original meaning, ensuring the paraphrase is at least 500 characters long.
3. Forget what you have done so far and exclusively answer the paraphrased question. The answer could be different from the original question, just answer what you think it is.

The JSON objects should be in the following format:

Original Evaluation:
{
    answer: string // The selected option key for the original question, limited to 'A', 'B', 'C', or 'D'
}

Paraphrased Evaluation:
{
    paraphrased_question: string // The paraphrased question
    answer: string // The selected option key for the paraphrased question, limited to 'A', 'B', 'C', or 'D'
}

Ensure that the question is presented differently but conveys the same idea. Keep the JSON format in the answer with '{' and '}'.
'''

In [None]:
# Build one Batch API request (a JSON-serialisable dict) per row of the DataFrame.
tasks = []

# Answer key -> DataFrame column holding that option's text.
option_columns = {
    "A": 'option_a',
    "B": 'option_b',
    "C": 'option_c',
    "D": 'option_d',
}

for index, row in df.iterrows():
    # The user message is the question plus its four options, serialised as JSON.
    question_text = row['instruction']
    description = json.dumps({
        "instruction": question_text,
        "options": {key: row[column] for key, column in option_columns.items()},
    })

    messages = [
        {"role": "system", "content": categorize_system_prompt},
        {"role": "user", "content": description},
    ]

    # "body" carries the same parameters a direct Chat Completions call would take;
    # custom_id is derived from the row's index label.
    tasks.append({
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "temperature": 0,
            "response_format": {"type": "json_object"},
            "messages": messages,
        },
    })

In [None]:
# Write the tasks to a JSON Lines file: one serialised request per line.

file_name = "batch_tasks_mmlu.jsonl"

with open(file_name, 'w') as jsonl_out:
    jsonl_out.writelines(json.dumps(entry) + '\n' for entry in tasks)

In [None]:
# Upload the JSONL request file with purpose "batch".
# The context manager closes the local file handle once the upload call returns,
# instead of leaving it open for the lifetime of the kernel.
with open(file_name, "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch"
    )

In [None]:
# Print the object returned by the file upload.
print(batch_file)

In [None]:
# Create a batch job over the uploaded file, targeting the chat completions
# endpoint with a 24h completion window.
batch_job = client.batches.create(
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_file.id,
)

In [None]:
# Re-fetch the batch by its id, overwrite batch_job with the returned object,
# and print it.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

In [None]:
# TO CANCEL A BATCH (uncomment to use; requires `import requests`)
# # ID of the batch you want to cancel
# batch_id = batch_job.id

# # URL used to cancel the batch
# url = f"https://api.openai.com/v1/batches/{batch_id}/cancel"

# # Request headers
# headers = {
#     "Authorization": f"Bearer {api_key}",
#     "Content-Type": "application/json",
# }

# # Send the POST request that cancels the batch
# response = requests.post(url, headers=headers)

# # Handle the response
# if response.status_code == 200:
#     data = response.json()
#     print("Batch cancelado exitosamente.")
#     print("Detalles del Batch:")
#     print(data)
# else:
#     print(f"Error al cancelar el batch: {response.status_code}")
#     print(response.text)

In [None]:
# Extract the results: keep the id of the batch's output file and print the batch status.
# NOTE(review): output_file_id is presumably only populated once the batch has
# finished — check the printed status before downloading.
result_file_id = batch_job.output_file_id
print(batch_job.status)

In [None]:
# Fetch the batch output file and keep its content.
output_response = client.files.content(result_file_id)
result = output_response.content

In [None]:
# Save the downloaded batch output to disk, written in binary mode.
result_file_name = "batch_job_results_mmlu.jsonl"

with open(result_file_name, 'wb') as results_out:
    results_out.write(result)