In [1]:
import pandas as pd
import json
import os
from openai import OpenAI #estamos la clase concreta OpenAI del módulo openai
from dotenv import load_dotenv #importamos una función concreta del módulo
load_dotenv("template.env")

True

In [2]:
# Read the OpenAI API key from the environment (load_dotenv ran in the first cell).
api_key = os.getenv("OPENAI_API_KEY")

# Fail early when the key is missing OR empty: a template env file usually ships
# with an empty placeholder, which an `is None` check would let through.
if not api_key:
    raise ValueError("La clave de API no está configurada en el archivo .env")

# Pass the validated key explicitly so the client uses exactly the value checked above.
client = OpenAI(api_key=api_key)

In [3]:
# Folder that holds the input spreadsheet, taken from the environment.
dataset_folder = os.getenv("DATASET_FOLDER")
if dataset_folder is None:
    # Without this check str(None) would be glued to the file name ("NoneGPT_...").
    raise ValueError("DATASET_FOLDER is not set in the environment / .env file")

# os.path.join works whether or not the folder value ends with a path separator.
dataset_path = os.path.join(dataset_folder, "GPT_estimates_AoA_v1.xlsx")

df = pd.read_excel(dataset_path)
# For a quick trial run, subsample here, e.g.: df = df.sample(30)
df.head()

Unnamed: 0,Word,AoA,Source
0,¡aba!,,Hinojosa et al. (2021)
1,¡abur!,,Hinojosa et al. (2021)
2,¡achís!,,Hinojosa et al. (2021)
3,¡adiós!,,Hinojosa et al. (2021)
4,¡aghgghh!,,Hinojosa et al. (2021)


In [4]:
#FUNCTION DECLARATION

def extract_data(new_line):
	"""Return the JSON object encoded in the first choice's message content.

	`new_line` is one already-parsed result record; the text found at
	response -> body -> choices[0] -> message -> content is decoded with json.loads.
	"""
	message = new_line["response"]["body"]["choices"][0]["message"]
	return json.loads(message["content"])

def create_file_from_tasks(tasks,file_name):
	"""Write `tasks` to `file_name`, one JSON-serialised task per line."""
	with open(file_name, 'w') as out:
		out.writelines(json.dumps(task) + '\n' for task in tasks)


def create_batch(file_name):
	"""Upload a task file and create a batch job for it.

	The file at `file_name` is uploaded with purpose "batch" through the
	notebook-level `client`, then a batch targeting /v1/chat/completions with a
	24h completion window is created from the uploaded file's id.

	Returns the batch object returned by client.batches.create.
	"""
	# Use a context manager so the file handle is closed once the upload call returns
	# (the previous version left the handle open).
	with open(file_name, "rb") as task_file:
		batch_file = client.files.create(
			file = task_file,
			purpose = "batch"
		)
	batch_job = client.batches.create(
		input_file_id = batch_file.id,
		endpoint = "/v1/chat/completions",
		completion_window = "24h"
	)
	return batch_job

def get_line_file(file_name,line,extract_func):
	"""Parse one line of a JSON Lines file and pass it to `extract_func`.

	Args:
		file_name: path of the JSON Lines file.
		line: zero-based number of the line to read.
		extract_func: callable applied to the parsed JSON value of that line.

	Returns:
		Whatever `extract_func` returns.

	Raises:
		IndexError: if the file has no line with that number (previously this
			surfaced as an UnboundLocalError on an unassigned local).
	"""
	# JSON text is UTF-8; do not depend on the platform default encoding.
	with open(file_name, 'r', encoding="utf-8") as f:
		for line_number, theline in enumerate(f):
			if line_number == line:
				break
		else:
			# Loop finished without reaching the requested line (also covers an empty file).
			raise IndexError(f"{file_name} has no line number {line}")
	return extract_func(json.loads(theline))

def generate_task(index,prompt,desc):
	"""Build one batch request entry for the chat completions endpoint.

	Args:
		index: value embedded in the entry's custom_id as "task-<index>".
		prompt: content of the system message.
		desc: content of the user message.

	Returns:
		A dict with custom_id, method, url and body; the body requests model
		"gpt-4o-mini" at temperature 0 with a JSON-object response format.
	"""
	conversation = [
		{"role": "system", "content": prompt},
		{"role": "user", "content": desc},
	]
	# This is what would be passed to a direct Chat Completions API call.
	request_body = {
		"model": "gpt-4o-mini",
		"temperature": 0,
		"response_format": {"type": "json_object"},
		"messages": conversation,
	}
	return {
		"custom_id": f"task-{index}",
		"method": "POST",
		"url": "/v1/chat/completions",
		"body": request_body,
	}

def create_task_from_json(json_object,index,prompt):
	"""Create the batch task for one record.

	The record's "Word" value is wrapped as {"palabra": <word>}, serialised to a
	JSON string and used as the user message; `prompt` and `index` are forwarded
	unchanged to generate_task.
	"""
	payload = {"palabra": json_object["Word"]}
	return generate_task(index, prompt, json.dumps(payload))

def create_task_array_from_dataframe(df,prompt):
	"""Return one task per row of `df`, in row order.

	Each row is passed to create_task_from_json together with its index label
	(as yielded by DataFrame.iterrows) and the shared `prompt`.
	"""
	return [
		create_task_from_json(row, index, prompt)
		for index, row in df.iterrows()
	]


In [5]:
#PROMPTS

#AGE PROMPT
# System prompt (in Spanish) that defines age of acquisition (AoA) and asks for a
# JSON object with the keys AoA and Word.
# NOTE: this is a plain string literal, not an f-string, and nothing in this notebook
# calls .format() on it, so the text "{palabra}" is sent to the model literally.
# The actual word travels in the user message, which create_task_from_json builds
# as a JSON object with the key "palabra".
categorize_system_prompt_paraphrase ='''
La edad de adquisición (AoA) de una palabra se refiere a la edad en la que se aprendió una palabra por primera vez. 
En concreto, cuándo una persona habría entendido por primera vez esa palabra si alguien la hubiera utilizado delante de ella, incluso cuando aún no la hubiera dicho, leído o escrito. 
Calcule la edad media de adquisición (AoA) de la palabra {palabra} para un hablante nativo de español.

El formato de salida debe ser un objeto JSON: {AoA: número //AoA de la palabra expresado en años, puede incluir decimales, Word: palabra //string}
'''

In [6]:
# Output folder, taken from the environment.
output_folder = os.getenv("OUTPUT_FOLDER")

def out_file(file_name):
	"""Return the path of `file_name` inside the configured output folder.

	Raises:
		ValueError: if OUTPUT_FOLDER was not set; plain string concatenation
			would otherwise silently produce paths starting with "None".
	"""
	if output_folder is None:
		raise ValueError("OUTPUT_FOLDER is not set in the environment / .env file")
	# os.path.join adds a separator only when the folder does not already end with one.
	return os.path.join(output_folder, file_name)

In [7]:
#AGE TASK
# Both variables are lists with one entry per task group: tasks_array[k] is the list
# of tasks that will be written to file_array[k]. Here there is a single group that
# holds one task per row of df.
tasks_array = [create_task_array_from_dataframe(df,categorize_system_prompt_paraphrase)]
file_array = [out_file("batch_job_mmlu_age.jsonl")]

In [8]:
#DIVIDE TASK
def divide_task(tasks_array,file_array,task_index,num_tasks):
	"""Split one task list into chunks of at most `num_tasks` tasks.

	Args:
		tasks_array: list of task lists.
		file_array: list of file names, parallel to `tasks_array`.
		task_index: position of the task list to split.
		num_tasks: maximum number of tasks per chunk; must be >= 1.

	Returns:
		(res_task_array, res_file_array): every entry other than `task_index`
		in its original order, followed by the chunks. Chunk file names are the
		original name with "_<chunk number>" inserted before the extension; the
		number is zero-padded to a common width.

	Raises:
		ValueError: if `num_tasks` is smaller than 1.
	"""
	if num_tasks < 1:
		raise ValueError("num_tasks must be a positive integer")

	res_task_array = []
	res_file_array = []
	task_array_to_div = []
	prov_file_name = ""
	for i, tasks in enumerate(tasks_array):
		if i == task_index:
			task_array_to_div = tasks
			prov_file_name = file_array[i]
		else:
			res_task_array.append(tasks)
			res_file_array.append(file_array[i])

	# Stepping through the list avoids the empty trailing chunk the former
	# "1 + len/num_tasks" count produced when the length was an exact multiple.
	chunks = [
		list(task_array_to_div[start:start + num_tasks])
		for start in range(0, len(task_array_to_div), num_tasks)
	]
	# Same padding width for every chunk so the file names sort in chunk order.
	width = len(str(len(chunks)))
	# Split off the extension only, instead of replacing every ".json" in the path.
	root, ext = os.path.splitext(prov_file_name)
	for i, chunk in enumerate(chunks):
		res_task_array.append(chunk)
		res_file_array.append(f"{root}_{i:0{width}d}{ext}")
	return res_task_array,res_file_array

# Split the task list at index 0 into chunks of at most 10000 tasks, with one file name per chunk.
# NOTE: tasks_array and file_array are rebound to the divided result, so running this
# cell a second time divides the first chunk again; re-run the cell that builds
# tasks_array and file_array before re-running this one.
tasks_array,file_array = divide_task(tasks_array,file_array,0,10000)

# Show the resulting file names.
for i in range(0,len(file_array)):
	print(file_array[i])

output_files/batch_job_mmlu_age_00.jsonl
output_files/batch_job_mmlu_age_01.jsonl
output_files/batch_job_mmlu_age_02.jsonl
output_files/batch_job_mmlu_age_03.jsonl
output_files/batch_job_mmlu_age_04.jsonl
output_files/batch_job_mmlu_age_05.jsonl
output_files/batch_job_mmlu_age_06.jsonl
output_files/batch_job_mmlu_age_07.jsonl
output_files/batch_job_mmlu_age_08.jsonl
output_files/batch_job_mmlu_age_09.jsonl
output_files/batch_job_mmlu_age_10.jsonl
output_files/batch_job_mmlu_age_11.jsonl
output_files/batch_job_mmlu_age_12.jsonl


In [9]:
#GENERATE TASK FILES
# Write each task list to the file name at the same position in file_array.
for i in range(0,len(tasks_array)):
	create_file_from_tasks(tasks_array[i],file_array[i])

In [10]:
#GENERATE BATCH
# Upload every task file and create one batch job per file; batch_jobs[k] is the job
# for file_array[k].
# NOTE: each run of this cell uploads the files and creates new batch jobs again.
batch_jobs = []
for i in range(0,len(tasks_array)):
	ba_jo= create_batch(file_array[i])
	batch_jobs.append(ba_jo)

In [20]:
#COMPLETION_CHECK
# Re-fetch every batch job by id and print the full batch object followed by its status.
for i in range(0,len(batch_jobs)):
	batch = batch_jobs[i]
	batch = client.batches.retrieve(batch.id)
	print(batch)
	# result_file_id is assigned here but not used in this cell.
	result_file_id = batch.output_file_id
	print(batch.status)

Batch(id='batch_67c47e3323808190ab137056e6d590f4', completion_window='24h', created_at=1740930611, endpoint='/v1/chat/completions', input_file_id='file-GJE3g9qXCw3wABCdoECuN8', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740933863, error_file_id=None, errors=None, expired_at=None, expires_at=1741017011, failed_at=None, finalizing_at=1740933160, in_progress_at=1740930613, metadata=None, output_file_id='file-UTe479b4mThcmedgcjiHga', request_counts=BatchRequestCounts(completed=10000, failed=0, total=10000))
completed
Batch(id='batch_67c47e3e65d88190ba782c4149adc15e', completion_window='24h', created_at=1740930622, endpoint='/v1/chat/completions', input_file_id='file-AGcyCJqg4tWR8pBhsMUvrX', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740939232, error_file_id=None, errors=None, expired_at=None, expires_at=1741017022, failed_at=None, finalizing_at=1740938100, in_progress_at=1740930625, metadata=None, o

In [21]:
#OUTPUT FILES GENERATOR
# Download the output of every batch job and store it next to its task file, using
# the task file name with "_result" inserted before ".json".
for i in range(0,len(batch_jobs)):
	batch = client.batches.retrieve(batch_jobs[i].id)
	result_file_id = batch.output_file_id
	# A batch without an output file id (for example one that has not finished yet)
	# cannot be downloaded; stop with a clear message instead of passing None to the API.
	if result_file_id is None:
		raise RuntimeError(
			f"Batch {batch.id} has no output file yet (status: {batch.status})"
		)

	result = client.files.content(result_file_id).content

	result_file_name = file_array[i].replace(".json","_result.json")

	# The content is bytes, so it is written in binary mode exactly as received.
	with open(result_file_name, 'wb') as file:
		file.write(result)

In [22]:
#Cleaning function
def _row_for_result(source_df, record, position):
	"""Return the dataset row that a parsed result record belongs to.

	The row is looked up by the index label encoded in the record's custom_id
	("task-<label>"). When the record carries no usable custom_id, the row at
	`position` is returned instead.
	"""
	custom_id = record.get("custom_id") if isinstance(record, dict) else None
	if isinstance(custom_id, str) and custom_id.startswith("task-"):
		try:
			label = int(custom_id[len("task-"):])
		except ValueError:
			label = None
		# Only trust the label when it identifies exactly one row.
		if label is not None and source_df.index.is_unique and label in source_df.index:
			return source_df.loc[label]
	return source_df.iloc[position]

def clean(file_array, source_df=None):
	"""Collect the AoA estimates from the batch result files.

	For every name in `file_array` the matching "_result" file is read line by
	line. Each result line is matched to its dataset row through its custom_id,
	because the line order of a batch output file is not guaranteed to follow
	the input order and failed requests may be missing from it; the running
	line count is only a fallback.

	Args:
		file_array: task file names; results are read from the same names with
			"_result" inserted before ".json".
		source_df: DataFrame with "Word" and "Source" columns whose index labels
			were used to build the custom_ids. Defaults to the notebook-level `df`.

	Returns:
		(results, errors): `results` has one row per result line with the columns
		Word, AoA and Source (AoA is the string "NaN" when the answer could not
		be parsed); `errors` lists the running line number of every such failure
		in the column Line_Num.
	"""
	if source_df is None:
		source_df = df
	rows = []
	errors = []
	acum = 0

	for i in range(0,len(file_array)):
		f_a = file_array[i].replace(".json","_result.json")
		# The result files are written as raw bytes from the API; read them as UTF-8
		# rather than with the platform default encoding.
		with open(f_a, 'r', encoding="utf-8") as f:
			for line in f:
				if not line.strip():
					continue  # blank lines carry no result
				try:
					record = json.loads(line.strip())
				except ValueError:
					record = None
				row = _row_for_result(source_df, record, acum)
				try:
					aoa = extract_data(record)["AoA"]
				# Only parsing/shape problems count as a bad answer; anything else
				# (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
				except (ValueError, KeyError, IndexError, TypeError):
					aoa = "NaN"
					print(f"file_num {i}\nline_num {acum}\nline {line}")
					errors.append({
						"Line_Num":acum,
					})
				rows.append({
					"Word":row['Word'],
					"AoA":aoa,
					"Source":row['Source']
				})
				acum += 1

	return pd.DataFrame(rows), pd.DataFrame(errors)

# Build the result tables from the downloaded batch outputs and export them to one
# Excel workbook: sheet "Results" holds the estimates, sheet "Errors" the failures.
file_name = out_file("FinalResults.xlsx")
clean_dtset,errors_dtset = clean(file_array)


with pd.ExcelWriter(file_name) as writer:
	clean_dtset.to_excel(writer, sheet_name='Results',index=False)
	errors_dtset.to_excel(writer, sheet_name='Errors',index=False)

file_num 7
line_num 75848
line {"id": "batch_req_67c490ec34ec81909107fea37f2c588f", "custom_id": "task-75848", "response": {"status_code": 200, "request_id": "72a59adcce863ce1a9b14fb96c92b9e5", "body": {"id": "chatcmpl-B6g408ngFUjZD87HYIBb1yG0vRllW", "object": "chat.completion", "created": 1740931360, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\"AoA\": 10.5, \"Word\": \"licitamente", "refusal": null}, "logprobs": null, "finish_reason": "length"}], "usage": {"prompt_tokens": 153, "completion_tokens": 15, "total_tokens": 168, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_prediction_tokens": 0}}, "service_tier": "default", "system_fingerprint": "fp_06737a9306"}}, "error": null}

