In [1]:
import os
from dotenv import load_dotenv
import json

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Optional custom CA bundle. os.environ only accepts string values, so the
# variable is exported only when SSL_CERT_FILE2 is actually configured
# (assigning None would raise TypeError and abort the whole cell).
SSL_CERT_FILE2 = os.getenv('SSL_CERT_FILE2', None)
if SSL_CERT_FILE2:
    os.environ['SSL_CERT_FILE'] = SSL_CERT_FILE2

AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
OPENAI_API_VERSION = os.getenv('OPENAI_API_VERSION')
MODELS_STR = os.getenv('MODELS')

# Fail early with one readable message instead of an opaque TypeError from
# json.loads(None) or from assigning None into os.environ further down.
_required_settings = {
    'AZURE_OPENAI_API_KEY': AZURE_OPENAI_API_KEY,
    'AZURE_OPENAI_ENDPOINT': AZURE_OPENAI_ENDPOINT,
    'OPENAI_API_VERSION': OPENAI_API_VERSION,
    'MODELS': MODELS_STR,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in the .env file."
    )

# MODELS is a JSON document; later cells index it by model name
# (e.g. MODELS["gpt-3.5-turbo"]).
MODELS = json.loads(MODELS_STR)


# Write the validated values back into the process environment.
os.environ["AZURE_OPENAI_API_KEY"] = AZURE_OPENAI_API_KEY
os.environ["AZURE_OPENAI_ENDPOINT"] = AZURE_OPENAI_ENDPOINT
os.environ["OPENAI_API_VERSION"] = OPENAI_API_VERSION

## OpenAI answers: generating inferences with Azure OpenAI

In [3]:
from prompts.prompt_for_inference import prompt_for_inference
from utils.datasets_splits import load_dataset_splits
import openai
import csv
import os
from datasets import Dataset
from tqdm import tqdm


In [4]:
# Azure OpenAI client built from the endpoint, key and API version that were
# read from the environment in the configuration cell.
client = openai.AzureOpenAI(
    api_version=OPENAI_API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

In [8]:
import time
import openai
import os
import csv
from datasets import Dataset
from tqdm import tqdm
import re

def _load_processed_ids(csv_path):
    """Return the set of integer ``id`` values already stored in ``csv_path``.

    A missing file yields an empty set.
    """
    if not os.path.exists(csv_path):
        return set()
    with open(csv_path, newline='', encoding='utf-8') as f:
        return {int(row["id"]) for row in csv.DictReader(f)}


def _retry_delay_seconds(error, default=40):
    """Return the wait time announced in a rate-limit error, else ``default``.

    Looks for a ``"retryAfter": "<seconds>"`` fragment in the error text.
    """
    match = re.search(r'"retryAfter":\s*"(\d+)"', str(error))
    return int(match.group(1)) if match else default


def answer_batch(dataset_name, dataset: Dataset, CSV_FILE, max_tokens=3, model=None):
    """Generate one model answer per dataset example and append it to a CSV.

    The function is resumable: every answer is appended to ``CSV_FILE`` as soon
    as it is produced, and examples whose id is already present in the file are
    skipped on later runs. Ids are the 1-based position of the example in
    ``dataset``, which is the same key the resume check uses.

    Args:
        dataset_name: Name forwarded to ``prompt_for_inference`` to build the prompt.
        dataset: Indexable dataset; ``dataset[i]`` is passed to ``prompt_for_inference``.
        CSV_FILE: Output path of the CSV with columns ``id`` and ``inference``.
        max_tokens: Completion token limit sent with each request.
        model: Deployment/model name to query. Defaults to
            ``MODELS["gpt-3.5-turbo"]`` when omitted.

    Returns:
        None. Results are written to ``CSV_FILE``.

    Notes:
        Rate-limit errors are retried indefinitely after sleeping. Requests
        rejected with ``openai.BadRequestError`` are recorded as ``"[FILTERED]"``.
    """
    ids_existentes = _load_processed_ids(CSV_FILE)

    # A missing *or zero-byte* file still needs the header row; otherwise the
    # next resume would read the first data row as the header and fail on "id".
    needs_header = not os.path.exists(CSV_FILE) or os.path.getsize(CSV_FILE) == 0

    # os.makedirs('') raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(CSV_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i in tqdm(range(len(dataset)), desc="Generating responses"):
        example_id = i + 1
        if example_id in ids_existentes:
            continue  # already processed in a previous run

        final_prompt = prompt_for_inference(dataset_name, dataset[i])

        # Keep retrying the same example until it succeeds or is rejected.
        while True:
            try:
                chat_completion = client.chat.completions.create(
                    model=model if model is not None else MODELS["gpt-3.5-turbo"],
                    messages=[{"role": "user", "content": final_prompt}],
                    max_tokens=max_tokens
                )
                # The message content can be None (no text returned); treat it
                # as an empty answer instead of crashing the whole batch.
                content = chat_completion.choices[0].message.content
                respuesta = (content or "").strip()
                break

            except openai.RateLimitError as e:
                # Honour the server-provided delay when present; fall back to 40s.
                retry_after = _retry_delay_seconds(e)
                print(f"Rate limit exceeded. Retrying in {retry_after} seconds...")
                time.sleep(retry_after)

            except openai.BadRequestError:
                # NOTE(review): every 400 response is treated as a content-filter
                # block here; other bad-request causes would be labelled the same.
                print(f"Prompt blocked by content filter at example {i+1}. Skipping.")
                respuesta = "[FILTERED]"
                break  # record the example as processed with the marker value

        # Persist immediately so an interrupted run loses at most one answer.
        with open(CSV_FILE, "a", newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=["id", "inference"])
            if needs_header:
                writer.writeheader()
                needs_header = False
            # Store the example's own id (not "last id + 1") so rows stay
            # aligned with the dataset even if the existing file has gaps.
            writer.writerow({
                "id": example_id,
                "inference": respuesta
            })


In [9]:
# Dataset name -> max_tokens value handed to answer_batch for that dataset.
datasets = {'teleqna':40, 'boolq':3, 'clapnq':60, 'covid':120}
for dataset_name, token_budget in datasets.items():
    print(f'Processing {dataset_name}')
    CSV_FILE = f"../results/{dataset_name}/{dataset_name}_openai.csv"
    train_ds, val_ds, test_ds = load_dataset_splits(dataset_name)
    # For clapnq the "input" column is renamed to "question" before inference.
    if dataset_name == 'clapnq':
        test_ds = test_ds.rename_column("input", "question")
    print('answering questions')
    answer_batch(dataset_name, test_ds, CSV_FILE, token_budget)
    print('finish')

Processing teleqna
Loading dataset splits for teleqna
Train: 724
Val: 181
Test: 905
Datasets loaded and prepared.
answering questions


Generating responses: 100%|██████████| 905/905 [00:00<00:00, 590425.43it/s]

finish
Processing boolq
Loading dataset splits for boolq
Loading and preparing datasets...





Train: 7541
Val: 1886
Test: 3270
Datasets loaded and prepared.
answering questions


Generating responses:  16%|█▌        | 507/3270 [00:00<00:05, 517.23it/s]

Prompt blocked by content filter at example 507. Skipping.


Generating responses:  17%|█▋        | 551/3270 [00:15<01:44, 25.97it/s] 

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  19%|█▉        | 624/3270 [01:21<14:21,  3.07it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  21%|██        | 683/3270 [02:22<14:31,  2.97it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  23%|██▎       | 743/3270 [03:24<13:48,  3.05it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  25%|██▍       | 803/3270 [04:26<13:40,  3.01it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  26%|██▋       | 863/3270 [05:28<13:06,  3.06it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  28%|██▊       | 923/3270 [06:30<13:07,  2.98it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  30%|███       | 983/3270 [07:33<13:17,  2.87it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  32%|███▏      | 1043/3270 [08:36<12:41,  2.93it/s] 

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  34%|███▎      | 1103/3270 [09:39<11:57,  3.02it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  36%|███▌      | 1163/3270 [10:41<11:26,  3.07it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  37%|███▋      | 1223/3270 [11:43<11:24,  2.99it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  39%|███▉      | 1283/3270 [12:45<10:55,  3.03it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  41%|████      | 1343/3270 [13:47<10:51,  2.96it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  43%|████▎     | 1403/3270 [14:49<10:12,  3.05it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  45%|████▍     | 1462/3270 [15:50<09:53,  3.05it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  47%|████▋     | 1522/3270 [16:52<09:50,  2.96it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  48%|████▊     | 1581/3270 [17:53<09:07,  3.08it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  50%|█████     | 1641/3270 [18:56<09:03,  3.00it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  52%|█████▏    | 1700/3270 [19:57<09:05,  2.88it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  54%|█████▍    | 1760/3270 [20:59<08:23,  3.00it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  56%|█████▌    | 1820/3270 [22:01<08:07,  2.98it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  57%|█████▋    | 1880/3270 [23:03<07:46,  2.98it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  59%|█████▉    | 1940/3270 [24:05<08:00,  2.77it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  61%|██████    | 2000/3270 [25:07<06:56,  3.05it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  63%|██████▎   | 2060/3270 [26:10<06:35,  3.06it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  65%|██████▍   | 2120/3270 [27:12<06:26,  2.97it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  67%|██████▋   | 2180/3270 [28:14<05:58,  3.04it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  68%|██████▊   | 2239/3270 [29:15<05:46,  2.97it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  70%|███████   | 2298/3270 [30:16<05:17,  3.06it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  72%|███████▏  | 2358/3270 [31:19<05:07,  2.97it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  74%|███████▍  | 2418/3270 [32:22<05:26,  2.61it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  76%|███████▌  | 2478/3270 [33:24<04:34,  2.88it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  78%|███████▊  | 2538/3270 [34:27<04:05,  2.98it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  79%|███████▉  | 2597/3270 [35:28<03:42,  3.03it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  81%|████████▏ | 2657/3270 [36:31<03:28,  2.93it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  83%|████████▎ | 2717/3270 [37:35<03:04,  3.00it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  83%|████████▎ | 2729/3270 [38:21<05:06,  1.76it/s]  

Prompt blocked by content filter at example 2729. Skipping.


Generating responses:  85%|████████▍ | 2777/3270 [38:37<02:44,  2.99it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  87%|████████▋ | 2837/3270 [39:39<02:21,  3.05it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  87%|████████▋ | 2843/3270 [40:23<17:21,  2.44s/it]  

Prompt blocked by content filter at example 2843. Skipping.


Generating responses:  89%|████████▊ | 2897/3270 [40:41<02:00,  3.09it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  90%|█████████ | 2956/3270 [41:43<01:43,  3.03it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  92%|█████████▏| 3015/3270 [42:44<01:25,  2.99it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  94%|█████████▍| 3075/3270 [43:46<01:04,  3.01it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  96%|█████████▌| 3135/3270 [44:49<00:45,  3.00it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  98%|█████████▊| 3195/3270 [45:51<00:25,  2.99it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses: 100%|█████████▉| 3255/3270 [46:53<00:05,  2.96it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses: 100%|██████████| 3270/3270 [47:40<00:00,  1.14it/s]


finish
Processing clapnq
Loading dataset splits for clapnq


README.md:   0%|          | 0.00/653 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


clapnq_train_answerable.jsonl:   0%|          | 0.00/6.63M [00:00<?, ?B/s]

clapnq_train_unanswerable.jsonl:   0%|          | 0.00/4.64M [00:00<?, ?B/s]

clapnq_dev_answerable.jsonl:   0%|          | 0.00/986k [00:00<?, ?B/s]

clapnq_dev_unanswerable.jsonl:   0%|          | 0.00/790k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3745 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/600 [00:00<?, ? examples/s]

Train: 2996
Val: 749
Test: 600
Datasets loaded and prepared.
answering questions


Generating responses:   8%|▊         | 45/600 [00:22<03:53,  2.38it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  18%|█▊        | 105/600 [01:34<03:50,  2.15it/s] 

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  28%|██▊       | 165/600 [02:42<03:06,  2.34it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  38%|███▊      | 225/600 [03:51<02:50,  2.20it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  48%|████▊     | 285/600 [05:03<02:21,  2.22it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  57%|█████▊    | 345/600 [06:12<01:38,  2.60it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  68%|██████▊   | 405/600 [07:18<01:18,  2.47it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  70%|███████   | 420/600 [08:06<01:24,  2.14it/s]

Prompt blocked by content filter at example 420. Skipping.


Generating responses:  78%|███████▊  | 465/600 [08:24<00:54,  2.49it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  88%|████████▊ | 525/600 [09:32<00:28,  2.62it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  98%|█████████▊| 585/600 [10:37<00:07,  2.01it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses: 100%|██████████| 600/600 [11:25<00:00,  1.14s/it]


finish
Processing covid
Loading dataset splits for covid


README.md:   0%|          | 0.00/5.71k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2019 [00:00<?, ? examples/s]

Train: 1292
Val: 323
Test: 404
Datasets loaded and prepared.
answering questions


Generating responses:  11%|█         | 45/404 [00:31<03:34,  1.67it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  26%|██▌       | 105/404 [01:56<03:43,  1.34it/s] 

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  41%|████      | 165/404 [03:21<02:25,  1.65it/s]  

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  56%|█████▌    | 225/404 [04:43<02:55,  1.02it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  71%|███████   | 285/404 [06:07<01:23,  1.42it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses:  85%|████████▌ | 345/404 [07:27<00:39,  1.51it/s]

Rate limit exceeded. Retrying in 40 seconds...


Generating responses: 100%|██████████| 404/404 [08:53<00:00,  1.32s/it]

finish





In [11]:
# Evaluate the stored OpenAI answers for the 'boolq' dataset.
# Per the recorded output, the results are reported as saved to this same CSV path.
from utils.evaluate_inference import evaluate_answer
evaluate_answer('boolq', '../results/boolq/boolq_openai.csv')

Loading dataset splits for boolq
Loading and preparing datasets...
Train: 7541
Val: 1886
Test: 3270
Datasets loaded and prepared.
Results saved in: ../results/boolq/boolq_openai.csv


In [12]:
# Evaluate the stored OpenAI answers for the 'teleqna' dataset.
# Per the recorded output, the results are reported as saved to this same CSV path.
from utils.evaluate_inference import evaluate_answer
evaluate_answer('teleqna', '../results/teleqna/teleqna_openai.csv')

Loading dataset splits for teleqna
Train: 724
Val: 181
Test: 905
Datasets loaded and prepared.


Map:   0%|          | 0/905 [00:00<?, ? examples/s]

Results saved in: ../results/teleqna/teleqna_openai.csv


In [13]:
# Evaluate the stored OpenAI answers for the 'covid' dataset.
# Per the recorded output, the results are reported as saved to this same CSV path.
from utils.evaluate_inference import evaluate_answer
evaluate_answer('covid', '../results/covid/covid_openai.csv')

Loading dataset splits for covid
Train: 1292
Val: 323
Test: 404
Datasets loaded and prepared.


Map:   0%|          | 0/404 [00:00<?, ? examples/s]

Results saved in: ../results/covid/covid_openai.csv


In [14]:
# Evaluate the stored OpenAI answers for the 'clapnq' dataset.
# Per the recorded output, the results are reported as saved to this same CSV path.
from utils.evaluate_inference import evaluate_answer
evaluate_answer('clapnq', '../results/clapnq/clapnq_openai.csv')

Loading dataset splits for clapnq
Train: 2996
Val: 749
Test: 600
Datasets loaded and prepared.


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Results saved in: ../results/clapnq/clapnq_openai.csv
