In [1]:
# !pip install openai tiktoken

In [2]:
from dotenv import load_dotenv; load_dotenv()
import os
import uuid
import pandas as pd
import openai
import tiktoken
import time
import json

DATA_PATH = os.path.join(os.getenv('DATA_PATH'), 'Gingado')
OUTPUT_PATH = os.path.join(os.getenv('OUTPUT_PATH'), 'datasets')
TMP_PATH = os.path.join(os.getenv('OUTPUT_PATH'), 'tmp')
MODEL = "gpt-5-mini" # "gpt-4.1-nano", "gpt-4.1-mini"

In [3]:
client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
encoding = tiktoken.get_encoding("cl100k_base")

In [4]:
response_format={
    "type": "json_schema",
    "json_schema": {
        "name": "speech_information_extraction",
        "schema": {
            "type": "object",
            "properties": {
                "author": {
                    "type": "string",
                    "description": "First and last name of the author of the speech (None if not provided)."
                },
                "organization": {
                    "type": "string",
                    "description": "The organization the author is affiliated with (try to guess if not explicit).",
                },
                "country_code": {
                    "type": "string",
                    "description": "The ISO 3166-1 alpha-2 country code of the organization (try to guess if not explicit).",
                },
                "sentiment": {
                    "type": "string",
                    "description": "Overall entiment of the speech regarding macroeconomy.",
                    "enum": ["hawkish", "dovish", "neutral"]
                }
            },
            "required": ["author", "organization", "country_code", "sentiment"],
            "additionalProperties": False
        },
        "strict": True
    }
}

response_format_tokens = len(encoding.encode(json.dumps(response_format)))
print(f"Response format tokens: {response_format_tokens}")

Response format tokens: 197


In [5]:
def create_prompt(date, author, title, description) -> str:
    return f"""Extract information for the following speech metadata:
Date: '{date}'
Author: '{author}'
Title: '{title}'
Description: '{description}'

Your response:
"""

In [14]:
speeches = pd.read_parquet(os.path.join(DATA_PATH, 'gingado-cb-speeches_19960910-20251225.parquet'))
speeches.reset_index(inplace=True)
speeches["uuid"] = speeches.apply(lambda row: f"{row['date'].strftime('%Y%m%d')}-{str(uuid.uuid5(uuid.NAMESPACE_DNS, row['title']))[:8]}", axis=1)
speeches = speeches.drop_duplicates(subset=["uuid"]).sort_values(by="date").reset_index(drop=True)

In [None]:
jsonl_file_path = os.path.join(TMP_PATH, "gigando_speeches_ner_requests.jsonl")
with open(jsonl_file_path, "w") as f:
    pass

total_token_count = 0
with open(jsonl_file_path, "w") as f:
    for idx, row in speeches.iterrows():
        speech_id = row["uuid"]
        date = row['date'].strftime('%Y-%m-%d')
        author = row['author']
        title = row['title']
        description = row['description']
        transcript = row['text']
        prompt = create_prompt(date, author, title, description)
        total_token_count += len(encoding.encode(prompt)) + response_format_tokens
        input = {
            "custom_id": speech_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": MODEL,
                "messages": [
                    {"role": "system", "content": "You are an expert in natural language processing and financial analysis. Your task is to extract key information including the author's name, his/her organization, the organization country code, and the overall macroeconomic sentiment. Restrict your knowledge to what was available up to the date of the provided speech date."},
                    {"role": "user", "content": prompt}
                ],
                "response_format": response_format,
                # "temperature": 0.0
            }
        }
        json.dump(input, f)
        if idx < len(speeches) - 1:
            f.write("\n")

print(f"Total token count for all requests: {total_token_count} (average per request: {total_token_count / speeches.shape[0]:.2f})")

Total token count for all requests: 5989788 (average per request: 294.82)


In [8]:
assert speeches.shape[0] < 50000, "Number of requests exceeds OpenAI's 50,000 limit."
assert os.path.getsize(jsonl_file_path) < 200 * 1024 * 1024, "File size exceeds OpenAI's 200MB limit."

In [9]:
batch_input_file = client.files.create(
    file=open(jsonl_file_path, "rb"),
    purpose="batch"
)
print(batch_input_file)

FileObject(id='file-T19odZZX67A6Bep1HNQtdr', bytes=35451822, created_at=1768603152, filename='gigando_speeches_ner_requests.jsonl', object='file', purpose='batch', status='processed', expires_at=1771195152, status_details=None)


In [32]:
batch_input_file_id = batch_input_file.id
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "NER extraction for Gigando CB speeches dataset",
        "model": MODEL
    }
)
print(batch.id)

batch_695cd41a19008190ad14b3ffd9046615


In [33]:
# client.batches.cancel(batch.id)

In [34]:
wait_time = 15 # seconds
status = client.batches.retrieve(batch.id).status
while status not in ["failed", "completed", "expired", "cancelled"]:
    batch_status = client.batches.retrieve(batch.id)
    status = batch_status.status
    print(f"Batch status: {status} --- Nb completed: {batch_status.request_counts.completed}/{batch_status.request_counts.total}")
    time.sleep(wait_time)

if batch_status.status == "failed":
    print([err.message for err in batch_status.errors.data])

Batch status: validating --- Nb completed: 0/0
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 495/20069
Batch status: in_progress --- Nb completed: 495/20069
Batch status: in_progress --- Nb completed: 838/20069
Batch status: in_progress --- Nb completed: 933/20069
Batch status: in_progress --- Nb completed: 933/20069
Batch status: in_progress --- Nb completed: 1359/20069
Batch status: in_progress --- Nb completed: 1359/20069
B

In [None]:
if batch_status.output_file_id is not None:
    print("Successfully completed batch. Retrieving results...")
    file_response = client.files.content(batch_status.output_file_id)
    raw_response_lines = file_response.text.strip().split("\n")
    responses = [json.loads(line) for line in raw_response_lines]
    pd.DataFrame(responses).to_parquet("gigando_speeches_ner_responses.parquet")
    results = pd.DataFrame([json.loads(resp['response']['body']['choices'][0]['message']['content']) for resp in responses])
    results["id"] = [resp['custom_id'] for resp in responses]
    print("Done!!!")
else:
    print("No output file ID found.")
    if batch_status.error_file_id is not None:
        print("Retrieving error file...")
        error_content = client.files.content(batch_status.error_file_id)
        print(error_content.text.strip().split("\n"))

Successfully completed batch. Retrieving results...
Done!!!


In [37]:
assert results.shape[0] == speeches.shape[0], "Number of results does not match number of speeches."
speeches = speeches.merge(results, left_on="uuid", right_on="id", how="inner")

In [None]:
speeches.to_parquet(os.path.join(OUTPUT_PATH, 'gingado-cb-speeches_19960910-20251225_NER.parquet'))