In [None]:
import os
import uuid
import pandas as pd
import openai
import tiktoken
import time
import json

MODEL = "gpt-5-mini" # "gpt-4.1-nano", "gpt-4.1-mini"

In [None]:
client = openai.Client(api_key=os.getenv("OPEN_AI_API"))
encoding = tiktoken.get_encoding("cl100k_base")

In [None]:
response_format={
    "type": "json_schema",
    "json_schema": {
        "name": "speech_information_extraction",
        "schema": {
            "type": "object",
            "properties": {
                "author": {
                    "type": "string",
                    "description": "First and last name of the author of the speech (None if not provided)."
                },
                "organization": {
                    "type": "string",
                    "description": "The organization the author is affiliated with (try to guess if not explicit).",
                },
                "country_code": {
                    "type": "string",
                    "description": "The ISO 3166-1 alpha-2 country code of the organization (try to guess if not explicit).",
                },
                "sentiment": {
                    "type": "string",
                    "description": "Overall entiment of the speech regarding macroeconomy.",
                    "enum": ["hawkish", "dovish", "neutral"]
                }
            },
            "required": ["author", "organization", "country_code", "sentiment"],
            "additionalProperties": False
        },
        "strict": True
    }
}

response_format_tokens = len(encoding.encode(json.dumps(response_format)))
print(f"Response format tokens: {response_format_tokens}")

Response format tokens: 197


In [None]:
def create_prompt(date, author, title, description) -> str:
    return f"""Extract information for the following speech metadata:
Date: '{date}'
Author: '{author}'
Title: '{title}'
Description: '{description}'

Your response:
"""

In [28]:
speeches = pd.read_csv('gigando-cb-speeches_1996-2025.csv')
speeches["datetime"] = pd.to_datetime(speeches["date"])
speeches["uuid"] = speeches.apply(lambda row: f"{row['datetime'].strftime('%Y%m%d')}-{str(uuid.uuid5(uuid.NAMESPACE_DNS, row['title']))[:8]}", axis=1)
speeches = speeches.drop_duplicates(subset=["uuid"]).sort_values(by="datetime").reset_index(drop=True)

In [None]:
total_token_count = 0
with open("gigando_speeches_ner_requests.jsonl", "w") as f:
    for idx, row in speeches.iterrows():
        speech_id = row["uuid"]
        date = row['date']
        author = row['author']
        title = row['title']
        description = row['description']
        transcript = row['text']
        prompt = create_prompt(date, author, title, description)
        total_token_count += len(encoding.encode(prompt)) + response_format_tokens
        input = {
            "custom_id": speech_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": MODEL,
                "messages": [
                    {"role": "system", "content": "You are an expert in natural language processing and financial analysis. Your task is to extract key information including the author's name, his/her organization, the organization country code, and the overall macroeconomic sentiment. Restrict your knowledge to what was available up to the date of the provided speech date."},
                    {"role": "user", "content": prompt}
                ],
                "response_format": response_format,
                # "temperature": 0.0
            }
        }
        json.dump(input, f)
        if idx < len(speeches) - 1:
            f.write("\n")

print(f"Total token count for all requests: {total_token_count} (average per request: {total_token_count / speeches.shape[0]:.2f})")

Total token count for all requests: 6036468 (average per request: 300.79)


In [30]:
assert speeches.shape[0] < 50000, "Number of requests exceeds OpenAI's 50,000 limit."
assert os.path.getsize("gigando_speeches_ner_requests.jsonl") < 200 * 1024 * 1024, "File size exceeds OpenAI's 200MB limit."

In [31]:
batch_input_file = client.files.create(
    file=open("gigando_speeches_ner_requests.jsonl", "rb"),
    purpose="batch"
)
print(batch_input_file)

FileObject(id='file-6aRpsZoM7MXAp2Lo7W434J', bytes=35196700, created_at=1767691288, filename='gigando_speeches_ner_requests.jsonl', object='file', purpose='batch', status='processed', expires_at=1770283288, status_details=None)


In [32]:
batch_input_file_id = batch_input_file.id
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "NER extraction for Gigando CB speeches dataset",
        "model": MODEL
    }
)
print(batch.id)

batch_695cd41a19008190ad14b3ffd9046615


In [33]:
# client.batches.cancel(batch.id)

In [34]:
wait_time = 15 # seconds
status = client.batches.retrieve(batch.id).status
while status not in ["failed", "completed", "expired", "cancelled"]:
    batch_status = client.batches.retrieve(batch.id)
    status = batch_status.status
    print(f"Batch status: {status} --- Nb completed: {batch_status.request_counts.completed}/{batch_status.request_counts.total}")
    time.sleep(wait_time)

if batch_status.status == "failed":
    print([err.message for err in batch_status.errors.data])

Batch status: validating --- Nb completed: 0/0
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 0/20069
Batch status: in_progress --- Nb completed: 495/20069
Batch status: in_progress --- Nb completed: 495/20069
Batch status: in_progress --- Nb completed: 838/20069
Batch status: in_progress --- Nb completed: 933/20069
Batch status: in_progress --- Nb completed: 933/20069
Batch status: in_progress --- Nb completed: 1359/20069
Batch status: in_progress --- Nb completed: 1359/20069
B

In [None]:
if batch_status.output_file_id is not None:
    print("Successfully completed batch. Retrieving results...")
    file_response = client.files.content(batch_status.output_file_id)
    raw_response_lines = file_response.text.strip().split("\n")
    responses = [json.loads(line) for line in raw_response_lines]
    pd.DataFrame(responses).to_parquet("gigando_speeches_ner_responses.parquet")
    results = pd.DataFrame([json.loads(resp['response']['body']['choices'][0]['message']['content']) for resp in responses])
    results["id"] = [resp['custom_id'] for resp in responses]
    print("Done!!!")
else:
    print("No output file ID found.")
    if batch_status.error_file_id is not None:
        print("Retrieving error file...")
        error_content = client.files.content(batch_status.error_file_id)
        print(error_content.text.strip().split("\n"))

Successfully completed batch. Retrieving results...
Done!!!


In [36]:
results

Unnamed: 0,author,organization,country_code,sentiment,id
0,Chen Yuan,People's Bank of China,CN,neutral,19960910-bd3f2022
1,Dai Xianglong,People's Bank of China,CN,neutral,19960930-257ff98e
2,Yasuo Matsushita,Bank of Japan,JP,neutral,19961106-a16f3484
3,Dai Xianglong,People's Bank of China,CN,dovish,19961113-191260f2
4,Howard Davies,Bank of England,GB,neutral,19961216-7492cf02
...,...,...,...,...,...
20064,Michelle W Bowman,Board of Governors of the Federal Reserve System,US,hawkish,20250923-dc1062af
20065,Sabine Mauderer,Deutsche Bundesbank,DE,neutral,20250923-a961c5a4
20066,Burkhard Balz,Deutsche Bundesbank,DE,neutral,20250924-eef747dc
20067,Michele Bullock,Reserve Bank of Australia,AU,neutral,20250924-af9d82e8


In [37]:
assert results.shape[0] == speeches.shape[0], "Number of results does not match number of speeches."
speeches = speeches.merge(results, left_on="uuid", right_on="id", how="inner")

In [40]:
speeches

Unnamed: 0,date,url,title,description,text,author_x,datetime,uuid,author_y,organization,country_code,sentiment,id
0,1996-09-10 00:00:00,https://www.bis.org/review/r970211c.pdf,Mr. Chen discusses monetary relations between ...,Speech by the Deputy Governor of the People's ...,Mr. Chen discusses monetary relations bet...,Chen Yuan,1996-09-10 00:00:00,19960910-bd3f2022,Chen Yuan,People's Bank of China,CN,neutral,19960910-bd3f2022
1,1996-09-30 00:00:00,https://www.bis.org/review/r970211a.pdf,Mr. Dai assesses the outlook for Hong Kong as ...,Speech by the Governor of the People's Bank of...,Mr. Dai assesses the outlook for Hong K...,Dai Xianglong,1996-09-30 00:00:00,19960930-257ff98e,Dai Xianglong,People's Bank of China,CN,neutral,19960930-257ff98e
2,1996-11-06 00:00:00,https://www.bis.org/review/r970107a.pdf,Mr. Matsushita considers the role of monetary ...,Translated excerpts of a speech given by the G...,Mr. Matsushita considers the role of monetary ...,Yasuo Matsushita,1996-11-06 00:00:00,19961106-a16f3484,Yasuo Matsushita,Bank of Japan,JP,neutral,19961106-a16f3484
3,1996-11-13 00:00:00,https://www.bis.org/review/r970211b.pdf,Mr. Dai looks at the possibilities of strength...,Speech by the Governor of the People's Bank of...,Mr. Dai looks at the possibilities of s...,Dai Xianglong,1996-11-13 00:00:00,19961113-191260f2,Dai Xianglong,People's Bank of China,CN,dovish,19961113-191260f2
4,1996-12-16 00:00:00,https://www.bis.org/review/r970108c.pdf,Mr. Davies gives his personal view of EMU (Ce...,Speech by the Deputy Governor of the Bank of E...,Mr. Davies gives his personal view of EMU Spee...,Mervyn King,1996-12-16 00:00:00,19961216-7492cf02,Howard Davies,Bank of England,GB,neutral,19961216-7492cf02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20064,2025-09-23 00:00:00,https://www.bis.org/review/r250924f.htm,Michelle W Bowman: Views on the economy and mo...,"Speech by Ms Michelle W Bowman, Vice Chair for...",## Michelle W Bowman: Views on the economy and...,Michelle W Bowman,2025-09-23 00:00:00,20250923-dc1062af,Michelle W Bowman,Board of Governors of the Federal Reserve System,US,hawkish,20250923-dc1062af
20065,2025-09-23 00:00:00,https://www.bis.org/review/r250924j.htm,Sabine Mauderer: Seizing the moment - strength...,"Speech by Dr Sabine Mauderer, First Deputy Gov...",## Sabine Mauderer: Seizing the moment - stren...,Sabine Mauderer,2025-09-23 00:00:00,20250923-a961c5a4,Sabine Mauderer,Deutsche Bundesbank,DE,neutral,20250923-a961c5a4
20066,2025-09-24 00:00:00,https://www.bis.org/review/r250924i.htm,Burkhard Balz: The digital €uro - a game chang...,"Welcome speech by Mr Burkhard Balz, Member of ...",## Burkhard Balz: The digital €uro - a game ch...,Burkhard Balz,2025-09-24 00:00:00,20250924-eef747dc,Burkhard Balz,Deutsche Bundesbank,DE,neutral,20250924-eef747dc
20067,2025-09-24 06:48:00,https://www.bis.org/review/r250922e.htm,Michele Bullock: Opening statement - House of ...,,## Michele Bullock: Opening statement - House ...,Michele Bullock,2025-09-24 06:48:00,20250924-af9d82e8,Michele Bullock,Reserve Bank of Australia,AU,neutral,20250924-af9d82e8


In [None]:
speeches.to_parquet("gigando_speeches_ner.parquet")