In [None]:
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.metrics import f1_score
from pathlib import Path

# Compare per-event macro-F1 of my pseudo-labels ('label') against the
# zero-shot labels ('gpt5_label'), both scored against the gold 'class_label'.
#
# Paths are built with pathlib instead of backslash string literals: the old
# non-raw "..\data\humaid\k_zero_shot" relied on invalid escape sequences
# (\d, \h, \k) and only resolved on Windows.
humaid_dir = Path("..") / "data" / "humaid"

my_df = pd.read_csv(humaid_dir / "plabel" / "train" / "union.tsv", sep='\t')
their_folder = humaid_dir / "k_zero_shot"

rows = []

# glob() yields files in an unspecified order; sort so the result table is
# reproducible from run to run.
for event_file in sorted(their_folder.glob("*.tsv")):
    their_df = pd.read_csv(event_file, sep='\t')
    # The event id is taken to be the first three '_'-separated tokens of the
    # file stem (e.g. "hurricane_irma_2017").
    event = '_'.join(event_file.stem.split('_')[:3])

    # Inner join on tweet_id: only tweets present in both tables are scored.
    merged = pd.merge(my_df[my_df['event'] == event], their_df, on='tweet_id')
    merged = merged[['class_label', 'label', 'gpt5_label']]

    my_f1 = f1_score(merged['class_label'], merged['label'], average='macro')
    their_f1 = f1_score(merged['class_label'], merged['gpt5_label'], average='macro')

    rows.append({'event': event, 'mine': my_f1, 'theirs': their_f1})

# Copy the summary table to the system clipboard (e.g. for pasting into a sheet).
pd.DataFrame(rows).to_clipboard()





In [None]:
# Normal

from openai import OpenAI
import csv, pandas as pd
import os, time, random
from dotenv import load_dotenv

# --- API client: the key is read from the environment after loading ../.env ---
load_dotenv(dotenv_path="../.env")
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# --- HumAID test split ---
# Read the TSV without quote handling, drop the "other_relevant_information"
# class, then keep a fixed 40-row sample (seed 42) — FOR TESTING.
gold_table = (
    pd.read_csv("../data/humaid/joined/test.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    .loc[lambda table: table["class_label"] != "other_relevant_information"]
    .reset_index(drop=True)
    .sample(n=40, random_state=42)
    .reset_index(drop=True)
)

# --- Category name -> definition text used when building prompts ---
CATEGORY_DEFINITIONS = dict(
    caution_and_advice="Reports of warnings issued or lifted, guidance and tips related to the disaster.",
    sympathy_and_support="Tweets with prayers, thoughts, and emotional support.",
    requests_or_urgent_needs="Reports of urgent needs or supplies such as food, water, clothing, money, etc.",
    displaced_people_and_evacuations="People who have relocated due to the crisis, even for a short time.",
    injured_or_dead_people="Reports of injured or dead people due to the disaster.",
    missing_or_found_people="Reports of missing or found people due to the disaster.",
    infrastructure_and_utility_damage="Reports of any type of damage to infrastructure such as buildings, houses, roads, power lines, etc.",
    rescue_volunteering_or_donation_effort="Reports of any type of rescue, volunteering, or donation efforts.",
    not_humanitarian="If the tweet does not convey humanitarian aid-related information.",
)

# --- Dynamic prompt generator ---
def make_prompt(tweet_text, event_labels):
    """Build the free-text classification prompt for one tweet.

    Args:
        tweet_text: The tweet to classify.
        event_labels: Label keys (snake_case) to offer; each must be a key of
            CATEGORY_DEFINITIONS.

    Returns:
        The prompt string: instructions, one "- Title Cased Name: definition"
        bullet per label, the tweet, and a trailing "Category:" cue.
    """
    bullet_lines = []
    for label in event_labels:
        display_name = label.replace('_', ' ').title()
        bullet_lines.append(f"- {display_name}: {CATEGORY_DEFINITIONS[label]}")
    label_defs = "\n".join(bullet_lines)

    # NOTE: the first line deliberately keeps its trailing space so the prompt
    # text stays identical to the original template.
    return (
        "Read the category names and their definitions below, then classify the following tweet into the appropriate category. \n"
        "In your response, mention only the category name.\n"
        "\n"
        "Category name: category definition\n"
        f"{label_defs}\n"
        "\n"
        f"Tweet: {tweet_text}\n"
        "Category:"
    )

# --- Classifier function ---
def classify_tweet(tweet, event_labels, max_retries=3):
    """Classify one tweet with gpt-4o-mini using the free-text prompt.

    Args:
        tweet: Tweet text.
        event_labels: Candidate label keys passed to make_prompt.
        max_retries: Maximum number of API attempts.

    Returns:
        The model reply, stripped, with spaces replaced by underscores and
        lower-cased. The reply is NOT validated against event_labels.
        Returns the sentinel "ERROR" if every attempt raised.
    """
    prompt = make_prompt(tweet, event_labels)
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=10,
            )
            return response.choices[0].message.content.strip().replace(' ', '_').lower()
        except Exception as e:
            # Last attempt: no retry follows, so do not announce one and do
            # not waste time sleeping before reporting failure.
            if attempt == max_retries - 1:
                print(f"Error: {e} — giving up after {max_retries} attempts")
                break
            # Exponential backoff with jitter before the next attempt.
            wait = 2 ** attempt + random.random()
            print(f"Error: {e} — retrying in {wait:.1f}s")
            time.sleep(wait)
    return "ERROR"

# --- Run predictions grouped by event ---
# groupby() iterates events in sorted key order, so predictions are produced in
# a different order than the rows of gold_table (which was shuffled by
# .sample()). `row_ids` records the gold_table index label for every entry of
# `labels` so predictions can be aligned back to the correct rows.
labels = []
row_ids = []
for event_name, event_df in gold_table.groupby("event"):
    # Candidate labels = normalized gold labels that occur in this event and
    # have a definition.
    event_labels = event_df["class_label"].str.replace(" ", "_").str.lower().unique().tolist()
    event_labels = [lbl for lbl in event_labels if lbl in CATEGORY_DEFINITIONS]

    print(f"Processing event: {event_name} ({len(event_df)} tweets) | Labels: {event_labels}")

    for i, row in event_df.iterrows():
        pred = classify_tweet(str(row["tweet_text"]), event_labels)
        if pred == "ERROR":
            raise RuntimeError(f"Error encountered at row {i}. Stopping run.")
        labels.append(pred)
        row_ids.append(i)
        if (len(labels)) % 50 == 0:
            print(f"Processed {len(labels)}/{len(gold_table)} tweets...")
            # Checkpoint: pair each prediction with the tweet it was made for
            # (select rows by recorded index, not by position).
            gold_table.loc[row_ids, ["tweet_text"]].assign(prediction=labels).to_csv(
                "../gpt4o_mini_predictions_partial2.tsv", index=False, sep="\t"
            )

# --- Save final predictions ---
# Assign through an indexed Series so pandas aligns each prediction with its
# row; a bare list would be assigned positionally and scramble the pairing.
# Rows never visited by groupby (e.g. missing event) are left as NaN.
gold_table["prediction"] = pd.Series(labels, index=row_ids, dtype="object")
gold_table.to_csv("../gpt4o_mini_predictions2.tsv", index=False, sep="\t")
print("Done. Saved to ../gpt4o_mini_predictions2.tsv.")


Processing event: california_wildfires_2018 (10 tweets) | Labels: ['missing_or_found_people', 'sympathy_and_support', 'not_humanitarian', 'rescue_volunteering_or_donation_effort']
Processing event: hurricane_dorian_2019 (8 tweets) | Labels: ['displaced_people_and_evacuations', 'not_humanitarian', 'sympathy_and_support', 'caution_and_advice', 'rescue_volunteering_or_donation_effort']
Processing event: hurricane_florence_2018 (1 tweets) | Labels: ['infrastructure_and_utility_damage']
Processing event: hurricane_irma_2017 (5 tweets) | Labels: ['caution_and_advice', 'infrastructure_and_utility_damage', 'injured_or_dead_people']
Processing event: hurricane_maria_2017 (11 tweets) | Labels: ['rescue_volunteering_or_donation_effort', 'injured_or_dead_people', 'sympathy_and_support', 'infrastructure_and_utility_damage']
Processing event: kerala_floods_2018 (5 tweets) | Labels: ['rescue_volunteering_or_donation_effort', 'injured_or_dead_people']
Done. Saved to ../gpt4o_mini_predictions.tsv.


In [3]:
from openai import OpenAI
import csv, pandas as pd
import os, time, random, json
from dotenv import load_dotenv

# --- API client: the key is read from the environment after loading ../.env ---
load_dotenv(dotenv_path="../.env")
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# --- HumAID test split ---
# Read the TSV without quote handling, drop the "other_relevant_information"
# class, then keep a fixed 40-row sample (seed 42) — FOR TESTING.
gold_table = (
    pd.read_csv("../data/humaid/joined/test.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    .loc[lambda table: table["class_label"] != "other_relevant_information"]
    .reset_index(drop=True)
    .sample(n=40, random_state=42)
    .reset_index(drop=True)
)

# --- Category name -> definition text used when building prompts ---
CATEGORY_DEFINITIONS = dict(
    caution_and_advice="Reports of warnings issued or lifted, guidance and tips related to the disaster.",
    sympathy_and_support="Tweets with prayers, thoughts, and emotional support.",
    requests_or_urgent_needs="Reports of urgent needs or supplies such as food, water, clothing, money, etc.",
    displaced_people_and_evacuations="People who have relocated due to the crisis, even for a short time.",
    injured_or_dead_people="Reports of injured or dead people due to the disaster.",
    missing_or_found_people="Reports of missing or found people due to the disaster.",
    infrastructure_and_utility_damage="Reports of any type of damage to infrastructure such as buildings, houses, roads, power lines, etc.",
    rescue_volunteering_or_donation_effort="Reports of any type of rescue, volunteering, or donation efforts.",
    not_humanitarian="If the tweet does not convey humanitarian aid-related information.",
)

# --- Dynamic prompt generator ---
def make_prompt(tweet_text, event_labels):
    """Build the JSON-mode classification prompt for one tweet.

    Args:
        tweet_text: The tweet to classify.
        event_labels: Label keys (snake_case) to offer; each must be a key of
            CATEGORY_DEFINITIONS.

    Returns:
        The prompt string: instruction, one "- Title Cased Name: definition"
        bullet per label, the expected JSON shape listing the raw label keys,
        and the tweet.
    """
    bullet_lines = []
    for label in event_labels:
        display_name = label.replace('_', ' ').title()
        bullet_lines.append(f"- {display_name}: {CATEGORY_DEFINITIONS[label]}")
    label_defs = "\n".join(bullet_lines)
    valid_list = ", ".join(event_labels)

    # One list entry per prompt line; joined with newlines at the end.
    prompt_lines = [
        "Classify the following tweet into ONE of the valid categories listed below.",
        "",
        "Category name: category definition",
        label_defs,
        "",
        "Return ONLY a valid label from this JSON format:",
        '{"category": "<one of [' + valid_list + ']>"}',
        "",
        f"Tweet: {tweet_text}",
    ]
    return "\n".join(prompt_lines)

# --- Classifier function ---
def classify_tweet(tweet, event_labels, max_retries=3):
    """Classify one tweet with gpt-4o-mini in JSON response mode.

    Args:
        tweet: Tweet text.
        event_labels: Allowed label keys; the reply must be one of these.
        max_retries: Maximum number of API attempts.

    Returns:
        The validated label (stripped, lower-cased) from the reply's
        "category" field, or the sentinel "ERROR" if every attempt failed
        (API error, unparsable JSON, or a label outside event_labels).
    """
    prompt = make_prompt(tweet, event_labels)
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
                temperature=0,
                max_tokens=20,
            )
            content = response.choices[0].message.content
            data = json.loads(content)
            label = data.get("category", "").strip().lower()
            # An out-of-set label is routed through the same retry path as
            # an API failure.
            if label not in event_labels:
                raise ValueError(f"Invalid label '{label}' returned")
            return label
        except Exception as e:
            # Last attempt: no retry follows, so do not announce one and do
            # not waste time sleeping before reporting failure.
            if attempt == max_retries - 1:
                print(f"Error: {e} — giving up after {max_retries} attempts")
                break
            # Exponential backoff with jitter before the next attempt.
            wait = 2 ** attempt + random.random()
            print(f"Error: {e} — retrying in {wait:.1f}s")
            time.sleep(wait)
    return "ERROR"

# --- Run predictions grouped by event ---
# groupby() iterates events in sorted key order, so predictions are produced in
# a different order than the rows of gold_table (which was shuffled by
# .sample()). `row_ids` records the gold_table index label for every entry of
# `labels` so predictions can be aligned back to the correct rows.
labels = []
row_ids = []
for event_name, event_df in gold_table.groupby("event"):
    # Candidate labels = normalized gold labels that occur in this event and
    # have a definition.
    event_labels = event_df["class_label"].str.replace(" ", "_").str.lower().unique().tolist()
    event_labels = [lbl for lbl in event_labels if lbl in CATEGORY_DEFINITIONS]

    print(f"Processing event: {event_name} ({len(event_df)} tweets) | Labels: {event_labels}")

    for i, row in event_df.iterrows():
        pred = classify_tweet(str(row["tweet_text"]), event_labels)
        if pred == "ERROR":
            raise RuntimeError(f"Error encountered at row {i}. Stopping run.")
        labels.append(pred)
        row_ids.append(i)
        if (len(labels)) % 50 == 0:
            print(f"Processed {len(labels)}/{len(gold_table)} tweets...")
            # Checkpoint: pair each prediction with the tweet it was made for
            # (select rows by recorded index, not by position).
            gold_table.loc[row_ids, ["tweet_text"]].assign(prediction=labels).to_csv(
                "../gpt4o_mini_predictions_partial2.tsv", index=False, sep="\t"
            )

# --- Save final predictions ---
# Assign through an indexed Series so pandas aligns each prediction with its
# row; a bare list would be assigned positionally and scramble the pairing.
# Rows never visited by groupby (e.g. missing event) are left as NaN.
gold_table["prediction"] = pd.Series(labels, index=row_ids, dtype="object")
gold_table.to_csv("../gpt4o_mini_predictions2.tsv", index=False, sep="\t")
print("Done. Saved to ../gpt4o_mini_predictions2.tsv.")


Processing event: california_wildfires_2018 (10 tweets) | Labels: ['missing_or_found_people', 'sympathy_and_support', 'not_humanitarian', 'rescue_volunteering_or_donation_effort']


Processing event: hurricane_dorian_2019 (8 tweets) | Labels: ['displaced_people_and_evacuations', 'not_humanitarian', 'sympathy_and_support', 'caution_and_advice', 'rescue_volunteering_or_donation_effort']
Processing event: hurricane_florence_2018 (1 tweets) | Labels: ['infrastructure_and_utility_damage']
Processing event: hurricane_irma_2017 (5 tweets) | Labels: ['caution_and_advice', 'infrastructure_and_utility_damage', 'injured_or_dead_people']
Processing event: hurricane_maria_2017 (11 tweets) | Labels: ['rescue_volunteering_or_donation_effort', 'injured_or_dead_people', 'sympathy_and_support', 'infrastructure_and_utility_damage']
Processing event: kerala_floods_2018 (5 tweets) | Labels: ['rescue_volunteering_or_donation_effort', 'injured_or_dead_people']
Done. Saved to ../gpt4o_mini_predictions2.tsv.


In [None]:
# Tool Calling
from openai import OpenAI
import csv, pandas as pd
import os, time, random, json
from dotenv import load_dotenv

# --- API client: the key is read from the environment after loading ../.env ---
load_dotenv(dotenv_path="../.env")
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# --- HumAID test split ---
# Read the TSV without quote handling, drop the "other_relevant_information"
# class, then keep a fixed 40-row sample (seed 42).
gold_table = (
    pd.read_csv("../data/humaid/joined/test.tsv", sep="\t", quoting=csv.QUOTE_NONE)
    .loc[lambda table: table["class_label"] != "other_relevant_information"]
    .reset_index(drop=True)
    .sample(n=40, random_state=42)
    .reset_index(drop=True)
)

# --- Category name -> definition text used when building prompts ---
CATEGORY_DEFINITIONS = dict(
    caution_and_advice="Reports of warnings issued or lifted, guidance and tips related to the disaster.",
    sympathy_and_support="Tweets with prayers, thoughts, and emotional support.",
    requests_or_urgent_needs="Reports of urgent needs or supplies such as food, water, clothing, money, etc.",
    displaced_people_and_evacuations="People who have relocated due to the crisis, even for a short time.",
    injured_or_dead_people="Reports of injured or dead people due to the disaster.",
    missing_or_found_people="Reports of missing or found people due to the disaster.",
    infrastructure_and_utility_damage="Reports of any type of damage to infrastructure such as buildings, houses, roads, power lines, etc.",
    rescue_volunteering_or_donation_effort="Reports of any type of rescue, volunteering, or donation efforts.",
    not_humanitarian="If the tweet does not convey humanitarian aid-related information.",
)

# --- Function schema for tool calling ---
def make_classification_tool(event_labels):
    """Build the function-calling schema for the `classify_tweet` tool.

    Args:
        event_labels: Allowed label strings; used as-is for the `enum` of the
            single required string parameter `label`.

    Returns:
        A dict with the tool's name, description and JSON-schema parameters.
    """
    label_property = {"type": "string", "enum": event_labels}
    parameters_schema = {
        "type": "object",
        "properties": {"label": label_property},
        "required": ["label"],
    }
    tool_spec = {
        "name": "classify_tweet",
        "description": "Selects the best-matching humanitarian category for a tweet.",
        "parameters": parameters_schema,
    }
    return tool_spec

# --- Classifier function using function calling ---
def classify_tweet(tweet, event_labels, max_retries=3):
    """Classify one tweet with gpt-4o-mini via a forced tool call.

    The model is required to call the `classify_tweet` tool, whose `label`
    argument is restricted by an enum to event_labels.

    Args:
        tweet: Tweet text.
        event_labels: Allowed label keys; each must be a key of
            CATEGORY_DEFINITIONS.
        max_retries: Maximum number of API attempts.

    Returns:
        The label (stripped, lower-cased) if it is in event_labels, otherwise
        the sentinel "ERROR" once all attempts are used up.
    """
    prompt = (
        "Read the category names and their definitions below, then classify the following tweet "
        "into one of the allowed categories.\n\n"
        + "\n".join([f"- {label}: {CATEGORY_DEFINITIONS[label]}" for label in event_labels])
        + f"\n\nTweet: {tweet}\nReturn one of: {event_labels}"
    )

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                tools=[{"type": "function", "function": make_classification_tool(event_labels)}],
                tool_choice={"type": "function", "function": {"name": "classify_tweet"}},
                temperature=0,
            )

            # The tool call's arguments arrive as a JSON string.
            tool_call = response.choices[0].message.tool_calls[0]
            args = json.loads(tool_call.function.arguments)
            label = args["label"].strip().lower()
            if label in event_labels:
                return label
            # Out-of-set label: fall through to the next attempt without backoff.
            print(f"Invalid label '{label}', retrying...")
        except Exception as e:
            # Last attempt: no retry follows, so do not announce one and do
            # not waste time sleeping before reporting failure.
            if attempt == max_retries - 1:
                print(f"Error: {e} — giving up after {max_retries} attempts")
                break
            # Exponential backoff with jitter before the next attempt.
            wait = 2 ** attempt + random.random()
            print(f"Error: {e} — retrying in {wait:.1f}s")
            time.sleep(wait)
    return "ERROR"

# --- Run predictions grouped by event ---
# groupby() iterates events in sorted key order, so predictions are produced in
# a different order than the rows of gold_table (which was shuffled by
# .sample()). `row_ids` records the gold_table index label for every entry of
# `labels` so predictions can be aligned back to the correct rows.
labels = []
row_ids = []
for event_name, event_df in gold_table.groupby("event"):
    # Candidate labels = normalized gold labels that occur in this event and
    # have a definition.
    event_labels = event_df["class_label"].str.replace(" ", "_").str.lower().unique().tolist()
    event_labels = [lbl for lbl in event_labels if lbl in CATEGORY_DEFINITIONS]

    print(f"Processing event: {event_name} ({len(event_df)} tweets) | Labels: {event_labels}")

    for i, row in event_df.iterrows():
        pred = classify_tweet(str(row["tweet_text"]), event_labels)
        if pred == "ERROR":
            raise RuntimeError(f"Error encountered at row {i}. Stopping run.")
        labels.append(pred)
        row_ids.append(i)
        if len(labels) % 50 == 0:
            print(f"Processed {len(labels)}/{len(gold_table)} tweets...")
            # Checkpoint: pair each prediction with the tweet it was made for
            # (select rows by recorded index, not by position).
            gold_table.loc[row_ids, ["tweet_text"]].assign(prediction=labels).to_csv(
                "../gpt4o_mini_predictions_partial2.tsv", index=False, sep="\t"
            )

# --- Save final predictions ---
# Assign through an indexed Series so pandas aligns each prediction with its
# row; a bare list would be assigned positionally and scramble the pairing.
# Rows never visited by groupby (e.g. missing event) are left as NaN.
gold_table["prediction"] = pd.Series(labels, index=row_ids, dtype="object")
gold_table.to_csv("../gpt4o_mini_predictions2.tsv", index=False, sep="\t")
print("Done. Saved to ../gpt4o_mini_predictions2.tsv.")
