
# Combine `train/predictions.csv` Across Subfolders

This utility does exactly the following, in a **simple** way:

- For each subfolder inside a chosen *base folder* (e.g., "anh's folder"),
- Open `./train/gpt-4o-mini/<run folder>/predictions.csv` in that subfolder (the first run folder found is used),
- Add an `event` column based on the subfolder name,
- Combine all rows together,
- Rename the `predicted_label` column to `label` (unless a `label` column already exists),
- Save the result as a single TSV file.

> Edit the `BASE_FOLDER` and `OUTPUT_TSV` variables below and run the cell.


In [6]:

from pathlib import Path
import pandas as pd

# === USER SETTINGS ===
# Base folder containing one subfolder per event. Each event subfolder is
# expected to hold ./train/gpt-4o-mini/<run folder>/predictions.csv.
# The path is built from components so it resolves on any OS; a raw
# backslash string would be a single (non-existent) name outside Windows.
BASE_FOLDER = Path("..") / "data" / "humaid" / "anh_4o"  # <- change me
OUTPUT_TSV = Path("./all_events.tsv")  # where to save the combined TSV

def get_first_subfolder(path_obj):
    """
    Return the first subfolder of ``path_obj``, in sorted path order.

    ``Path.iterdir()`` yields entries in arbitrary order, so the candidates
    are compared explicitly; the same folder is then chosen on every run
    even when several subfolders exist.

    Args:
        path_obj: Directory to search (a ``pathlib.Path`` or a path string).

    Returns:
        The subfolder that sorts first, or ``None`` if ``path_obj`` is not an
        existing directory or contains no subfolders.
    """
    path_obj = Path(path_obj)
    # A missing (or non-directory) path simply has no subfolders; report that
    # with None instead of letting iterdir() raise.
    if not path_obj.is_dir():
        return None
    return min((item for item in path_obj.iterdir() if item.is_dir()), default=None)

def combine_predictions(base_folder: Path, output_tsv: Path, model_dir: str = "gpt-4o-mini") -> "pd.DataFrame | None":
    """
    Combine per-event ``predictions.csv`` files into a single TSV file.

    For every subfolder ``<event>`` of ``base_folder`` this reads
    ``<event>/train/<model_dir>/<run>/predictions.csv``, where ``<run>`` is
    the folder returned by ``get_first_subfolder`` for
    ``<event>/train/<model_dir>``. Each loaded frame gets an ``event`` column
    holding the event subfolder's name, and its ``predicted_label`` column is
    renamed to ``label`` unless a ``label`` column is already present.

    Events with a missing run folder, a missing file, or an unreadable file
    are reported with a printed message and skipped.

    Args:
        base_folder: Folder containing one subfolder per event.
        output_tsv: Destination of the combined tab-separated file.
        model_dir: Name of the folder under ``train`` that holds the run
            folders. Defaults to ``"gpt-4o-mini"``.

    Returns:
        The combined DataFrame, or ``None`` when nothing could be loaded
        (in which case no file is written).
    """
    base_folder = Path(base_folder)
    # Accept plain strings as well; .resolve() below needs a Path.
    output_tsv = Path(output_tsv)

    if not base_folder.is_dir():
        print(f"Base folder not found: {base_folder}")
        return None

    subfolders = sorted(p for p in base_folder.iterdir() if p.is_dir())
    if not subfolders:
        print(f"No subfolders found in: {base_folder}")
        return None

    all_rows = []
    for sub in subfolders:
        run_root = sub / "train" / model_dir
        # Check the directory first so an incomplete event is skipped rather
        # than aborting the whole run inside the helper or on `None / str`.
        run_dir = get_first_subfolder(run_root) if run_root.is_dir() else None
        if run_dir is None:
            print(f"Skipping (no run folder): {run_root}")
            continue

        pred_csv = run_dir / "predictions.csv"
        if not pred_csv.exists():
            print(f"Skipping (missing file): {pred_csv}")
            continue

        try:
            df = pd.read_csv(pred_csv)
        except Exception as e:
            # Deliberately broad: one unreadable file should not stop the
            # remaining events from being combined. The failure is reported.
            print(f"Failed to read {pred_csv}: {e}")
            continue

        # Add event column from the subfolder name
        df["event"] = sub.name

        # Rename 'predicted_label' -> 'label' unless 'label' already exists
        if "predicted_label" in df.columns and "label" not in df.columns:
            df = df.rename(columns={"predicted_label": "label"})

        all_rows.append(df)
        print(f"Loaded: {pred_csv}  (rows: {len(df)})")

    if not all_rows:
        print("No prediction files were loaded. Nothing to combine.")
        return None

    combined = pd.concat(all_rows, ignore_index=True)

    # A frame that already had 'label' keeps its 'predicted_label' column, so
    # both can appear after concatenation; keep 'label' and drop the other.
    if "predicted_label" in combined.columns and "label" in combined.columns:
        combined = combined.drop(columns=["predicted_label"])

    # Make sure the destination folder exists before writing.
    output_tsv.parent.mkdir(parents=True, exist_ok=True)
    combined.to_csv(output_tsv, sep="\t", index=False)
    print(f"Saved combined TSV to: {output_tsv.resolve()}")
    return combined

# Run the utility
import os

# BASE_FOLDER is relative, so show the directory it is resolved against.
print(os.getcwd())

combined_df = combine_predictions(base_folder=BASE_FOLDER, output_tsv=OUTPUT_TSV)

# combine_predictions returns None when nothing was loaded; preview otherwise.
if combined_df is not None:
    print(combined_df.head())


c:\Users\gd3470\Desktop\ssl\utils
Loaded: ..\data\humaid\anh_4o\california_wildfires_2018\train\gpt-4o-mini\20251021-222139-modeS-RULES1-ALT\predictions.csv  (rows: 5163)
Loaded: ..\data\humaid\anh_4o\canada_wildfires_2016\train\gpt-4o-mini\20251021-202956-modeS-RULES1-TIER1\predictions.csv  (rows: 1569)
Loaded: ..\data\humaid\anh_4o\cyclone_idai_2019\train\gpt-4o-mini\20251021-205521-modeS-RULES1-TIER1\predictions.csv  (rows: 2753)
Loaded: ..\data\humaid\anh_4o\hurricane_dorian_2019\train\gpt-4o-mini\20251021-224703-modeS-RULES1-ALT\predictions.csv  (rows: 5329)
Loaded: ..\data\humaid\anh_4o\hurricane_florence_2018\train\gpt-4o-mini\20251021-212048-modeS-RULES1-ALT\predictions.csv  (rows: 4384)
Loaded: ..\data\humaid\anh_4o\hurricane_harvey_2017\train\gpt-4o-mini\20251022-001305-modeS-RULES1-ALT\predictions.csv  (rows: 6378)
Loaded: ..\data\humaid\anh_4o\hurricane_irma_2017\train\gpt-4o-mini\20251022-004334-modeS-RULES1-ALT\predictions.csv  (rows: 6579)
Loaded: ..\data\humaid\anh_4o\h