In [5]:
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
import geopandas as gpd
import spacy
from bertopic import BERTopic

# ----------------------------------------------------------------------------
# Step 1: Load GeoJSON for IPC geography
# ----------------------------------------------------------------------------

def load_geojson(file_path: str) -> gpd.GeoDataFrame:
    """
    Load a GeoJSON file into a GeoDataFrame.

    Args:
        file_path (str): Path to the GeoJSON file.

    Returns:
        gpd.GeoDataFrame: The loaded geographic boundaries.

    Raises:
        FileNotFoundError: If the path does not point to an existing file.
        ValueError: If reading the file fails for any reason. The underlying
            error is attached as ``__cause__``.
    """
    path = Path(file_path)
    if not path.is_file():
        raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
    try:
        return gpd.read_file(path)
    except Exception as e:
        # Any failure while reading is reported as ValueError; chain the
        # underlying error so its traceback stays available for debugging.
        raise ValueError(f"Error loading GeoJSON: {e}") from e

# ----------------------------------------------------------------------------
# Step 2: Load Transcript Texts
# ----------------------------------------------------------------------------

def load_transcripts(transcripts_dir: str) -> pd.DataFrame:
    """
    Read transcripts from a directory and parse dates from filenames.

    Files are read in sorted path order. The date is taken from the first
    ``DD-MMM-YYYY`` token found in the filename (e.g. ``05-Jul-2024``).
    Filenames without such a token, or whose token is not a valid date,
    get ``NaT``.

    Args:
        transcripts_dir (str): Directory containing .txt transcript files.

    Returns:
        pd.DataFrame: Columns [file, date, text]. The columns are present
        even when the directory contains no .txt files.
    """
    columns = ["file", "date", "text"]
    records: List[Dict[str, Any]] = []
    for txt_path in sorted(Path(transcripts_dir).glob("*.txt")):
        # Parse date pattern DD-MMM-YYYY
        match = re.search(r"(\d{1,2}-[A-Za-z]{3}-\d{4})", txt_path.name)
        if match:
            # The regex accepts any three letters as a month, so coerce
            # unparseable tokens to NaT instead of aborting the whole load.
            date = pd.to_datetime(
                match.group(1), format="%d-%b-%Y", errors="coerce"
            )
        else:
            date = pd.NaT
        text = txt_path.read_text(encoding="utf-8")
        records.append({"file": txt_path.name, "date": date, "text": text})
    # Passing the columns explicitly keeps the schema stable for an empty
    # directory, so callers can always index df["text"].
    df = pd.DataFrame(records, columns=columns)
    # Ensure a datetime dtype even when there are no rows to infer it from.
    df["date"] = pd.to_datetime(df["date"])
    return df

# ----------------------------------------------------------------------------
# Step 3: Fit BERTopic Model
# ----------------------------------------------------------------------------

def fit_topic_model(
    docs: List[str],
    model_kwargs: Optional[Dict[str, Any]] = None
) -> Tuple[BERTopic, List[int]]:
    """
    Fit a BERTopic model to the documents.

    Args:
        docs (List[str]): Texts to analyze.
        model_kwargs (Dict[str, Any], optional): Keyword arguments forwarded
            to the BERTopic constructor. ``None`` (the default) or an empty
            dict means BERTopic is built with its own defaults.

    Returns:
        Tuple[BERTopic, List[int]]: The fitted model and topic IDs per doc.
    """
    model = BERTopic(**(model_kwargs or {}))
    # fit_transform returns a pair; only its first element (the per-document
    # topic IDs) is used here, the second is discarded.
    topics, _ = model.fit_transform(docs)
    return model, topics

# ----------------------------------------------------------------------------
# Step 4: Map Topics to Themes
# ----------------------------------------------------------------------------

def map_topics_to_themes(
    topics: List[int],
    theme_map: Dict[int, str]
) -> List[str]:
    """
    Translate topic IDs into theme names.

    Args:
        topics (List[int]): One topic ID per document.
        theme_map (Dict[int, str]): Lookup from topic ID to theme name.

    Returns:
        List[str]: One theme per input topic, in the same order. IDs that
        are not keys of ``theme_map`` are labelled "Other".
    """
    labels: List[str] = []
    for topic_id in topics:
        labels.append(theme_map.get(topic_id, "Other"))
    return labels

# ----------------------------------------------------------------------------
# Step 5: Extract Locations from Text
# ----------------------------------------------------------------------------

# Load the "en_core_web_sm" spaCy pipeline once, when this code is first run,
# so extract_locations() can reuse it on every call instead of reloading it.
_nlp = spacy.load("en_core_web_sm")

def extract_locations(text: str) -> List[str]:
    """
    Collect place-name entities found in a transcript.

    The text is run through the module-level spaCy pipeline and every entity
    labelled "GPE" or "LOC" is kept, in document order (duplicates included).

    Args:
        text (str): Transcript content.

    Returns:
        List[str]: Surface text of each GPE/LOC entity.
    """
    place_labels = ("GPE", "LOC")
    found: List[str] = []
    for entity in _nlp(text).ents:
        if entity.label_ in place_labels:
            found.append(entity.text)
    return found

# ----------------------------------------------------------------------------
# Step 6: Assign Geography based on IPC GeoDataFrame
# ----------------------------------------------------------------------------

def assign_geography(
    locations: List[str],
    geo_df: gpd.GeoDataFrame,
    area_col: str = "area"
) -> str:
    """
    Match extracted locations to IPC areas.

    Locations are checked in the order given. For each one, area labels are
    tried in the order they first appear in ``geo_df``; the first area whose
    lower-cased label occurs as a substring of the lower-cased location wins.

    Args:
        locations (List[str]): Place names from transcript.
        geo_df (gpd.GeoDataFrame): IPC boundaries with an area-label column.
        area_col (str): Column name for area labels.

    Returns:
        str: The matched area label in title case, or 'Unknown' when no
        location contains any area label.

    Raises:
        KeyError: If ``area_col`` is not a column of ``geo_df``.
    """
    # Drop missing labels (they would make the substring test raise
    # TypeError) and normalise the rest to stripped, lower-cased strings.
    labels = geo_df[area_col].dropna().astype(str).str.strip().str.lower()
    # Skip empty labels: "" is a substring of every string and would match
    # any location. unique() keeps first-appearance order and avoids
    # re-checking duplicate labels.
    area_list = [area for area in labels.unique() if area]
    for loc in locations:
        low_loc = loc.lower()
        for area in area_list:
            if area in low_loc:
                return area.title()
    return "Unknown"

# ----------------------------------------------------------------------------
# Step 7: Build Labeled Dataset
# ----------------------------------------------------------------------------

def build_labeled_dataset(
    df: pd.DataFrame,
    topics: List[int],
    themes: List[str],
    geo_df: gpd.GeoDataFrame,
    area_col: str = "area"
) -> pd.DataFrame:
    """
    Assemble final table with file, date, topic, theme, and geography.

    The input frame is copied, so the caller's DataFrame is not modified.
    Locations are extracted from each row's ``text`` and matched against
    ``geo_df`` to derive the ``geography`` column; the intermediate
    ``locations`` column is not included in the result.

    Args:
        df (pd.DataFrame): Output of load_transcripts().
        topics (List[int]): Numeric topics per document.
        themes (List[str]): Theme labels per document.
        geo_df (gpd.GeoDataFrame): IPC geographic boundaries.
        area_col (str): Column of ``geo_df`` holding the area labels,
            forwarded to assign_geography(). Defaults to "area".

    Returns:
        pd.DataFrame: Columns [file, date, topic_id, theme, geography].
    """
    df = df.copy()
    df["topic_id"] = topics
    df["theme"] = themes
    df["locations"] = df["text"].apply(extract_locations)
    df["geography"] = df["locations"].apply(
        lambda locs: assign_geography(locs, geo_df, area_col)
    )
    return df[["file", "date", "topic_id", "theme", "geography"]]



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/traitlets/config/application.

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [None]:
# Paths
# Directory whose .txt files are read by load_transcripts().
transcripts_dir = "/teamspace/studios/this_studio/somali-radios-with-ai-for-food-security/1_phase/data/transcription_soundcloud_2024-07-01_to_2024-09-30"
# NOTE(review): "/path/to/..." looks like a placeholder — point this at the
# real IPC GeoJSON file, otherwise load_geojson() raises FileNotFoundError.
geojson_path = "/path/to/Somalia-Somalia IPC Post GU 2024.json"


In [None]:
# Load data: the IPC boundaries first, then every .txt transcript found in
# transcripts_dir (one row per file with its parsed date and full text).
geo_df = load_geojson(geojson_path)
transcripts_df = load_transcripts(transcripts_dir)

In [None]:
# Fit topic model
# NOTE(review): theme_map is written before the model is fitted, so it assumes
# topic IDs 0-4 will correspond to these themes — confirm against the topics
# the fitted model actually produces. Any ID missing from the map is labelled
# "Other" by map_topics_to_themes().
theme_map = {0: "Rainfall", 1: "Crop Failure", 2: "Livestock Health", 3: "Prices", 4: "Humanitarian Aid"}
model, topics = fit_topic_model(
    transcripts_df["text"].tolist(),
    model_kwargs={"min_topic_size": 5, "n_gram_range": (1, 2)}
)

In [None]:
# Map to themes and build final dataset
# themes is aligned with topics (one label per transcript); the final table
# has the columns file, date, topic_id, theme and geography.
themes = map_topics_to_themes(topics, theme_map)
labeled_df = build_labeled_dataset(transcripts_df, topics, themes, geo_df)

In [None]:
# Save results
# The CSV is written relative to the current working directory, without the
# DataFrame index column.
labeled_df.to_csv("labeled_transcripts.csv", index=False)
print("Saved labeled_transcripts.csv")