In [8]:
# ----------------------------------------------------------------------------
# Imports: stdlib first, then 3rd-party
# ----------------------------------------------------------------------------
import re
import string
import unicodedata
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import geopandas as gpd
import pandas as pd
import spacy
from bertopic import BERTopic
from rapidfuzz import fuzz, process

# ----------------------------------------------------------------------------
# NLP model (load once at module-level)
# ----------------------------------------------------------------------------
_NLP = spacy.load("en_core_web_sm")


# ----------------------------------------------------------------------------
# Step 1: Load GeoJSON for IPC Geography
# ----------------------------------------------------------------------------
def load_geojson(file_path: str) -> gpd.GeoDataFrame:
    """
    Read a GeoJSON file from disk into a GeoDataFrame.

    Args:
        file_path: Location of the GeoJSON file.

    Returns:
        The file's contents as returned by ``geopandas.read_file``.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
        ValueError: If geopandas raises any exception while reading the file;
            the original exception is chained as the cause.
    """
    geojson = Path(file_path)
    if geojson.is_file():
        try:
            frame = gpd.read_file(geojson)
        except Exception as exc:
            # Re-raise under one predictable type, keeping the root cause.
            raise ValueError(f"Error loading GeoJSON: {exc}") from exc
        return frame
    raise FileNotFoundError(f"GeoJSON file not found: {file_path}")


# ----------------------------------------------------------------------------
# Step 2: Load Transcript Texts
# ----------------------------------------------------------------------------
def load_transcripts(transcripts_dir: str) -> pd.DataFrame:
    """
    Read transcript files from a directory and parse dates from filenames.

    Files are visited in sorted path order. The date is taken from the first
    ``DD-Mon-YYYY`` token in the filename (e.g. ``01-AUG-2024``); filenames
    without such a token, or whose token is not a real calendar date, get
    ``NaT`` instead of aborting the whole load.

    Args:
        transcripts_dir: Directory path containing .txt files.

    Returns:
        DataFrame with columns ['file', 'date', 'text'], one row per file.
        The columns are present even when no .txt files are found.
    """
    columns = ["file", "date", "text"]
    records: List[Dict[str, Any]] = []
    for txt_path in sorted(Path(transcripts_dir).glob("*.txt")):
        match = re.search(r"(\d{1,2}-[A-Za-z]{3}-\d{4})", txt_path.name)
        date = (
            # errors="coerce": a regex hit such as "99-XYZ-2024" becomes NaT
            # rather than raising and discarding every other transcript.
            pd.to_datetime(match.group(1), format="%d-%b-%Y", errors="coerce")
            if match else pd.NaT
        )
        text = txt_path.read_text(encoding="utf-8")
        records.append({"file": txt_path.name, "date": date, "text": text})
    # Explicit columns keep the documented schema for an empty directory.
    return pd.DataFrame(records, columns=columns)


# ----------------------------------------------------------------------------
# Step 3: Text Cleaning & Location Extraction
# ----------------------------------------------------------------------------
def clean_text(text: str) -> str:
    """
    Produce a comparison-friendly form of a string.

    The text is lowercased and NFD-decomposed, then every combining mark
    (Unicode category ``Mn``) and every character in ``string.punctuation``
    is dropped. Leading and trailing whitespace is stripped at the end.

    Args:
        text: Raw input.

    Returns:
        Cleaned string.
    """
    punctuation = set(string.punctuation)
    decomposed = unicodedata.normalize("NFD", text.lower())
    kept = [
        ch
        for ch in decomposed
        if unicodedata.category(ch) != "Mn" and ch not in punctuation
    ]
    return "".join(kept).strip()


def extract_locations(text: str) -> List[str]:
    """
    Collect the place-like named entities spaCy finds in a text.

    Args:
        text: Input transcript.

    Returns:
        The surface text of every entity labelled ``GPE`` or ``LOC``, in
        document order (duplicates are kept).
    """
    wanted_labels = ("GPE", "LOC")
    places: List[str] = []
    for entity in _NLP(text).ents:
        if entity.label_ in wanted_labels:
            places.append(entity.text)
    return places


def assign_geography(
    locations: List[str],
    geo_df: gpd.GeoDataFrame,
    area_col: str = "area",
    score_cutoff: int = 80,
) -> str:
    """
    Fuzzy-match locations to IPC areas.

    Every location is cleaned with ``clean_text`` and compared against the
    cleaned area names using ``fuzz.token_sort_ratio``. The area with the
    highest score across all locations wins; on ties the earliest location
    is kept.

    Args:
        locations: Extracted place names.
        geo_df: GeoDataFrame holding the candidate area names.
        area_col: Name of the column in ``geo_df`` that holds the area names.
        score_cutoff: Minimum similarity (0–100) for a match to count.

    Returns:
        Best-matched area (original spelling from ``geo_df``) or "Unknown"
        when there are no locations, no areas, or nothing reaches the cutoff.
    """
    area_list = geo_df[area_col].dropna().tolist()
    # extractOne returns None for empty choices; bail out before unpacking it.
    if not locations or not area_list:
        return "Unknown"

    clean_areas = [clean_text(a) for a in area_list]
    best_area, best_score = "Unknown", 0
    for loc in locations:
        loc_clean = clean_text(loc)
        if not loc_clean:
            # Nothing left after cleaning (e.g. punctuation only) - no signal.
            continue
        result = process.extractOne(
            query=loc_clean,
            choices=clean_areas,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=score_cutoff,
        )
        if result is None:
            # No candidate reached the cutoff for this location.
            continue
        # For a list of choices the third element is the index of the match,
        # which maps straight back to the uncleaned area name.
        _, score, idx = result
        if score > best_score:
            best_area, best_score = area_list[idx], score
    return best_area


# ----------------------------------------------------------------------------
# Step 4: Fit BERTopic Model & Get Probabilities
# ----------------------------------------------------------------------------
def fit_topic_model(
    docs: List[str],
    model_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[BERTopic, List[int], List[List[float]]]:
    """
    Fit BERTopic and return primary topics + probability matrix.

    Args:
        docs: Corpus as list of strings.
        model_kwargs: BERTopic init args. ``calculate_probabilities`` is
            always forced to ``True``, overriding any value given here,
            because the function returns the probability matrix. The passed
            dict is not modified.

    Returns:
        model, topic_ids, probabilities matrix (as nested lists).
    """
    # Merge into a copy: passing calculate_probabilities both inside
    # model_kwargs and as an explicit keyword would raise a TypeError.
    init_kwargs = dict(model_kwargs or {})
    init_kwargs["calculate_probabilities"] = True
    model = BERTopic(**init_kwargs)
    topics, probs = model.fit_transform(docs)
    return model, topics, probs.tolist()


def select_themes_per_doc(
    probabilities: List[List[float]],
    theme_map: Dict[int, str],
    prob_threshold: float = 0.1,
) -> List[List[str]]:
    """
    Turn per-document topic probabilities into lists of theme names.

    A column position in a probability row is treated as the topic id. A
    theme is selected when its probability is at least ``prob_threshold``
    and its topic id has an entry in ``theme_map``.

    Args:
        probabilities: N_docs × N_topics.
        theme_map: topic_id → theme name.
        prob_threshold: cutoff (inclusive).

    Returns:
        One list of theme names per document, ordered by topic id;
        ``["Other"]`` for documents where nothing qualifies.
    """
    per_doc: List[List[str]] = []
    for row in probabilities:
        picked: List[str] = []
        for topic_id, prob in enumerate(row):
            if prob >= prob_threshold and topic_id in theme_map:
                picked.append(theme_map[topic_id])
        per_doc.append(picked if picked else ["Other"])
    return per_doc


# ----------------------------------------------------------------------------
# Step 5: Build & Explode Theme-Location Pairs
# ----------------------------------------------------------------------------
def build_theme_location_pairs(
    df: pd.DataFrame,
    topic_ids: List[int],
    themes_list: List[List[str]],
    geo_df: gpd.GeoDataFrame,
) -> pd.DataFrame:
    """
    Expand transcripts into one row per (file, date, theme, geography).

    Each transcript gets a single geography, derived from the locations
    extracted from its text; that geography is repeated for every theme of
    the transcript. The input frame is not modified.

    Args:
        df: transcripts DataFrame; its 'file', 'date' and 'text' columns
            are used.
        topic_ids: primary topic per doc (not used by this function).
        themes_list: list of themes per doc, aligned with the rows of ``df``.
        geo_df: IPC boundaries.

    Returns:
        Exploded DataFrame with columns ['file','date','theme','geography'].
        Rows that come from the same transcript share its index label.
    """
    def _to_area(found_locations: List[str]) -> str:
        return assign_geography(found_locations, geo_df)

    working = df.copy()
    working["themes"] = themes_list
    working["locations"] = working["text"].apply(extract_locations)
    working["geography"] = working["locations"].apply(_to_area)

    # One output row per theme of each transcript.
    per_theme = working.explode("themes").rename(columns={"themes": "theme"})
    return per_theme[["file", "date", "theme", "geography"]]


# ----------------------------------------------------------------------------
# Main pipeline
# ----------------------------------------------------------------------------
def main(
    transcripts_dir: str,
    geojson_path: str,
    theme_map: Dict[int, str],
    model_kwargs: Optional[Dict[str, Any]] = None,
    prob_threshold: float = 0.1,
) -> pd.DataFrame:
    """
    Run end-to-end: load, topic model, multi-themes, then pair themes with locations.

    Args:
        transcripts_dir: Directory containing the transcript .txt files.
        geojson_path: Path to the GeoJSON file with the area boundaries.
        theme_map: topic_id → theme name.
        model_kwargs: Optional BERTopic init args.
        prob_threshold: Minimum topic probability for a theme to be kept.

    Returns:
        DataFrame with one row per theme & its matched geography.

    Raises:
        ValueError: If no transcripts are found in ``transcripts_dir``.
    """
    geo_df = load_geojson(geojson_path)
    transcripts_df = load_transcripts(transcripts_dir)
    # Fail fast with a clear message instead of an opaque error further down
    # when the directory is wrong or empty.
    if transcripts_df.empty:
        raise ValueError(f"No .txt transcripts found in: {transcripts_dir}")
    docs = transcripts_df["text"].tolist()

    _, topic_ids, probabilities = fit_topic_model(docs, model_kwargs)
    themes_list = select_themes_per_doc(probabilities, theme_map, prob_threshold)

    return build_theme_location_pairs(
        df=transcripts_df,
        topic_ids=topic_ids,
        themes_list=themes_list,
        geo_df=geo_df,
    )


if __name__ == "__main__":
    # Both inputs live in the same data folder, so the prefix is spelled once.
    _DATA_ROOT = (
        "/teamspace/studios/this_studio/"
        "somali-radios-with-ai-for-food-security/"
        "1_phase/data/"
    )
    TRANSCRIPTS_DIR = (
        _DATA_ROOT + "english_transcription_soundcloud_2024-07-01_to_2024-09-30"
    )
    GEOJSON_PATH = _DATA_ROOT + "Somalia-Somalia IPC Post GU 2024.json"

    # Topic id -> theme name; the position in the tuple is the topic id.
    THEME_MAP = dict(
        enumerate(
            (
                "Rainfall",
                "Crop Failure",
                "Livestock Health",
                "Higher Food Prices",
                "Humanitarian Aid",
            )
        )
    )
    MODEL_KWARGS = dict(min_topic_size=5, n_gram_range=(1, 2))
    PROB_THRESHOLD = 0.1

    df_pairs = main(
        transcripts_dir=TRANSCRIPTS_DIR,
        geojson_path=GEOJSON_PATH,
        theme_map=THEME_MAP,
        model_kwargs=MODEL_KWARGS,
        prob_threshold=PROB_THRESHOLD,
    )
    print(df_pairs.head())


                        file       date         theme geography
0  IDAACADDA 01-AUG-2024.txt 2024-08-01  Crop Failure   Unknown
1  IDAACADDA 01-JUL-2024.txt 2024-07-01  Crop Failure  Baydhaba
2  IDAACADDA 01-SEP-2024.txt 2024-09-01      Rainfall      Baki
3  IDAACADDA 02-AUG-2024.txt 2024-08-02      Rainfall   Garoowe
3  IDAACADDA 02-AUG-2024.txt 2024-08-02  Crop Failure   Garoowe


In [9]:
df_pairs

Unnamed: 0,file,date,theme,geography
0,IDAACADDA 01-AUG-2024.txt,2024-08-01,Crop Failure,Unknown
1,IDAACADDA 01-JUL-2024.txt,2024-07-01,Crop Failure,Baydhaba
2,IDAACADDA 01-SEP-2024.txt,2024-09-01,Rainfall,Baki
3,IDAACADDA 02-AUG-2024.txt,2024-08-02,Rainfall,Garoowe
3,IDAACADDA 02-AUG-2024.txt,2024-08-02,Crop Failure,Garoowe
...,...,...,...,...
74,IDAACADDA-19-SEP-2024.txt,2024-09-19,Crop Failure,Garoowe
75,IDAACADDA-26-SEP-2024.txt,2024-09-26,Rainfall,Garoowe
75,IDAACADDA-26-SEP-2024.txt,2024-09-26,Crop Failure,Garoowe
76,IDAACADDA-29-AUG-2024.txt,2024-08-29,Rainfall,Jilib
