In [4]:
"""
SYNTHETIC DATA GENERATION FOR QUECHUA MORPHOLOGY PARSER
=======================================================

This notebook generates synthetic morphological segmentation data using GPT models.
It consists of two main parts:

PART 1 (Cell 0): Data Analysis and Gold Standard Creation
- Analyzes corpus coverage and word rarity
- Creates a "gold standard" dataset of high-quality segmentations
- Filters words based on corpus frequency and root consistency

PART 2 (Cell 1+): Synthetic Data Generation
- Uses GPT-4o and GPT-5-mini to generate morphological segmentations
- Applies few-shot learning with gold standard examples
- Saves results to appropriately named files in the data folder

The synthetic data is used to augment training data for the morphology parser.
"""

import pandas as pd
import os
import regex as re
from collections import Counter
import ast

# These imports are needed for the norm_unicode function
import unicodedata
from ftfy import fix_text

# =========================
# DATA FOLDER CONFIGURATION
# =========================
# All data files should be read from and saved to the data folder
DATA_FOLDER = "data"

# =========================
# CONFIGURATION
# =========================
# --- Input File Paths (read from data folder) ---
CORPUS_FILE = os.path.join(DATA_FOLDER, "qu_merged_dump.txt")
GOLD_DF_FILE = os.path.join(DATA_FOLDER, "Sue_kalt.parquet")  # Gold standard dataset (formerly called combined_df)
CLEANED_DF_FILE = os.path.join(DATA_FOLDER, "cleaned_data_df.csv")

# --- Output File Names (save to data folder) ---
GOLD_OUTPUT_FILE = os.path.join(DATA_FOLDER, "gold_df_common_words.csv")
CLEANED_OUTPUT_FILE = os.path.join(DATA_FOLDER, "cleaned_data_df_common_words.csv")
# Gold standard dataset: high-quality examples for few-shot learning
COMMON_WORDS_OUTPUT_FILE = os.path.join(DATA_FOLDER, "word_analysis_gold.csv") 

# --- Analysis Parameters ---
RARE_WORD_RANK_THRESHOLD = 100000
LOWERCASE = True
KEEP_APOSTROPHES = True  # apostrophes will be considered invalid for grapheme filtering (so tokens with them are dropped)

# =========================
# GRAPHEMES
# =========================
graphemes = [
    "ch","ll","rr","tr","kw","ph",  # digraphs/trigraphs
    "a","b","d","e","f","g","h","i","k","l","m","n","ñ","o","p","q",
    "r","s","t","u","v","w","x","y"
]

# Precompute sorted-by-length graphemes for greedy matching (longest first)
GRAPHEMES_BY_LEN = sorted(graphemes, key=len, reverse=True)
SINGLE_CHARS = {g for g in graphemes if len(g) == 1}

# =========================
# HELPER FUNCTIONS
# =========================

# This utility function was part of your original zipf.py context and is needed here
CTRL_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
def norm_unicode(x, form="NFC"):
    if pd.isna(x):
        return x
    s = x.decode("utf-8", "replace") if isinstance(x, (bytes, bytearray)) else str(x)
    s = fix_text(s)
    s = CTRL_RE.sub('', s)
    s = unicodedata.normalize(form, s)
    s = s.replace('\u00A0', ' ')
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def tokenize_graphemes(word: str):
    """
    Greedy longest-match tokenizer over the allowed graphemes.
    Returns a list of graphemes if fully tokenized, else None.
    """
    if not isinstance(word, str):
        return None
    w = word.strip()
    if LOWERCASE:
        w = w.lower()

    # If we allow apostrophes in tokenization stage, we still reject them for grapheme validity.
    # So any apostrophe in the surface string => invalid grapheme word.
    if "'" in w or "’" in w:
        return None

    i = 0
    toks = []
    n = len(w)
    while i < n:
        matched = False
        # Try longest graphemes first (trigraph/digraph)
        for g in GRAPHEMES_BY_LEN:
            L = len(g)
            if i + L <= n and w[i:i+L] == g:
                toks.append(g)
                i += L
                matched = True
                break
        if not matched:
            # No grapheme matched: invalid word
            return None
    return toks

def is_valid_grapheme_word(word: str) -> bool:
    """A word is valid iff it can be fully segmented into the allowed graphemes."""
    toks = tokenize_graphemes(word)
    return toks is not None

def first_four_graphemes_root(word: str) -> str:
    """
    Compute the corpus root as the concatenation of the first 4 graphemes.
    If fewer than 4 graphemes, use whatever is available (empty if none).
    Returns '' if the word is not a valid grapheme word.
    """
    toks = tokenize_graphemes(word)
    if toks is None or len(toks) == 0:
        return ''
    root = ''.join(toks[:4])
    return root

def safe_first_segment(row, prefer_list_col="Morph_split", fallback_str_col="Morph_split_str"):
    """
    For gold_df / cleaned_df: return the first segment (root) robustly.
    - If Morph_split is a list, use its first element.
    - If Morph_split is a string representation of a list, try literal_eval.
    - Else, fall back to splitting Morph_split_str (if present) on whitespace.
    Returns '' if none available.
    """
    # Try list column
    if prefer_list_col in row:
        val = row[prefer_list_col]
        # Already a list
        if isinstance(val, list) and len(val) > 0:
            return str(val[0]).strip()
        # String representation of a list
        if isinstance(val, str):
            s = val.strip()
            # Try to parse as list
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list) and len(parsed) > 0:
                    return str(parsed[0]).strip()
            except Exception:
                pass

    # Fallback to a plain string split column
    if fallback_str_col in row:
        s = row[fallback_str_col]
        if isinstance(s, str) and s.strip():
            return s.strip().split()[0]

    return ''

def process_corpus(file_path):
    """
    Reads a large text corpus, tokenizes it, and calculates word frequencies and ranks.
    **Updated**: Only counts tokens that are fully valid Quechua grapheme words.
    """
    print(f"Processing corpus file: {file_path}...")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Corpus file not found at: {file_path}")

    # Base tokenization (word-like). Apostrophes may be included in the token,
    # but we will drop any token that contains them since they're not allowed graphemes.
    TOKEN_RE = re.compile(r"[^\W\d_]+(?:['’][^\W\d_]+)?", flags=re.UNICODE) if KEEP_APOSTROPHES \
                else re.compile(r"[^\W\d_]+", flags=re.UNICODE)

    def iter_valid_tokens_from_file(path):
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                if LOWERCASE:
                    line = line.lower()
                for m in TOKEN_RE.finditer(line):
                    tok = m.group(0)
                    # Keep only tokens that fully segment into allowed graphemes
                    if is_valid_grapheme_word(tok):
                        yield tok

    freq = Counter(iter_valid_tokens_from_file(file_path))

    if not freq:
        print("Warning: Corpus processing resulted in zero tokens after grapheme filtering.")
        return {}, {}
        
    print(f"Corpus processed. Total unique valid grapheme-words (vocabulary size): {len(freq):,}")

    sorted_words = [word for word, count in freq.most_common()]
    rank_map = {word: i + 1 for i, word in enumerate(sorted_words)}
    
    return dict(freq), rank_map

# =========================
# MAIN ANALYSIS
# =========================

if __name__ == "__main__":
    # Step 1: Process the corpus to get word frequencies and ranks (valid grapheme-words only)
    corpus_freq, corpus_rank = process_corpus(CORPUS_FILE)

    # Step 2: Load the full, prepped DataFrames
    gold_df = pd.read_parquet(GOLD_DF_FILE)
    gold_df['Word'] = gold_df['word']
    gold_df['morph'] = gold_df['morph'].str.replace('-', ' ')
    gold_df['Morph_split_str'] = gold_df['morph']
    gold_df['Morph_split'] = gold_df['morph'].str.split(' ')
    gold_df = gold_df[['Word', 'Morph_split', 'Morph_split_str']]

    cleaned_df = pd.read_csv(CLEANED_DF_FILE, encoding='windows-1252')

    # Extract sets of unique words for analysis (exact surface forms)
    gold_words = set(gold_df['Word'].dropna().unique())
    cleaned_words = set(cleaned_df['Word'].dropna().unique())
    corpus_words = set(corpus_freq.keys())  # Already filtered to allowed grapheme-words

    print("\n" + "="*50)
    print("ANALYSIS RESULTS")
    print("="*50 + "\n")

    # Step 3: Corpus Coverage Analysis (surface word coverage, unchanged)
    print("--- 1. Corpus Coverage Analysis (surface forms) ---")
    gold_in_corpus = gold_words.intersection(corpus_words)
    coverage_percentage = (len(gold_in_corpus) / len(gold_words)) * 100 if gold_words else 0
    print(f"[{GOLD_DF_FILE}]: Found {len(gold_in_corpus):,} / {len(gold_words):,} words in corpus ({coverage_percentage:.2f}% coverage).\n")

    cleaned_in_corpus = cleaned_words.intersection(corpus_words)
    coverage_percentage = (len(cleaned_in_corpus) / len(cleaned_words)) * 100 if cleaned_words else 0
    print(f"[{CLEANED_DF_FILE}]: Found {len(cleaned_in_corpus):,} / {len(cleaned_words):,} words in corpus ({coverage_percentage:.2f}% coverage).\n")

    # Step 4: Dataset Incongruity Analysis (surface forms, unchanged for backward compatibility)
    print("--- 2. Dataset Incongruity Analysis (surface forms) ---")
    words_in_common = gold_words.intersection(cleaned_words)
    words_only_in_gold = gold_words.difference(cleaned_words)
    words_only_in_cleaned = cleaned_words.difference(gold_words)
    common_and_in_corpus = words_in_common.intersection(corpus_words)

    print(f"Words common to BOTH datasets: {len(words_in_common):,}")
    print(f"Words in the corpus AND common to both datasets: {len(common_and_in_corpus):,}")

    print(f"Words ONLY in '{GOLD_DF_FILE}': {len(words_only_in_gold):,}")
    print(f"Words ONLY in '{CLEANED_DF_FILE}': {len(words_only_in_cleaned):,}\n")

    # =========================
    # NEW: ROOT-LEVEL ANALYSIS
    # =========================
    print("--- 2b. Root-Level Analysis (as requested) ---")

    # Corpus roots: first 4 graphemes
    corpus_roots = set()
    for w in corpus_words:
        r = first_four_graphemes_root(w)
        if r:
            corpus_roots.add(r)

    # Gold_df roots: first segment of segmentation
    gold_df = gold_df.copy()
    gold_df['Root'] = gold_df.apply(lambda row: safe_first_segment(row, "Morph_split", "Morph_split_str"), axis=1)
    gold_roots = set([r for r in gold_df['Root'].dropna().map(str).map(str.strip) if r])

    # Cleaned_df roots: same logic (handle list/str robustly)
    cleaned_df = cleaned_df.copy()
    # Ensure helpful fallback column for cleaned_df if not present
    if 'Morph_split_str' not in cleaned_df.columns:
        # try to form from Morph_split if it exists (string or list)
        if 'Morph_split' in cleaned_df.columns:
            def to_str_split(val):
                if isinstance(val, list):
                    return ' '.join(map(str, val))
                if isinstance(val, str):
                    try:
                        parsed = ast.literal_eval(val)
                        if isinstance(parsed, list):
                            return ' '.join(map(str, parsed))
                    except Exception:
                        return val
                return ''
            cleaned_df['Morph_split_str'] = cleaned_df['Morph_split'].apply(to_str_split)
        else:
            cleaned_df['Morph_split_str'] = ''

    cleaned_df['Root'] = cleaned_df.apply(lambda row: safe_first_segment(row, "Morph_split", "Morph_split_str"), axis=1)
    cleaned_roots = set([r for r in cleaned_df['Root'].dropna().map(str).map(str.strip) if r])

    # Report unique root counts
    print(f"Unique roots in CORPUS (first 4 graphemes): {len(corpus_roots):,}")
    print(f"Unique roots in {GOLD_DF_FILE} (first segment): {len(gold_roots):,}")
    print(f"Unique roots in {CLEANED_DF_FILE} (first segment): {len(cleaned_roots):,}")

    # Overlaps on roots
    roots_gold_cleaned = gold_roots.intersection(cleaned_roots)
    roots_gold_corpus  = gold_roots.intersection(corpus_roots)
    roots_cleaned_corpus   = cleaned_roots.intersection(corpus_roots)
    roots_all_three        = gold_roots.intersection(cleaned_roots).intersection(corpus_roots)

    print(f"Overlapping roots (gold ∩ cleaned): {len(roots_gold_cleaned):,}")
    print(f"Overlapping roots (gold ∩ corpus): {len(roots_gold_corpus):,}")
    print(f"Overlapping roots (cleaned ∩ corpus): {len(roots_cleaned_corpus):,}")
    print(f"Overlapping roots (gold ∩ cleaned ∩ corpus): {len(roots_all_three):,}\n")

    # Step 5: Rarity Analysis (unchanged; still on surface forms)
    print(f"--- 3. Rarity Analysis (Threshold: Top {RARE_WORD_RANK_THRESHOLD:,} words) ---")
    rare_words_in_gold = {word for word in gold_words if corpus_rank.get(word, float('inf')) > RARE_WORD_RANK_THRESHOLD}
    print(f"[{GOLD_DF_FILE}]: {len(rare_words_in_gold):,} words are 'rare' (rank > {RARE_WORD_RANK_THRESHOLD:,}).")
    
    rare_words_in_cleaned = {word for word in cleaned_words if corpus_rank.get(word, float('inf')) > RARE_WORD_RANK_THRESHOLD}
    print(f"[{CLEANED_DF_FILE}]: {len(rare_words_in_cleaned):,} words are 'rare' (rank > {RARE_WORD_RANK_THRESHOLD:,}).\n")
    
    # Step 6: Coverage of Non-Rare Words (unchanged; still on surface forms)
    print("--- 4. Coverage of Non-Rare Words ---")
    common_gold = gold_words - rare_words_in_gold
    common_cleaned = cleaned_words - rare_words_in_cleaned
    
    common_gold_in_corpus = common_gold.intersection(corpus_words)
    coverage_perc = (len(common_gold_in_corpus) / len(common_gold)) * 100 if common_gold else 0
    print(f"[{GOLD_DF_FILE}]: Of its {len(common_gold):,} non-rare words, {len(common_gold_in_corpus):,} ({coverage_perc:.2f}%) are in the corpus.")

    common_cleaned_in_corpus = common_cleaned.intersection(corpus_words)
    coverage_perc = (len(common_cleaned_in_corpus) / len(common_cleaned)) * 100 if common_cleaned else 0
    print(f"[{CLEANED_DF_FILE}]: Of its {len(common_cleaned):,} non-rare words, {len(common_cleaned_in_corpus):,} ({coverage_perc:.2f}%) are in the corpus.\n")

    # Step 7: Removing Rare Words and Saving New CSVs (unchanged)
    print("--- 5. Removing Rare Words and Saving New CSVs ---")

    # Filter the gold_df DataFrame
    if not gold_df.empty:
        filtered_gold_df = gold_df[~gold_df['Word'].isin(rare_words_in_gold)]
        filtered_gold_df.to_csv(GOLD_OUTPUT_FILE, index=False, encoding='utf-8')
        print(f"Removed {len(rare_words_in_gold)} rare words from '{GOLD_DF_FILE}'.")
        print(f"-> Saved {len(filtered_gold_df)} rows to '{GOLD_OUTPUT_FILE}'\n")

    # Filter the cleaned_data_df DataFrame
    if not cleaned_df.empty:
        filtered_cleaned_df = cleaned_df[~cleaned_df['Word'].isin(rare_words_in_cleaned)]
        filtered_cleaned_df.to_csv(CLEANED_OUTPUT_FILE, index=False, encoding='utf-8')
        print(f"Removed {len(rare_words_in_cleaned)} rare words from '{CLEANED_DF_FILE}'.")
        print(f"-> Saved {len(filtered_cleaned_df)} rows to '{CLEANED_OUTPUT_FILE}'\n")

        # =========================
    # 2c. WORD-LEVEL GOLD (common words with common roots) -> Word + Segmentation only
    # =========================
    print("--- 2c. Word-Level GOLD (common words with common roots) ---")

    # Helper to turn any Morph_split / Morph_split_str / morph into a clean space-separated string
    def _seg_str_from_row(row):
        # Prefer Morph_split if it's a list or list-like string
        if 'Morph_split' in row:
            ms = row['Morph_split']
            if isinstance(ms, list):
                s = ' '.join(map(str, ms)).strip()
                if s: return s
            if isinstance(ms, str):
                s = ms.strip()
                # try to parse list literal
                try:
                    parsed = ast.literal_eval(s)
                    if isinstance(parsed, list) and parsed:
                        s2 = ' '.join(map(str, parsed)).strip()
                        if s2: return s2
                except Exception:
                    # assume it's already space-separated pieces
                    if s: return s
        # Fallback to Morph_split_str
        if 'Morph_split_str' in row and isinstance(row['Morph_split_str'], str):
            s = row['Morph_split_str'].strip()
            if s: return s
        # Last resort: 'morph' (replace '-' with space)
        if 'morph' in row and isinstance(row['morph'], str):
            s = row['morph'].replace('-', ' ').strip()
            if s: return s
        return ''

    # Build fast lookups for roots and segmentations by Word from both labeled datasets
    # (take the first non-empty per Word)
    def _first_nonempty_map(df, value_col):
        tmp = (
            df[['Word', value_col]]
            .copy()
            .dropna(subset=['Word'])
        )
        tmp['Word'] = tmp['Word'].astype(str).str.strip()
        tmp[value_col] = tmp[value_col].astype(str).str.strip()
        tmp = tmp[tmp['Word'] != '']
        tmp = tmp[tmp[value_col] != '']
        return tmp.drop_duplicates(subset=['Word']).set_index('Word')[value_col].to_dict()

    # Ensure we have a Root column in both frames (already computed above)
    gold_root_map = _first_nonempty_map(gold_df.rename(columns={'Root':'__Root'}), '__Root')
    cleaned_root_map  = _first_nonempty_map(cleaned_df .rename(columns={'Root':'__Root'}), '__Root')

    # Build segmentation maps (string) from both frames
    gold_seg_series = []
    if not gold_df.empty:
        _gdf = gold_df.copy()
        _gdf['__Seg'] = _gdf.apply(_seg_str_from_row, axis=1)
        gold_seg_map = _first_nonempty_map(_gdf, '__Seg')
    else:
        gold_seg_map = {}

    cleaned_seg_series = []
    if not cleaned_df.empty:
        _cldf = cleaned_df.copy()
        _cldf['__Seg'] = _cldf.apply(_seg_str_from_row, axis=1)
        cleaned_seg_map = _first_nonempty_map(_cldf, '__Seg')
    else:
        cleaned_seg_map = {}

    # Words present across all three datasets (surface overlap)
    words_all_three = gold_words.intersection(cleaned_words).intersection(corpus_words)
    print(f"Surface-overlap across all three datasets: {len(words_all_three):,} words")

    rows = []
    kept = 0
    for w in words_all_three:
        # Roots from each source
        c_root = first_four_graphemes_root(w) or ''
        r_gold = gold_root_map.get(w, '')
        r_clean = cleaned_root_map.get(w, '')

        # Keep only if non-empty, equal, and in the triple-overlap root set
        if c_root and r_gold and r_clean and (c_root == r_gold == r_clean) and (c_root in roots_all_three):
            # Prefer cleaned segmentation; fallback to gold
            seg = cleaned_seg_map.get(w, '') or gold_seg_map.get(w, '')
            if seg:
                rows.append({'Word': w, 'Morph_split': seg})
                kept += 1

    word_level_gold_df = pd.DataFrame(rows).sort_values('Word')
    word_level_gold_df.to_csv(COMMON_WORDS_OUTPUT_FILE, index=False, encoding='utf-8')
    print(f"-> Saved {kept:,} rows to '{COMMON_WORDS_OUTPUT_FILE}' (columns: Word, Morph_split)\n")

Processing corpus file: data\qu_merged_dump.txt...
Corpus processed. Total unique valid grapheme-words (vocabulary size): 155,616

ANALYSIS RESULTS

--- 1. Corpus Coverage Analysis (surface forms) ---
[data\Sue_kalt.parquet]: Found 2,105 / 6,896 words in corpus (30.52% coverage).

[data\cleaned_data_df.csv]: Found 392 / 913 words in corpus (42.94% coverage).

--- 2. Dataset Incongruity Analysis (surface forms) ---
Words common to BOTH datasets: 89
Words in the corpus AND common to both datasets: 85
Words ONLY in 'data\Sue_kalt.parquet': 6,807
Words ONLY in 'data\cleaned_data_df.csv': 824

--- 2b. Root-Level Analysis (as requested) ---
Unique roots in CORPUS (first 4 graphemes): 14,282
Unique roots in data\Sue_kalt.parquet (first segment): 1,357
Unique roots in data\cleaned_data_df.csv (first segment): 376
Overlapping roots (gold ∩ cleaned): 125
Overlapping roots (gold ∩ corpus): 408
Overlapping roots (cleaned ∩ corpus): 157
Overlapping roots (gold ∩ cleaned ∩ corpus): 78

--- 3. Rarity

In [14]:
"""
PART 2: SYNTHETIC DATA GENERATION USING GPT MODELS
==================================================

This cell generates synthetic morphological segmentations using GPT-4o and GPT-5-mini.
It uses few-shot learning with gold standard examples to guide the models.

The process:
1. Loads gold standard examples and identifies words needing segmentation
2. For each model (gpt4o and gpt5mini):
   - Constructs few-shot prompts with examples
   - Calls the API to get segmentations
   - Handles rate limits and errors gracefully
3. Saves results to separate files for each model in the data folder

IMPORTANT: Different models may require different API parameters
---------------------------------------------------------------
If you encounter parameter errors (400 Bad Request), you may need to adjust the
get_model_params() function. Some models (especially newer ones) may not support
all standard parameters like top_p, frequency_penalty, or presence_penalty.

To fix parameter errors:
1. Check the error message - it will indicate which parameter is invalid
2. Edit the get_model_params() function to remove unsupported parameters for that model
3. For GPT-5 models, you may need to use only: max_tokens and temperature
4. Consult OpenAI API documentation for the specific model you're using
"""

import pandas as pd
import os
import regex as re
from collections import Counter
import time
import random
from tqdm import tqdm
from dotenv import load_dotenv

# Note: You must install the 'openai' library for this script to work.
# Run: pip install openai
import time, random
from openai import OpenAI
from openai import RateLimitError, APIError, APITimeoutError, APIConnectionError
import ast

# =========================
# DATA FOLDER CONFIGURATION
# =========================
DATA_FOLDER = "data"

# =========================
# CONFIGURATION
# =========================
# --- Input File Paths (read from data folder) ---
CORPUS_FILE = os.path.join(DATA_FOLDER, "qu_merged_dump.txt")
GOLD_DF_FILE = os.path.join(DATA_FOLDER, "Sue_kalt.parquet")  # Gold standard dataset (formerly called combined_df)
CLEANED_DF_FILE = os.path.join(DATA_FOLDER, "cleaned_data_df.csv")
GOLD_DATA_FILE = os.path.join(DATA_FOLDER, "word_analysis_gold.csv")  # High-quality examples for few-shot learning

# --- Output File Names (save to data folder) ---
# Files will be named based on the model used
OUTPUT_FILE_GPT4O = os.path.join(DATA_FOLDER, "gpt4o_synthetic_segmentations.csv")
OUTPUT_FILE_GPT5MINI = os.path.join(DATA_FOLDER, "gpt5mini_synthetic_segmentations.csv")

# --- API & Analysis Parameters ---
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Models to use for synthetic data generation
# Note: Model names must match OpenAI API model identifiers
# Common model names:
#   - "gpt-4o" (standard GPT-4o)
#   - "gpt-4o-mini" (smaller, faster version)
#   - "gpt-5-mini" or "gpt-5mini" (if available in your API)
# Check OpenAI API documentation for current model names and their supported parameters
# Different models may require different API parameters - see get_model_params() function
MODELS_TO_PROCESS = ["gpt-4o", "gpt-5-mini"]  # Process both models
# Alternative: Use "gpt-4o-mini" if "gpt-5-mini" is not available:
# MODELS_TO_PROCESS = ["gpt-4o", "gpt-4o-mini"]

# --- Few-Shot Learning Parameters ---
NUM_FEW_SHOT_EXAMPLES = 37  # How many examples to show the model in each prompt
WORDS_TO_PROCESS_LIMIT = 5  # Set a limit to avoid high API costs during testing. Set to None to process all.

# =========================
# QUECHUA GRAPHEMES + HELPERS
# =========================
# Quechua uses a specific set of graphemes (letters and letter combinations)
# These are used to validate words and extract roots for filtering
graphemes = [
    "ch","ll","rr","tr","kw","ph",  # digraphs/trigraphs (multi-character graphemes)
    "a","b","d","e","f","g","h","i","k","l","m","n","ñ","o","p","q",
    "r","s","t","u","v","w","x","y"
]
# Sort by length (longest first) for greedy matching
GRAPHEMES_BY_LEN = sorted(graphemes, key=len, reverse=True)

def tokenize_graphemes(word: str):
    """
    Greedy longest-match tokenizer over the allowed graphemes.
    Returns a list of graphemes if fully tokenized, else None.
    """
    if not isinstance(word, str):
        return None
    w = word.strip().lower()
    if not w: 
        return None
    # Apostrophes are not in the inventory -> reject
    if "'" in w or "’" in w:
        return None
    i = 0
    toks = []
    n = len(w)
    while i < n:
        matched = False
        for g in GRAPHEMES_BY_LEN:
            L = len(g)
            if i + L <= n and w[i:i+L] == g:
                toks.append(g)
                i += L
                matched = True
                break
        if not matched:
            return None
    return toks

def first_four_graphemes_root(word: str) -> str:
    """
    Root for corpus words: concatenation of the first 4 graphemes.
    Returns '' if not tokenizable.
    """
    toks = tokenize_graphemes(word)
    if not toks:
        return ''
    return ''.join(toks[:4])

def robust_first_segment(row, prefer_list_col="Morph_split", fallback_str_col="Morph_split_str", alt_morph_col="morph"):
    """
    For gold_df / cleaned_df: extract the 'root' as the first segment.
    - If Morph_split is a list -> take [0]
    - If Morph_split is a string representation of a list -> literal_eval then [0]
    - Else try splitting Morph_split_str (space)
    - Else if 'morph' is present, replace hyphens with spaces and take first token
    Returns '' if not found.
    """
    # Try Morph_split as actual list
    if prefer_list_col in row:
        val = row[prefer_list_col]
        if isinstance(val, list) and val:
            return str(val[0]).strip()
        if isinstance(val, str):
            s = val.strip()
            # Attempt to parse as list
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list) and parsed:
                    return str(parsed[0]).strip()
            except Exception:
                # treat as plain string with spaces
                if s:
                    return s.split()[0].strip()

    # Try Morph_split_str
    if fallback_str_col in row:
        s = row[fallback_str_col]
        if isinstance(s, str) and s.strip():
            return s.strip().split()[0]

    # Try 'morph' column (hyphen-separated)
    if alt_morph_col in row:
        m = row[alt_morph_col]
        if isinstance(m, str) and m.strip():
            return m.replace('-', ' ').strip().split()[0]

    return ''

# =========================
# HELPER FUNCTIONS (I/O + filtering)
# =========================
def load_all_data():
    """
    Loads all necessary data files and identifies words that need segmentation.
    
    This function:
    1. Loads the gold standard dataset (high-quality examples for few-shot learning)
    2. Loads existing segmented datasets (gold_df and cleaned_df)
    3. Processes the corpus to find words not yet segmented
    4. Filters words to only those whose roots are common across all three datasets
    
    Root definitions:
      - Corpus root = first 4 graphemes of the word
      - Gold/cleaned root = first segment from morphological segmentation
    
    This filtering ensures we only segment words that are likely to be valid
    and consistent with existing data.
    
    Returns:
        gold_df: DataFrame with gold standard examples
        words_to_segment: List of words that need segmentation
    """
    print("--- Step 1: Loading all data files ---")

    # ---- GOLD (few-shot) ----
    if not os.path.exists(GOLD_DATA_FILE):
        raise FileNotFoundError(f"Gold data file not found: '{GOLD_DATA_FILE}'. Please run the previous script first.")
    gold_df = pd.read_csv(GOLD_DATA_FILE)

    # Ensure Morph_split_str exists and is usable
    if 'Morph_split_str' not in gold_df.columns:
        gold_df['Morph_split_str'] = ''
    def _mk_str(val):
        if isinstance(val, list):
            return ' '.join(map(str, val))
        if isinstance(val, str):
            s = val.strip()
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list):
                    return ' '.join(map(str, parsed))
            except Exception:
                # Already a plain string of splits
                return s
        return ''
    if 'Morph_split' in gold_df.columns:
        gold_df['Morph_split_str'] = gold_df['Morph_split'].apply(_mk_str)
    print(f"Loaded {len(gold_df):,} 'gold' examples for few-shot learning.")

    # ---- EXISTING SEGMENTED DATASETS ----
    # Read full files (not just 'Word') so we can derive roots robustly
    combined_df = pd.read_parquet(COMBINED_DF_FILE)
    combined_df['Word'] = combined_df['word']
    combined_df['morph'] = combined_df['morph'].str.replace('-', ' ')
    combined_df['Morph_split_str'] = combined_df['morph']
    combined_df['Morph_split'] = combined_df['morph'].str.split(' ')
    combined_df = combined_df[['Word', 'Morph_split', 'Morph_split_str']]
    cleaned_df  = pd.read_csv(CLEANED_DF_FILE,  encoding='windows-1252')

    # Normalize helpful columns if missing
    if 'Morph_split_str' not in combined_df.columns and 'Morph_split' in combined_df.columns:
        def _to_str_split(val):
            if isinstance(val, list):
                return ' '.join(map(str, val))
            if isinstance(val, str):
                try:
                    parsed = ast.literal_eval(val)
                    if isinstance(parsed, list):
                        return ' '.join(map(str, parsed))
                except Exception:
                    return val
            return ''
        combined_df['Morph_split_str'] = combined_df['Morph_split'].apply(_to_str_split) if 'Morph_split' in combined_df.columns else ''

    if 'Morph_split_str' not in cleaned_df.columns and 'Morph_split' in cleaned_df.columns:
        def _to_str_split2(val):
            if isinstance(val, list):
                return ' '.join(map(str, val))
            if isinstance(val, str):
                try:
                    parsed = ast.literal_eval(val)
                    if isinstance(parsed, list):
                        return ' '.join(map(str, parsed))
                except Exception:
                    return val
            return ''
        cleaned_df['Morph_split_str'] = cleaned_df['Morph_split'].apply(_to_str_split2) if 'Morph_split' in cleaned_df.columns else ''

    # Build sets of existing words
    existing_words = set(combined_df['Word'].dropna()) | set(cleaned_df['Word'].dropna())
    print(f"Found {len(existing_words):,} unique words across existing datasets.")

    # ---- CORPUS WORDS (unique) ----
    print("Reading full corpus to find target words...")
    if not os.path.exists(CORPUS_FILE):
        raise FileNotFoundError(f"Corpus file not found: {CORPUS_FILE}")
    TOKEN_RE = re.compile(r"[^\W\d_]+(?:['’][^\W\d_]+)?", flags=re.UNICODE)
    with open(CORPUS_FILE, "r", encoding="utf-8", errors="ignore") as f:
        corpus_text = f.read().lower()
    corpus_words_all = set(TOKEN_RE.findall(corpus_text))
    print(f"Found {len(corpus_words_all):,} unique words in the corpus.")

    # -----------------------------
    # ROOTS FOR ALL THREE DATASETS
    # -----------------------------
    # Corpus roots via first 4 graphemes (skip words that don't tokenize)
    corpus_roots = set()
    for w in corpus_words_all:
        r = first_four_graphemes_root(w)
        if r:
            corpus_roots.add(r)

    # Combined_df roots via first segment
    combined_roots = set()
    if not combined_df.empty:
        combined_df = combined_df.copy()
        combined_df['__root__'] = combined_df.apply(
            lambda row: robust_first_segment(row, "Morph_split", "Morph_split_str", "morph"), axis=1
        )
        combined_roots = set([r for r in combined_df['__root__'].dropna().map(str).map(str.strip) if r])

    # Cleaned_df roots via first segment
    cleaned_roots = set()
    if not cleaned_df.empty:
        cleaned_df = cleaned_df.copy()
        cleaned_df['__root__'] = cleaned_df.apply(
            lambda row: robust_first_segment(row, "Morph_split", "Morph_split_str", "morph"), axis=1
        )
        cleaned_roots = set([r for r in cleaned_df['__root__'].dropna().map(str).map(str.strip) if r])

    # Intersection of roots present in ALL THREE
    common_roots_all_three = corpus_roots.intersection(gold_roots).intersection(cleaned_roots)
    print(f"Roots common to all three datasets: {len(common_roots_all_three):,}")

    # --------------------------------------------
    # Determine corpus words needing segmentation
    # --------------------------------------------
    # Only words not already in existing datasets...
    candidate_words = sorted(list(corpus_words_all - existing_words))
    print(f"-> Initially identified {len(candidate_words):,} new corpus words (not in existing datasets).")

    # ...and whose corpus-root (first 4 graphemes) is in the intersection across all three datasets
    words_to_segment = []
    for w in candidate_words:
        root = first_four_graphemes_root(w)
        if root and root in common_roots_all_three:
            words_to_segment.append(w)

    print(f"-> Filtered to {len(words_to_segment):,} words whose roots are common to all three datasets.\n")

    return gold_df, words_to_segment

def construct_few_shot_prompt(target_word, gold_df, num_examples):
    """
    Creates a detailed prompt for the API with few-shot examples.
    """
    examples = gold_df.sample(n=min(num_examples, len(gold_df)), random_state=random.randint(0, 10_000))

    system_message = (
        "You are an expert in Quechua linguistics. Your task is to segment a given Quechua word into its constituent morphemes. "
        "The morphemes should be separated by spaces. Please provide only the segmented output, with no additional explanation or commentary."
    )

    messages = [{"role": "system", "content": system_message}]
    for _, row in examples.iterrows():
        # fallbacks in case Morph_split_str wasn't constructed above for some row
        s = row.get('Morph_split_str', '')
        if not isinstance(s, str) or not s.strip():
            s = ''
            if 'Morph_split' in row and isinstance(row['Morph_split'], str):
                try:
                    parsed = ast.literal_eval(row['Morph_split'])
                    if isinstance(parsed, list):
                        s = ' '.join(map(str, parsed))
                except Exception:
                    s = row['Morph_split']
        messages.append({"role": "user", "content": str(row['Word'])})
        messages.append({"role": "assistant", "content": s})

    messages.append({"role": "user", "content": target_word})
    return messages

def get_model_params(model_name):
    """
    Returns model-specific API parameters.
    Different models support different parameters, so we need to customize them.
    
    Args:
        model_name: Name of the model (e.g., "gpt-4o", "gpt-5-mini", "gpt-4o-mini")
    
    Returns:
        Dictionary of parameters to use for the API call (excluding 'model' and 'messages' which are set separately)
    """
    
    # Model-specific parameter configurations
    if "gpt-5" in model_name.lower() or "gpt5" in model_name.lower():
        # GPT-5 models may support reasoning_effort and verbosity
        # Note: Adjust these based on actual API documentation
        # Some GPT-5 models may not support all standard parameters
        return {
            # GPT-5 specific parameters
            "reasoning_effort": "minimal",  # Options: "minimal", "low", "medium", "high"
            "verbosity": "low",  # Options: "low", "medium", "high"
        }
    elif "gpt-4o-mini" in model_name.lower() or "gpt-4o-mini" in model_name:
        # GPT-4o-mini uses standard parameters (same as GPT-4o)
        return {
            "max_tokens": 50,
            "temperature": 0,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        }
    else:
        # Default for GPT-4o and other standard models
        return {
            "max_tokens": 50,
            "temperature": 0,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
        }

def get_llm_segmentation(prompt_messages, model_name, retries=3, delay=5):
    """
    Calls the LLM API to get the word segmentation, with robust rate-limit handling.
    Retries on 429s (RateLimitError) and transient network/timeouts with exponential backoff + jitter.
    
    This function automatically adjusts API parameters based on the model being used.
    Different models (GPT-4o, GPT-5-mini, etc.) may require different parameters.
    
    Args:
        prompt_messages: List of message dictionaries for the API
        model_name: Name of the model to use (e.g., "gpt-4o", "gpt-5-mini", "gpt-4o-mini")
        retries: Number of retry attempts
        delay: Base delay in seconds for exponential backoff
    
    Returns:
        Segmented word string, or "[API_FAILED]" if all retries fail
    """
    def _retry_after_seconds(err, fallback):
        # Respect Retry-After header if the server provides it
        try:
            resp = getattr(err, "response", None)
            if resp and getattr(resp, "headers", None):
                ra = resp.headers.get("retry-after") or resp.headers.get("Retry-After")
                if ra:
                    return float(ra)
        except Exception:
            pass
        return fallback

    # Get model-specific parameters
    api_params = get_model_params(model_name)
    api_params["model"] = model_name
    api_params["messages"] = prompt_messages

    for attempt in range(retries):
        try:
            # Create API call with model-specific parameters
            response = client.chat.completions.create(**api_params)
            return (response.choices[0].message.content or "").strip()

        except RateLimitError as e:
            # 429 rate limit -> exponential backoff + jitter, honoring Retry-After if present
            base = delay * (2 ** attempt)
            wait = _retry_after_seconds(e, base) + random.uniform(0, 0.5)
            print(f"  [RateLimit] Hit 429. Waiting {wait:.2f}s before retry {attempt+1}/{retries}...")
            time.sleep(wait)

        except (APITimeoutError, APIConnectionError) as e:
            # Transient connectivity/timeouts
            wait = delay * (2 ** attempt) + random.uniform(0, 0.5)
            print(f"  [Transient] {type(e).__name__}: {e}. Waiting {wait:.2f}s (retry {attempt+1}/{retries})...")
            time.sleep(wait)

        except APIError as e:
            # Some APIError instances may also be 429; treat similarly
            status = getattr(e, "status_code", None)
            if status == 429:
                base = delay * (2 ** attempt)
                wait = _retry_after_seconds(e, base) + random.uniform(0, 0.5)
                print(f"  [API 429] Waiting {wait:.2f}s before retry {attempt+1}/{retries}...")
                time.sleep(wait)
            elif status == 400:
                # 400 Bad Request often means invalid parameters
                error_msg = str(e)
                if "parameter" in error_msg.lower() or "invalid" in error_msg.lower():
                    print(f"  [API Parameter Error] Model '{model_name}' may not support some parameters.")
                    print(f"  Error: {error_msg}")
                    print(f"  Suggestion: Check get_model_params() function and adjust parameters for this model.")
                    print(f"  You may need to remove top_p, frequency_penalty, or presence_penalty for this model.")
                else:
                    print(f"  [API Error 400]: {e}")
                # Don't retry on parameter errors - they won't be fixed by retrying
                break
            else:
                # Non-retryable API errors: surface and stop
                print(f"  [API Error] {status}: {e}")
                break

        except Exception as e:
            # Unknown error: log and stop retrying
            print(f"  [Unhandled Error]: {e}")
            break

    return "[API_FAILED]"

# =========================
# MAIN EXECUTION
# =========================
# Process synthetic data generation for both GPT-4o and GPT-5-mini
if __name__ == "__main__":
    if not os.environ.get("OPENAI_API_KEY"):
        print("FATAL ERROR: The 'OPENAI_API_KEY' environment variable is not set.")
        print("Please set it before running the script.")
    else:
        # Step 1: Load data and find words to process (filtered by common roots)
        print("="*70)
        print("SYNTHETIC DATA GENERATION FOR QUECHUA MORPHOLOGY")
        print("="*70)
        gold_df, words_to_segment = load_all_data()

        # Apply limit if one is set
        if WORDS_TO_PROCESS_LIMIT is not None:
            print(f"\n--- Applying processing limit: selecting {WORDS_TO_PROCESS_LIMIT} words randomly. ---")
            if len(words_to_segment) > WORDS_TO_PROCESS_LIMIT:
                words_to_segment = random.sample(words_to_segment, WORDS_TO_PROCESS_LIMIT)
            else:
                print("Limit is larger than the number of available words. Processing all.")

        # Step 2: Process words using each model
        # Map model names to output file paths
        model_output_map = {
            "gpt-4o": OUTPUT_FILE_GPT4O,
            "gpt-5-mini": OUTPUT_FILE_GPT5MINI
        }

        for model_name in MODELS_TO_PROCESS:
            if model_name not in model_output_map:
                print(f"Warning: Unknown model '{model_name}', skipping...")
                continue
                
            output_file = model_output_map[model_name]
            print(f"\n{'='*70}")
            print(f"Processing {len(words_to_segment):,} words using '{model_name}'")
            print(f"{'='*70}")
            
            results = []
            for word in tqdm(words_to_segment, desc=f"Segmenting with {model_name}"):
                prompt = construct_few_shot_prompt(word, gold_df, NUM_FEW_SHOT_EXAMPLES)
                segmented_word = get_llm_segmentation(prompt, model_name)
                results.append({
                    'Original_Word': word,
                    'Segmented_Morphemes': segmented_word,
                    'Source': f'LLM_FewShot_{model_name}',
                    'Model': model_name
                })

            # Step 3: Save results to a CSV file (filter out API failures)
            print(f"\n--- Saving results for {model_name} ---")
            results_df = pd.DataFrame(results)
            # Filter out API failures
            results_df = results_df[results_df['Segmented_Morphemes'] != '[API_FAILED]']
            results_df.to_csv(output_file, index=False, encoding='utf-8')
            print(f"✅ Successfully processed {len(results_df)} words with {model_name}.")
            print(f"   Results saved to '{output_file}'")
            print(f"   Failed API calls: {len(results) - len(results_df)}")
        
        print(f"\n{'='*70}")
        print("SYNTHETIC DATA GENERATION COMPLETE")
        print(f"{'='*70}")
        print(f"Generated segmentations for {len(words_to_segment):,} words using {len(MODELS_TO_PROCESS)} models.")
        print(f"Output files saved to {DATA_FOLDER}/")


SYNTHETIC DATA GENERATION FOR QUECHUA MORPHOLOGY
--- Step 1: Loading all data files ---
Loaded 37 'gold' examples for few-shot learning.
Found 7,720 unique words across existing datasets.
Reading full corpus to find target words...
Found 208,684 unique words in the corpus.
Roots common to all three datasets: 78
-> Initially identified 206,081 new corpus words (not in existing datasets).
-> Filtered to 24,343 words whose roots are common to all three datasets.


--- Applying processing limit: selecting 5 words randomly. ---

Processing 5 words using 'gpt-4o'


Segmenting with gpt-4o: 100%|██████████| 5/5 [00:05<00:00,  1.06s/it]



--- Saving results for gpt-4o ---
✅ Successfully processed 5 words with gpt-4o.
   Results saved to 'data\gpt4o_synthetic_segmentations.csv'
   Failed API calls: 0

Processing 5 words using 'gpt-5-mini'


Segmenting with gpt-5-mini: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]


--- Saving results for gpt-5-mini ---
✅ Successfully processed 5 words with gpt-5-mini.
   Results saved to 'data\gpt5mini_synthetic_segmentations.csv'
   Failed API calls: 0

SYNTHETIC DATA GENERATION COMPLETE
Generated segmentations for 5 words using 2 models.
Output files saved to data/





In [15]:
"""
TEST/ALTERNATIVE VERSION: PROMPT GENERATION FOR MANUAL TESTING
================================================================

This cell is an alternative approach for testing prompt generation.
It creates prompts that can be manually tested or used for batch processing.
This is useful for debugging or when you want to see the prompts before sending to API.

Note: This cell is for testing purposes. Use Cell 1 for actual synthetic data generation.
"""

import pandas as pd
import os
import regex as re
from collections import Counter
import time
import random
from tqdm import tqdm
from dotenv import load_dotenv

# Note: You must install the 'openai' library for this script to work.
# Run: pip install openai
import time, random
from openai import OpenAI
from openai import RateLimitError, APIError, APITimeoutError, APIConnectionError
import ast

# =========================
# DATA FOLDER CONFIGURATION
# =========================
DATA_FOLDER = "data"

# =========================
# CONFIGURATION
# =========================
# --- Input File Paths (read from data folder) ---
CORPUS_FILE = os.path.join(DATA_FOLDER, "qu_merged_dump.txt")
GOLD_DF_FILE = os.path.join(DATA_FOLDER, "Sue_kalt.parquet")  # Gold standard dataset (formerly called combined_df)
CLEANED_DF_FILE = os.path.join(DATA_FOLDER, "cleaned_data_df.csv")
GOLD_DATA_FILE = os.path.join(DATA_FOLDER, "word_analysis_gold.csv")  # High-quality examples for few-shot learning

# --- Few-Shot Learning Parameters ---
NUM_FEW_SHOT_EXAMPLES = 37  # How many examples to show the model in each prompt
WORDS_TO_PROCESS_LIMIT = 10  # Set a limit to avoid high API costs during testing. Set to None to process all.

# =========================
# QUECHUA GRAPHEMES + HELPERS
# =========================
graphemes = [
    "ch","ll","rr","tr","kw","ph",  # digraphs/trigraphs
    "a","b","d","e","f","g","h","i","k","l","m","n","ñ","o","p","q",
    "r","s","t","u","v","w","x","y"
]
GRAPHEMES_BY_LEN = sorted(graphemes, key=len, reverse=True)

def tokenize_graphemes(word: str):
    """
    Greedy longest-match tokenizer over the allowed graphemes.
    Returns a list of graphemes if fully tokenized, else None.
    """
    if not isinstance(word, str):
        return None
    w = word.strip().lower()
    if not w: 
        return None
    # Apostrophes are not in the inventory -> reject
    if "'" in w or "’" in w:
        return None
    i = 0
    toks = []
    n = len(w)
    while i < n:
        matched = False
        for g in GRAPHEMES_BY_LEN:
            L = len(g)
            if i + L <= n and w[i:i+L] == g:
                toks.append(g)
                i += L
                matched = True
                break
        if not matched:
            return None
    return toks

def first_four_graphemes_root(word: str) -> str:
    """
    Root for corpus words: concatenation of the first 4 graphemes.
    Returns '' if not tokenizable.
    """
    toks = tokenize_graphemes(word)
    if not toks:
        return ''
    return ''.join(toks[:4])

def robust_first_segment(row, prefer_list_col="Morph_split", fallback_str_col="Morph_split_str", alt_morph_col="morph"):
    """
    For gold_df / cleaned_df: extract the 'root' as the first segment.
    - If Morph_split is a list -> take [0]
    - If Morph_split is a string representation of a list -> literal_eval then [0]
    - Else try splitting Morph_split_str (space)
    - Else if 'morph' is present, replace hyphens with spaces and take first token
    Returns '' if not found.
    """
    # Try Morph_split as actual list
    if prefer_list_col in row:
        val = row[prefer_list_col]
        if isinstance(val, list) and val:
            return str(val[0]).strip()
        if isinstance(val, str):
            s = val.strip()
            # Attempt to parse as list
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list) and parsed:
                    return str(parsed[0]).strip()
            except Exception:
                # treat as plain string with spaces
                if s:
                    return s.split()[0].strip()

    # Try Morph_split_str
    if fallback_str_col in row:
        s = row[fallback_str_col]
        if isinstance(s, str) and s.strip():
            return s.strip().split()[0]

    # Try 'morph' column (hyphen-separated)
    if alt_morph_col in row:
        m = row[alt_morph_col]
        if isinstance(m, str) and m.strip():
            return m.replace('-', ' ').strip().split()[0]

    return ''

# =========================
# HELPER FUNCTIONS (I/O + filtering)
# =========================
def load_all_data():
    """
    Loads all necessary data files and identifies words that need segmentation.
    NOW: Filters words_to_segment to only those whose corpus roots are COMMON
    to all three datasets (corpus, gold_df, cleaned_df), where:
      - corpus root = first 4 graphemes
      - gold/cleaned root = first segment
    """
    print("--- Step 1: Loading all data files ---")

    # ---- GOLD (few-shot) ----
    if not os.path.exists(GOLD_DATA_FILE):
        raise FileNotFoundError(f"Gold data file not found: '{GOLD_DATA_FILE}'. Please run the previous script first.")
    gold_df = pd.read_csv(GOLD_DATA_FILE)

    # Ensure Morph_split_str exists and is usable
    if 'Morph_split_str' not in gold_df.columns:
        gold_df['Morph_split_str'] = ''
    def _mk_str(val):
        if isinstance(val, list):
            return ' '.join(map(str, val))
        if isinstance(val, str):
            s = val.strip()
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list):
                    return ' '.join(map(str, parsed))
            except Exception:
                # Already a plain string of splits
                return s
        return ''
    if 'Morph_split' in gold_df.columns:
        gold_df['Morph_split_str'] = gold_df['Morph_split'].apply(_mk_str)
    print(f"Loaded {len(gold_df):,} 'gold' examples for few-shot learning.")

    # ---- EXISTING SEGMENTED DATASETS ----
    # Read full files (not just 'Word') so we can derive roots robustly
    combined_df = pd.read_parquet(COMBINED_DF_FILE)
    combined_df['Word'] = combined_df['word']
    combined_df['morph'] = combined_df['morph'].str.replace('-', ' ')
    combined_df['Morph_split_str'] = combined_df['morph']
    combined_df['Morph_split'] = combined_df['morph'].str.split(' ')
    combined_df = combined_df[['Word', 'Morph_split', 'Morph_split_str']]
    cleaned_df  = pd.read_csv(CLEANED_DF_FILE,  encoding='windows-1252')

    # Normalize helpful columns if missing
    if 'Morph_split_str' not in combined_df.columns and 'Morph_split' in combined_df.columns:
        def _to_str_split(val):
            if isinstance(val, list):
                return ' '.join(map(str, val))
            if isinstance(val, str):
                try:
                    parsed = ast.literal_eval(val)
                    if isinstance(parsed, list):
                        return ' '.join(map(str, parsed))
                except Exception:
                    return val
            return ''
        combined_df['Morph_split_str'] = combined_df['Morph_split'].apply(_to_str_split) if 'Morph_split' in combined_df.columns else ''

    if 'Morph_split_str' not in cleaned_df.columns and 'Morph_split' in cleaned_df.columns:
        def _to_str_split2(val):
            if isinstance(val, list):
                return ' '.join(map(str, val))
            if isinstance(val, str):
                try:
                    parsed = ast.literal_eval(val)
                    if isinstance(parsed, list):
                        return ' '.join(map(str, parsed))
                except Exception:
                    return val
            return ''
        cleaned_df['Morph_split_str'] = cleaned_df['Morph_split'].apply(_to_str_split2) if 'Morph_split' in cleaned_df.columns else ''

    # Build sets of existing words
    existing_words = set(combined_df['Word'].dropna()) | set(cleaned_df['Word'].dropna())
    print(f"Found {len(existing_words):,} unique words across existing datasets.")

    # ---- CORPUS WORDS (unique) ----
    print("Reading full corpus to find target words...")
    if not os.path.exists(CORPUS_FILE):
        raise FileNotFoundError(f"Corpus file not found: {CORPUS_FILE}")
    TOKEN_RE = re.compile(r"[^\W\d_]+(?:['’][^\W\d_]+)?", flags=re.UNICODE)
    with open(CORPUS_FILE, "r", encoding="utf-8", errors="ignore") as f:
        corpus_text = f.read().lower()
    corpus_words_all = set(TOKEN_RE.findall(corpus_text))
    print(f"Found {len(corpus_words_all):,} unique words in the corpus.")

    # -----------------------------
    # ROOTS FOR ALL THREE DATASETS
    # -----------------------------
    # Corpus roots via first 4 graphemes (skip words that don't tokenize)
    corpus_roots = set()
    for w in corpus_words_all:
        r = first_four_graphemes_root(w)
        if r:
            corpus_roots.add(r)

    # Combined_df roots via first segment
    combined_roots = set()
    if not combined_df.empty:
        combined_df = combined_df.copy()
        combined_df['__root__'] = combined_df.apply(
            lambda row: robust_first_segment(row, "Morph_split", "Morph_split_str", "morph"), axis=1
        )
        combined_roots = set([r for r in combined_df['__root__'].dropna().map(str).map(str.strip) if r])

    # Cleaned_df roots via first segment
    cleaned_roots = set()
    if not cleaned_df.empty:
        cleaned_df = cleaned_df.copy()
        cleaned_df['__root__'] = cleaned_df.apply(
            lambda row: robust_first_segment(row, "Morph_split", "Morph_split_str", "morph"), axis=1
        )
        cleaned_roots = set([r for r in cleaned_df['__root__'].dropna().map(str).map(str.strip) if r])

    # Intersection of roots present in ALL THREE
    common_roots_all_three = corpus_roots.intersection(combined_roots).intersection(cleaned_roots)
    print(f"Roots common to all three datasets: {len(common_roots_all_three):,}")

    # --------------------------------------------
    # Determine corpus words needing segmentation
    # --------------------------------------------
    # Only words not already in existing datasets...
    candidate_words = sorted(list(corpus_words_all - existing_words))
    print(f"-> Initially identified {len(candidate_words):,} new corpus words (not in existing datasets).")

    # ...and whose corpus-root (first 4 graphemes) is in the intersection across all three datasets
    words_to_segment = []
    for w in candidate_words:
        root = first_four_graphemes_root(w)
        if root and root in common_roots_all_three:
            words_to_segment.append(w)

    print(f"-> Filtered to {len(words_to_segment):,} words whose roots are common to all three datasets.\n")

    return gold_df, words_to_segment

def construct_few_shot_prompt(target_word, gold_df, num_examples):
    """
    Creates a detailed prompt for the API with few-shot examples.
    """
    examples = gold_df.sample(n=min(num_examples, len(gold_df)), random_state=random.randint(0, 10_000))

    prompt = """You are an expert in Quechua linguistics. Your task is to segment a given Quechua word into its constituent morphemes.
    The morphemes should be separated by spaces. Please provide only the segmented output, with no additional explanation or commentary.
    
    Examples:
    """

    for _, row in examples.iterrows():
        # fallbacks in case Morph_split_str wasn't constructed above for some row
        s = row.get('Morph_split_str', '')
        if not isinstance(s, str) or not s.strip():
            s = ''
            if 'Morph_split' in row and isinstance(row['Morph_split'], str):
                try:
                    parsed = ast.literal_eval(row['Morph_split'])
                    if isinstance(parsed, list):
                        s = ' '.join(map(str, parsed))
                except Exception:
                    s = row['Morph_split']
        prompt += f"{row['Word']}: {s}\n"

    prompt += f"{target_word}: "
    return prompt


# =========================
# MAIN EXECUTION
# =========================
if __name__ == "__main__":
    if not os.environ.get("OPENAI_API_KEY"):
        print("FATAL ERROR: The 'OPENAI_API_KEY' environment variable is not set.")
        print("Please set it before running the script.")
    else:
        # Step 1: Load gold data and all words
        gold_df, _ = load_all_data()

        # Load Sue Kalt dataset (gold_df)
        gold_df = pd.read_parquet(GOLD_DF_FILE)
        gold_df['Word'] = gold_df['word']
        gold_df['morph'] = gold_df['morph'].str.replace('-', ' ')
        gold_df['Morph_split_str'] = gold_df['morph']
        gold_df['Morph_split'] = gold_df['morph'].str.split(' ')

        # Step 2: Select few-shot examples
        examples = gold_df.sample(
            n=min(NUM_FEW_SHOT_EXAMPLES, len(gold_df)),
            random_state=random.randint(0, 10_000)
        )
        few_shot_words = set(examples['Word'].tolist())

        # Step 3: Pick 10 Sue Kalt words not in examples
        sue_candidates = gold_df[~gold_df['Word'].isin(few_shot_words)]
        words_to_segment = sue_candidates.sample(
            n=min(WORDS_TO_PROCESS_LIMIT, len(sue_candidates)),
            random_state=random.randint(0, 10_000)
        )

        print(f"--- Selected {len(words_to_segment)} words from Sue Kalt not in few-shot examples. ---")

        # Step 4: Build few-shot prompt
        prompt = """You are an expert in Quechua linguistics. Your task is to segment a given Quechua word into its constituent morphemes.
The morphemes should be separated by spaces. Please provide only the segmented output, with no additional explanation or commentary.

Examples:\n"""
        for _, row in examples.iterrows():
            s = row.get('Morph_split_str', '')
            if not isinstance(s, str) or not s.strip():
                s = ''
                if 'Morph_split' in row and isinstance(row['Morph_split'], str):
                    try:
                        parsed = ast.literal_eval(row['Morph_split'])
                        if isinstance(parsed, list):
                            s = ' '.join(map(str, parsed))
                    except Exception:
                        s = row['Morph_split']
            prompt += f"{row['Word']}: {s}\n"

        prompt += "\nWords to Segment:\n"
        for word in tqdm(words_to_segment['Word'], desc="Segmenting Words"):
            prompt += f"{word}\n"

        # Step 5: Prepare gold segmentations string for evaluation
        gold_segmentations_str = "\n".join(
            f"{row['Word']}: {row['Morph_split_str']}" for _, row in words_to_segment.iterrows()
        )

        # Output
        print("\n===== FEW-SHOT PROMPT =====\n")
        print(prompt)
        print("\n===== GOLD SEGMENTATIONS (for eval) =====\n")
        print(gold_segmentations_str)




--- Step 1: Loading all data files ---
Loaded 37 'gold' examples for few-shot learning.
Found 7,720 unique words across existing datasets.
Reading full corpus to find target words...
Found 208,684 unique words in the corpus.
Roots common to all three datasets: 78
-> Initially identified 206,081 new corpus words (not in existing datasets).
-> Filtered to 24,343 words whose roots are common to all three datasets.

--- Selected 10 words from Sue Kalt not in few-shot examples. ---


Segmenting Words: 100%|██████████| 10/10 [00:00<?, ?it/s]


===== FEW-SHOT PROMPT =====

You are an expert in Quechua linguistics. Your task is to segment a given Quechua word into its constituent morphemes.
The morphemes should be separated by spaces. Please provide only the segmented output, with no additional explanation or commentary.

Examples:
mamantachá: mama n ta chá
tio: tio
pukllakuchkanman: puklla ku chka nman
wawitawanchu: waw ita wan chu
bañakuychik: baña ku ychik
pilkanturaqa: pilkantura qa
hinanchu: hina n chu
haytachkan: hayta chka n
wakninpitaq: wak ni n pi taq
kaypi: kay pi
cambian: cambia n
purichkan: puri chka n
parlankuchá: parla nku chá
wawakuna: wawa kuna
suyachipun: suya chi pu n
imitatapis: im ita ta pis
urqukunayan: urqu ku naya n
harkayukuspa: harka yu ku spa
yaykuptiyki: yayku pti yki
manchariptin: mancha ri pti n
kaysitus: kay situ s
cuñaday: cuñada y
así: así
chitiqa: chiti qa
qallarikunña: qallari ku n ña
rikhuriyunku: rikhuri yu nku
rikhuripusqa: rikhuri pu sqa
pasan: pasa n
kirullañachá: kiru lla ña chá
ovejata


