# In [None]:
# EthioMart/notebooks/ner_model_training.ipynb

# --- Section 1: Setup and Configuration ---

# 1.1 Import necessary libraries
import pandas as pd
from pathlib import Path
import spacy
from spacy.tokens import DocBin
import random
import os
import sys

# Make the repository root importable so that `src` and `config` resolve.
# Assumes the notebook is executed from EthioMart/notebooks/.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Pull in the project's rule-based labeler and its configured data directory.
try:
    from src.data_labeler import AmharicRuleBasedLabeler
    from config.config import DATA_DIR
except ImportError as import_error:
    print(f"❌ Error importing modules: {import_error}")
    print("Please ensure your `src` directory and `config.config` are set up correctly.")
    # Local-testing fallback: only DATA_DIR is substituted here; the labeler
    # class stays undefined if the import above failed.
    DATA_DIR = Path("../data")
else:
    print("✅ Successfully imported AmharicRuleBasedLabeler and DATA_DIR.")

# Input: the CoNLL file generated by data_labeler.py.
CONLL_INPUT_PATH = DATA_DIR.joinpath("annotated", "telegram_ner_data_rule_based.conll")

# Outputs: binary spaCy corpora (.spacy), one file per split.
SPACY_TRAIN_DATA_PATH = DATA_DIR.joinpath("processed", "train_ner_data.spacy")
SPACY_DEV_DATA_PATH = DATA_DIR.joinpath("processed", "dev_ner_data.spacy")
SPACY_TEST_DATA_PATH = DATA_DIR.joinpath("processed", "test_ner_data.spacy")


# --- Section 2: Generate CoNLL Data (if not already done) ---

print("\n--- Section 2: Generate CoNLL Data ---")
# Regenerate only when the CoNLL file is missing or zero bytes long.
if CONLL_INPUT_PATH.exists() and os.path.getsize(CONLL_INPUT_PATH) > 0:
    print(f"✅ CoNLL data found at {CONLL_INPUT_PATH}. Skipping regeneration.")
else:
    print(f"❗ {CONLL_INPUT_PATH} not found or is empty. Running rule-based labeler...")

    # The labeler reads the cleaned CSV; without it there is nothing to label.
    cleaned_csv_path = DATA_DIR / "processed" / "clean_telegram_data.csv"
    if cleaned_csv_path.exists():
        labeler = AmharicRuleBasedLabeler()
        success = labeler.process_to_conll(
            input_csv_path=str(cleaned_csv_path),
            output_conll_path=str(CONLL_INPUT_PATH),
            sample_size=None,  # None for the full dataset, or a number for testing
        )
        print(
            "✅ CoNLL data generation complete."
            if success
            else "❌ CoNLL data generation failed. Check logs above."
        )
    else:
        print(f"❌ Error: Cleaned data CSV not found at {cleaned_csv_path}. Please run preprocessor.py first.")

# --- Section 3: Load and Inspect CoNLL Data ---

print("\n--- Section 3: Load and Inspect CoNLL Data ---")


def read_conll(file_path):
    """Parse a tab-separated CoNLL-style file into sentences.

    Each non-blank line must hold exactly two tab-separated fields
    (token, tag); any other non-blank line is reported and skipped. Blank
    lines (after stripping whitespace) end the current sentence.

    Args:
        file_path: Path of the file to read (opened as UTF-8 text).

    Returns:
        A list of sentences, each a list of ``(token, tag)`` tuples. An empty
        list is returned when the file is missing or any other error occurs
        while reading; the error is printed rather than raised.
    """
    parsed = []
    pending = []
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()

                if not stripped:
                    # Sentence boundary: flush whatever has been collected.
                    if pending:
                        parsed.append(pending)
                        pending = []
                    continue

                fields = stripped.split('\t')
                if len(fields) != 2:
                    print(f"Warning: Skipping malformed line in CoNLL file: '{stripped}'")
                    continue

                token, tag = fields
                pending.append((token, tag))

            # The file may end without a trailing blank line.
            if pending:
                parsed.append(pending)
        print(f"Loaded {len(parsed)} sentences from {file_path}")
        return parsed
    except FileNotFoundError:
        print(f"Error: CoNLL file not found at {file_path}")
        return []
    except Exception as e:
        print(f"Error reading CoNLL file {file_path}: {e}")
        return []

# Parse the CoNLL file into sentences of (token, tag) pairs.
all_sentences = read_conll(CONLL_INPUT_PATH)

if all_sentences:
    # Preview the first sentence, truncated to its first 10 tokens.
    print("\n--- Sample Annotated Sentence ---")
    sample_sentence = all_sentences[0]
    for token, tag in sample_sentence[:10]:
        print(f"{token}\t{tag}")
    if len(sample_sentence) > 10:
        print("...")
else:
    print("No sentences loaded. Cannot proceed with data splitting and training setup.")


# --- Section 4: Convert to SpaCy DocBin and Split Data ---

import logging

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws progress bars; degrade to plain iteration when it is absent.
    def tqdm(iterable, **_kwargs):
        return iterable

print("\n--- Section 4: Convert to SpaCy DocBin and Split Data ---")

# Blank multi-language pipeline: used purely for tokenization when building
# annotated Doc objects, not for any pre-trained entity recognition.
nlp_blank = spacy.blank("xx")


def build_training_data(clean_csv_path, labeler):
    """Build spaCy-style ``(text, {"entities": [...]})`` samples from the cleaned CSV.

    Args:
        clean_csv_path: CSV file with a ``preprocessed_text`` column.
        labeler: Object exposing ``_extract_entities_from_text(text)``. Each
            returned entity is later unpacked as ``(start, end, label)`` by
            ``create_docbin`` (presumably character offsets -- verify against
            data_labeler.py).

    Returns:
        A list of ``(text, {"entities": entities})`` tuples, one per row whose
        ``preprocessed_text`` is neither missing nor the empty string.
    """
    df_clean = pd.read_csv(clean_csv_path, encoding='utf-8')
    has_text = df_clean['preprocessed_text'].notna() & (df_clean['preprocessed_text'] != '')
    texts = [str(value) for value in df_clean.loc[has_text, 'preprocessed_text']]
    return [
        (text, {"entities": labeler._extract_entities_from_text(text)})
        for text in tqdm(texts, desc="Preparing SpaCy training data")
    ]


def create_docbin(data, nlp_model):
    """Convert ``(text, {"entities": [(start, end, label), ...]})`` samples to a DocBin.

    Args:
        data: Iterable of ``(text, annotations)`` samples.
        nlp_model: spaCy pipeline used to tokenize each text.

    Returns:
        A ``DocBin`` holding one annotated ``Doc`` per sample.
    """
    db = DocBin()
    for text, annot in tqdm(data, desc="Creating DocBin"):
        doc = nlp_model(text)
        spans = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # The character offsets do not line up with token boundaries
                # even after contracting, so the entity cannot be represented.
                logging.warning(f"Skipping entity due to non-alignment: '{text[start:end]}' at {start}-{end} with label '{label}' in text '{text}'")
            else:
                spans.append(span)
        # doc.ents rejects overlapping spans with a ValueError; filter_spans
        # keeps the longest span of each overlapping group instead of aborting.
        doc.ents = spacy.util.filter_spans(spans)
        db.add(doc)
    return db


# Run the conversion exactly once, and only when Section 3 loaded sentences.
# (Previously this whole pipeline sat inside a per-sentence loop and was
# re-executed for every CoNLL sentence.)
if all_sentences:
    # Entities are regenerated from the preprocessed text rather than being
    # reverse-engineered from the CoNLL token/tag pairs: doc.char_span needs
    # (start, end, label) offsets, which the token-level file does not carry.
    labeler_instance = AmharicRuleBasedLabeler()
    training_data = build_training_data(
        DATA_DIR / "processed" / "clean_telegram_data.csv",
        labeler_instance,
    )
    print(f"Prepared {len(training_data)} samples for SpaCy training.")

    # Split the data into training, development (validation), and test sets.
    random.seed(42)  # For reproducibility
    random.shuffle(training_data)

    train_ratio = 0.8
    dev_ratio = 0.1

    train_size = int(len(training_data) * train_ratio)
    dev_size = int(len(training_data) * dev_ratio)

    train_data = training_data[:train_size]
    dev_data = training_data[train_size : train_size + dev_size]
    test_data = training_data[train_size + dev_size :]  # remainder after train and dev

    print(f"Train samples: {len(train_data)}")
    print(f"Dev samples: {len(dev_data)}")
    print(f"Test samples: {len(test_data)}")

    # Convert each split to spaCy's binary DocBin format.
    train_db = create_docbin(train_data, nlp_blank)
    dev_db = create_docbin(dev_data, nlp_blank)
    test_db = create_docbin(test_data, nlp_blank)

    # Save DocBin files (all three paths share the same parent directory).
    SPACY_TRAIN_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
    train_db.to_disk(SPACY_TRAIN_DATA_PATH)
    dev_db.to_disk(SPACY_DEV_DATA_PATH)
    test_db.to_disk(SPACY_TEST_DATA_PATH)

    print(f"\n✅ SpaCy training data saved to:")
    print(f"- Train: {SPACY_TRAIN_DATA_PATH}")
    print(f"- Dev: {SPACY_DEV_DATA_PATH}")
    print(f"- Test: {SPACY_TEST_DATA_PATH}")

    # --- Section 5: Next Steps: SpaCy Training Command ---

    print("\n--- Section 5: Next Steps: SpaCy Training Command ---")
    print("You have now prepared your data for SpaCy NER model training.")
    print("To train a SpaCy NER model, you typically need a `config.cfg` file.")
    print("You can generate a base config file using `spacy init config`:")
    print("\nExample `spacy init config` command:")
    print(f"cd {project_root}")
    print(f"spacy init config ./config/config.cfg --lang xx --pipeline ner --optimize efficiency --force")
    print("\nAfter generating and potentially customizing `config.cfg`, you can train your model:")
    print("Example `spacy train` command:")
    print(f"spacy train ./config/config.cfg --output ./models/ner_model --paths.train {SPACY_TRAIN_DATA_PATH} --paths.dev {SPACY_DEV_DATA_PATH}")
    print("\nRemember to install all required dependencies for SpaCy training:")
    print("pip install spacy[transformers] # if using transformer-based models")
    print("pip install cupy-cuda11x # if you have a CUDA GPU and want GPU acceleration")
    print("\nOnce trained, your model will be saved in `./models/ner_model`.")
    print("You can then load it using `nlp = spacy.load('./models/ner_model/model-best')` for inference.")

