In [1]:
import pandas as pd
import re
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


# Load NER model fine-tuned on multilingual data
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
ner_pipeline = pipeline("ner", model=model_name, tokenizer=tokenizer, grouped_entities=False)

# Load your scraped Telegram data
df = pd.read_csv("../data/raw/telegram_data.csv")

# Sample up to 100 messages per channel
sampled_df = (
    df.dropna(subset=["Message"])
    .groupby("Channel Username", group_keys=False)
    .apply(lambda x: x.sample(n=100, random_state=42) if len(x) >= 100 else x)
    .reset_index(drop=True)
)

# Function to clean Amharic text
def clean_amharic_text(text):
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\u1200-\u137F\u1380-\u139F\u2D80-\u2DDF0-9A-Za-z፡።፣፤፥፦፧.,!?()\[\]\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Create output directory
os.makedirs("output", exist_ok=True)

# Final output lines in CoNLL format
output_lines = []

# Loop through each sampled message
for _, row in sampled_df.iterrows():
    raw_text = str(row["Message"])
    cleaned = clean_amharic_text(raw_text)
    words = cleaned.split()

    try:
        ner_result = ner_pipeline(cleaned)
    except Exception as e:
        print(f"⚠️ Skipping message due to error: {e}")
        continue

    # Build character-level label map
    char_labels = ["O"] * len(cleaned)
    for entity in ner_result:
        start = entity['start']
        end = entity['end']
        label = entity['entity']
        if label.startswith("B-"):
            char_labels[start] = label  # First character
            for i in range(start + 1, end):
                if i < len(char_labels) and char_labels[i] == "O":
                    char_labels[i] = "I-" + label[2:]

    # Now assign labels per word using majority char label
    position = 0
    output_lines.append(f"# {row['Channel Username']} | ID: {row['Message ID']}")
    for word in words:
        word_len = len(word)
        # Get character labels for this word
        word_char_labels = char_labels[position:position + word_len]
        if not word_char_labels:
            label = "O"
        else:
            # Majority label (excluding 'O' if possible)
            non_o_labels = [l for l in word_char_labels if l != "O"]
            label = non_o_labels[0] if non_o_labels else "O"
        output_lines.append(f"{word} {label}")
        position += word_len + 1  # Account for space
    output_lines.append("")

# Save to file
with open("output/auto_labeled_fixed.conll", "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print("✅ Fixed auto-labeling complete. Output saved to: output/auto_labeled_fixed.conll")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
  .apply(lambda x: x.sample(n=100, random_state=42) if len(x) >= 100 else x)


✅ Fixed auto-labeling complete. Output saved to: output/auto_labeled_fixed.conll
