# In [6]:
import pandas as pd
import re
import os
import logging

# Read the dataset CSV into a DataFrame (absolute, machine-specific path)
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\Almazt\OneDrive - Ethiopian Airlines\Desktop\10 Academy\EthioMart-Week 5\data/telegram_meneshayeofficial_clean_data.csv"
)

# Define a function to handle CoNLL format with multi-word support
def format_to_conll(df, entity_mapping):
    """Tag whitespace-separated tokens with BIO labels as flat CoNLL rows.

    Args:
        df: DataFrame with a ``clean_text`` column holding one message per
            row. Cells that are not strings (e.g. NaN for an empty message)
            are skipped.
        entity_mapping: Dict mapping an exact token to its BIO label
            (``"B-<TYPE>"`` or ``"I-<TYPE>"``). Labels with any other
            prefix are ignored and the token stays ``"O"``.

    Returns:
        A flat list of ``(token, label)`` tuples. The tokens of each message
        are followed by an ``("", "")`` tuple marking the sentence boundary.

    Raises:
        KeyError: If ``df`` has no ``clean_text`` column.
    """
    conll_data = []
    for text in df["clean_text"]:
        # Missing messages are not strings (NaN/None); there is nothing to
        # tokenize, so skip them instead of failing on .split().
        if not isinstance(text, str):
            continue

        tokens = text.split()
        labels = ["O"] * len(tokens)  # Everything starts as Outside (O)

        for i, token in enumerate(tokens):
            label = entity_mapping.get(token)
            if label is None:
                continue

            if label.startswith("B-"):
                labels[i] = label
            elif label.startswith("I-") and i > 0:
                # An I- tag is only valid as a continuation of the same
                # entity type: directly after its B- tag or after another
                # I- tag, so entities longer than two tokens stay labeled.
                entity_type = label[2:]
                if labels[i - 1] in (f"B-{entity_type}", f"I-{entity_type}"):
                    labels[i] = label

        conll_data.extend(zip(tokens, labels))
        conll_data.append(("", ""))  # Separate sentences

    return conll_data

# Keep only the text column that format_to_conll reads
clean_data = df[["clean_text"]]

# Token -> BIO label lookup (exact, whole-token matches only)
entity_mapping = {
    # B-tags: first token of an entity
    "meneshayeofficial": "B-TELEGRAM",
    "ከመነሻዬ": "B-TELEGRAM",
    "httpstmegelaglewoch": "B-URL",
    "ጉርድ": "B-LOC",
    "መደመር": "B-MATHSOPERATION",
    # I-tags: continuation tokens of a multi-word entity
    "ሾላ": "I-LOC",
    "ሲቲ": "I-LOC",
    # Add other tags as needed
}

# Format the data
labeled_data = format_to_conll(clean_data, entity_mapping)

# Save to CoNLL format file
output_dir = "../data/"
os.makedirs(output_dir, exist_ok=True)  # Ensure directory exists

output_file = os.path.join(output_dir, "labeled_data.conll")

with open(output_file, "w", encoding="utf-8") as f:
    # One "token label" pair per line; an empty token marks a sentence
    # boundary and is written as a blank line.
    f.writelines(
        f"{token} {label}\n" if token else "\n" for token, label in labeled_data
    )

# The root logger defaults to WARNING, which would silently drop the INFO
# message below. basicConfig does nothing if handlers are already set up.
logging.basicConfig(level=logging.INFO)
logging.info("Labeled data saved to: %s", output_file)
