In [1]:
import re
import json
import itertools

In [2]:
# Read the cleaned Telegram e-commerce export and parse it into `data`.
with open("telegram_ecommerce_data_cleaned.json", mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

In [3]:
# Pre-compiled patterns and keyword tables for the rule-based entity tagger.
#
# Price patterns: an optional/required keyword prefix, a number, and an optional
# unit suffix. The number is either comma-grouped ("1,500,000") or a plain run of
# digits. The comma-grouped branch requires at least one ",ddd" group so that a
# plain number such as "12345" falls through to `\d+` and is matched whole;
# with a zero-or-more group the first branch would stop after "123" and split
# the number into two separate matches.
price_patterns = [
    # Amharic form: prefix is optional, so a bare number also matches.
    re.compile(r"(ዋጋ\s*)?(\d{1,3}(,\d{3})+|\d+)(\s*ሚሊዮን|\s*ብር)?"),
    # English form: the "Price" prefix is required; matching is case-insensitive.
    re.compile(r"(Price\s*)(\d{1,3}(,\d{3})+|\d+)(\s*Million|\s*birr)?", re.IGNORECASE)
]

# Substrings that mark a token as a location when found anywhere inside it.
location_keywords = ["አያት", "ቦሌ", "ሃዋሳ", "አዲስ", "ባህርዳር", "ጋለን", "ሀዋሳ", "ዋና"]

# Substrings that mark a token as a product when found anywhere inside it.
product_keywords = ["መኪና", "Land", "Cruise", "Mark2", "ሌክሰስ", "ኮንዶሚንየም", "ሱቅ", "ቤት", "ሳሎን", "ማንኛውም", "ምርት"]

In [4]:
def label_tokens(tokens, patterns=None, locations=None, products=None):
    """Assign a rule-based entity tag to every token of one message.

    Tags are applied in three passes with decreasing precedence:

    1. PRICE - each pattern is run over the tokens joined by single spaces.
       Every token that lies completely inside a match is tagged; the first
       such token gets "B-PRICE" and the rest "I-PRICE". Patterns are applied
       in order, so a later pattern may overwrite an earlier price tag.
    2. LOCATION - a token still tagged "O" that contains any location
       keyword as a substring becomes "B-LOC".
    3. PRODUCT - a token still tagged "O" that contains any product keyword
       as a substring becomes "B-Product".

    Args:
        tokens: List of token strings.
        patterns: Iterable of compiled regexes for prices. Defaults to the
            module-level ``price_patterns``.
        locations: Iterable of location keyword substrings. Defaults to the
            module-level ``location_keywords``.
        products: Iterable of product keyword substrings. Defaults to the
            module-level ``product_keywords``.

    Returns:
        A list of ``(token, label)`` tuples, one per input token, in order.
    """
    if patterns is None:
        patterns = price_patterns
    if locations is None:
        locations = location_keywords
    if products is None:
        products = product_keywords

    labels = ["O"] * len(tokens)

    # PRICE
    # Record where each token sits inside the joined string so a regex match
    # can be mapped back to the exact tokens it covers. Looking tokens up by
    # value instead would tag the wrong occurrence when a word such as the
    # currency unit appears more than once in the message.
    joined = " ".join(tokens)
    token_spans = []
    offset = 0
    for token in tokens:
        token_spans.append((offset, offset + len(token)))
        offset += len(token) + 1  # +1 for the joining space

    for pattern in patterns:
        for match in pattern.finditer(joined):
            start, end = match.span()
            is_first = True
            for idx, (tok_start, tok_end) in enumerate(token_spans):
                # Only tokens fully covered by the match are tagged; a match
                # that merely cuts into a token leaves that token untouched.
                if tok_start < tok_end and start <= tok_start and tok_end <= end:
                    labels[idx] = "B-PRICE" if is_first else "I-PRICE"
                    is_first = False

    # LOCATION
    for i, token in enumerate(tokens):
        if labels[i] == "O" and any(loc in token for loc in locations):
            labels[i] = "B-LOC"

    # PRODUCT
    # NOTE(review): "B-Product" is mixed-case unlike the other tags; it is kept
    # as-is so the emitted labels do not change.
    for i, token in enumerate(tokens):
        if labels[i] == "O" and any(prod in token for prod in products):
            labels[i] = "B-Product"

    return list(zip(tokens, labels))

In [5]:
def find_token_index(tokens, word, start=0):
    """Return the first index ``i >= start`` where ``tokens[i] == word``.

    Args:
        tokens: Sequence of tokens to search.
        word: Value to look for (compared with ``==``).
        start: Index at which the scan begins.

    Returns:
        The index of the first match at or after ``start``, or -1 if none.
    """
    hits = (pos for pos in range(start, len(tokens)) if tokens[pos] == word)
    return next(hits, -1)

In [6]:
def tokenize(text):
    """Split text into word, number and punctuation tokens.

    Three kinds of token are produced, tried in this order at each position:

    * a comma-grouped number such as "1,500,000" (kept as one token so that
      number patterns containing thousands separators can still match it),
    * a run of word characters,
    * a single character that is neither a word character nor whitespace.

    Whitespace is discarded.

    Args:
        text: The string to tokenize. ``None`` or an empty string is accepted.

    Returns:
        A list of token strings; empty when ``text`` is empty or ``None``.
    """
    if not text:
        return []
    # The comma-grouped alternative must come first, otherwise `\w+` would
    # consume the leading digits and the comma would become its own token.
    # `(?!\w)` stops it from swallowing part of a longer run like "1,0000".
    # The group is non-capturing so findall returns whole matches.
    return re.findall(r'\d{1,3}(?:,\d{3})+(?!\w)|\w+|[^\w\s]', text)

In [7]:
# Tag the first 50 messages and flatten them into "token label" lines.
# An empty string follows each message so the joined output has a blank
# separator line between messages.
output_lines = []
for entry in itertools.islice(data, 50):
    labeled = label_tokens(tokenize(entry.get("clean_text", "")))
    output_lines.extend(f"{token} {label}" for token, label in labeled)
    output_lines.append("")  # Sentence separator

In [8]:
# Write the labeled lines to disk, one "token label" pair per line.
with open("labeled_dataset.conll", mode="w", encoding="utf-8") as conll_file:
    conll_text = "\n".join(output_lines)
    conll_file.write(conll_text)