In [1]:
import pandas as pd

# Read the messages CSV (UTF-8) into a DataFrame
messages_csv = "data/messages.csv"
df = pd.read_csv(messages_csv, encoding="utf-8")

# Show the first rows as a quick sanity check
first_rows = df.head()
print(first_rows)


                  date                               chat        sender  \
0  2025-06-24 09:05:42                    ሐዋርያዊ መልሶች Chat          ነፂ 🥀   
1  2025-06-24 09:06:48                    ሐዋርያዊ መልሶች Chat     mekutii12   
2  2025-06-24 09:07:31  Tulips Event & sales 🤝 Xpand SMMA   TulipsEvent   
3  2025-06-24 09:09:08  Tulips Event & sales 🤝 Xpand SMMA   TulipsEvent   
4  2025-06-24 09:09:08                    ሐዋርያዊ መልሶች Chat  mklldouble_g   

                                             message  \
0  በገነት በነበረች በዕፀ ህይውት ፈንታ  በምድር ተተከለች ልትሆነው ለአዳም...   
1  እናት አግኝቼሻለው ከመስቀሉ እግርጌ ስሟን እየጠራሁ ከፍ እንዲል ማዕረጌ ...   
2  #NewArrival 😍Ladies Gift Combo 😍  Commission (...   
3   🟥Gucci ladies bag light brown color 🟥 Sold out 🟥   
4                                     እረፍት አታስዱም እንዴ   

                                     cleaned_message  \
0  በገነት በነበረች በዕፀ ህይውት ፈንታ በምድር ተተከለች ልትሆነው ለአዳም ...   
1  እናት አግኝቼሻለው ከመስቀሉ እግርጌ ስሟን እየጠራሁ ከፍ እንዲል ማዕረጌ ...   
2  newarrival ladies gift combo  commission 

In [2]:
import re

def clean_text(text):
    """Lowercase a message and strip punctuation/symbol characters.

    Args:
        text: The raw message. Any value that is not a ``str`` (for example
            the float NaN pandas produces for an empty CSV cell, or ``None``)
            is treated as an empty message.

    Returns:
        The lowercased text with every character removed that is neither a
        regex word character nor whitespace. Whitespace itself is left
        untouched, so removed symbols can leave consecutive spaces behind.
        Returns ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); map them to an empty message
        # instead of raising AttributeError in the middle of an .apply().
        return ""
    # Lowercase (only affects scripts that have case) & remove unwanted characters
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    pieces = text.split()
    return pieces

# Normalize each raw message, then split the normalized text into tokens
cleaned_series = df['message'].apply(clean_text)
df['cleaned_message'] = cleaned_series
token_series = df['cleaned_message'].apply(tokenize)
df['tokens'] = token_series

# Preview the raw text next to both derived columns
preview_columns = ['message', 'cleaned_message', 'tokens']
print(df[preview_columns].head())


                                             message  \
0  በገነት በነበረች በዕፀ ህይውት ፈንታ  በምድር ተተከለች ልትሆነው ለአዳም...   
1  እናት አግኝቼሻለው ከመስቀሉ እግርጌ ስሟን እየጠራሁ ከፍ እንዲል ማዕረጌ ...   
2  #NewArrival 😍Ladies Gift Combo 😍  Commission (...   
3   🟥Gucci ladies bag light brown color 🟥 Sold out 🟥   
4                                     እረፍት አታስዱም እንዴ   

                                     cleaned_message  \
0  በገነት በነበረች በዕፀ ህይውት ፈንታ  በምድር ተተከለች ልትሆነው ለአዳም...   
1  እናት አግኝቼሻለው ከመስቀሉ እግርጌ ስሟን እየጠራሁ ከፍ እንዲል ማዕረጌ ...   
2  newarrival ladies gift combo   commission ኮሚሽን...   
3      gucci ladies bag light brown color  sold out    
4                                     እረፍት አታስዱም እንዴ   

                                              tokens  
0  [በገነት, በነበረች, በዕፀ, ህይውት, ፈንታ, በምድር, ተተከለች, ልትሆ...  
1  [እናት, አግኝቼሻለው, ከመስቀሉ, እግርጌ, ስሟን, እየጠራሁ, ከፍ, እን...  
2  [newarrival, ladies, gift, combo, commission, ...  
3  [gucci, ladies, bag, light, brown, color, sold...  
4                                 [እረፍት, አታስዱም, እንዴ

In [3]:
# Write every message as a CoNLL block: one "token O" line per token,
# with "O" as the placeholder label, and a blank line after each message.
with open("data/ner_raw.conll", "w", encoding="utf-8") as f:
    for tokens in df["tokens"]:
        if not tokens:
            # A message with no tokens would emit only the separator and
            # leave consecutive blank lines (an empty sentence) in the file.
            continue
        f.writelines(f"{token} O\n" for token in tokens)
        f.write("\n")  # Blank line between messages


In [4]:
# Hand-labelled sample in the expected structure:
# a list of sentences, where each sentence is a list of (token, label) pairs
annotated_data = [
    [("አፍሪካ", "B-ORG"), ("ሓየተየግማህበር", "I-ORG"), ("ማብቅያ", "O"), ("ቀን", "O")],
    [("አዳማ", "B-LOC"), ("ከተማ", "I-LOC")],
    [("ቴስፋ", "B-PER"), ("ባች", "I-PER")]
]

output_path = "data/ner_annotated.conll"

with open(output_path, "w", encoding="utf-8") as f:
    for sentence in annotated_data:
        # One "token label" line per pair
        rows = [f"{token} {label}\n" for token, label in sentence]
        f.writelines(rows)
        # An empty line marks the end of the sentence
        f.write("\n")

print(f"Annotated data saved to {output_path}")


Annotated data saved to data/ner_annotated.conll


In [1]:
import spacy
from spacy.tokens import DocBin

def read_conll(filepath):
    """Read a two-column, whitespace-separated CoNLL file.

    Each non-blank line must hold exactly two fields, ``token tag``. Any
    blank (or whitespace-only) line ends the current sentence; runs of
    several blank lines and leading/trailing blank lines are tolerated.

    Args:
        filepath: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A list with one ``(tokens, tags)`` tuple per sentence, where both
        elements are lists of strings of equal length. An empty file yields
        an empty list.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            whitespace-separated fields.
    """
    data = []
    tokens = []
    tags = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            fields = raw_line.split()
            if not fields:
                # Sentence boundary; only flush when something was collected
                # so repeated blank lines do not create empty sentences.
                if tokens:
                    data.append((tokens, tags))
                    tokens = []
                    tags = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{filepath}:{line_no}: expected 'token tag', "
                    f"got {raw_line.rstrip()!r}"
                )
            token, tag = fields
            tokens.append(token)
            tags.append(tag)
    # The last sentence may not be followed by a blank line
    if tokens:
        data.append((tokens, tags))
    return data

def conll_to_spacy(data, nlp):
    """Pack tagged sentences into a ``DocBin`` with entity spans set.

    Args:
        data: Iterable of ``(tokens, tags)`` pairs; ``tags`` are read as
            ``B-<label>`` / ``I-<label>`` / anything else.
        nlp: Object whose ``vocab`` is used to build each ``Doc``.

    Returns:
        A ``DocBin`` holding one ``Doc`` per sentence.

    Tag handling:
        * ``B-<label>`` closes any open entity and opens a new one.
        * ``I-...`` extends the open entity by one token; the label carried
          by the ``I-`` tag itself is not compared with the open entity's.
        * Any other tag, or an ``I-`` tag with no open entity, closes the
          open entity (if there is one) and adds nothing.
    """
    doc_bin = DocBin()
    for words, bio_tags in data:
        doc = spacy.tokens.Doc(nlp.vocab, words=words)

        boundaries = []  # finished entities as (start, end, label)
        current = None   # [start, end, label] of the entity being built
        for idx, tag in enumerate(bio_tags):
            if tag.startswith("B-"):
                if current is not None:
                    boundaries.append(tuple(current))
                current = [idx, idx + 1, tag[2:]]
            elif tag.startswith("I-") and current is not None:
                current[1] = idx + 1
            elif current is not None:
                boundaries.append(tuple(current))
                current = None
        # An entity that runs to the end of the sentence is still open here
        if current is not None:
            boundaries.append(tuple(current))

        doc.ents = [
            spacy.tokens.Span(doc, first, stop, label=name)
            for first, stop, name in boundaries
        ]
        doc_bin.add(doc)
    return doc_bin

# Blank pipeline for language code "am" (assumed Amharic); use "en" instead for English data
nlp = spacy.blank("am")

annotated_conll = "data/ner_annotated.conll"
spacy_train_file = "data/train.spacy"

data = read_conll(annotated_conll)
doc_bin = conll_to_spacy(data, nlp)
doc_bin.to_disk(spacy_train_file)
print("Converted CoNLL to spaCy format")


Converted CoNLL to spaCy format
