In [1]:
import pandas as pd

# Read the scraped Telegram export (UTF-8 CSV) into a DataFrame
df = pd.read_csv("../telegram_data.csv", encoding="utf-8")

# Report the row count, then preview the first five rows
print("Total rows:", df.shape[0])
df.head()

Total rows: 2500


Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,Zemen Express®,@ZemenExpress,6990,,2025-06-21 08:07:31+00:00,photos\@ZemenExpress_6990.jpg
1,Zemen Express®,@ZemenExpress,6989,,2025-06-21 08:07:31+00:00,photos\@ZemenExpress_6989.jpg
2,Zemen Express®,@ZemenExpress,6988,,2025-06-21 08:07:31+00:00,photos\@ZemenExpress_6988.jpg
3,Zemen Express®,@ZemenExpress,6987,💥💥...................................💥💥\n\n3pc...,2025-06-21 08:07:31+00:00,photos\@ZemenExpress_6987.jpg
4,Zemen Express®,@ZemenExpress,6986,💥💥...................................💥💥\n\n3pc...,2025-06-21 08:07:11+00:00,


In [None]:
# Clean the Amharic Telegram messages: drop empty and duplicate rows, then normalize the text
import re

# Discard rows whose message is missing, then keep only the first
# occurrence of each distinct message text
df = (
    df
    .dropna(subset=['Message'])
    .drop_duplicates(subset=['Message'])
)

# Normalize a raw Telegram message into plain, single-spaced text
def clean_message(text):
    """Strip unwanted symbols from a message and collapse its whitespace.

    Non-string input (e.g. NaN) yields an empty string. Every character that
    is not a regex word character, whitespace, or one of the characters
    listed in the character class below is deleted; each run of whitespace
    then becomes a single space and both ends are trimmed.
    """
    if isinstance(text, str):
        # Delete emojis and other symbols outside the allowed set
        stripped = re.sub(r'[^\w\s፡።፣፤፥፦፧፨ብርመዝናብዋትናን]', '', text)
        # Squash whitespace runs (spaces, tabs, newlines) and trim the ends
        return re.sub(r'\s+', ' ', stripped).strip()
    return ""

# Build the cleaned-text column, one message at a time
df['Cleaned_Message'] = df['Message'].apply(clean_message)

# Side-by-side preview of raw vs. cleaned text (first five rows)
df[['Message', 'Cleaned_Message']].head()

Unnamed: 0,Message,Cleaned_Message
3,💥💥...................................💥💥\n\n3pc...,3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት ...
5,💥💥...................................💥💥\n\n📌1 ...,1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር ውስ...
8,💥💥...................................💥💥\n\n📌Im...,Imitation Volcano Humidifier with LED Light በኤ...
9,💥💥...................................💥💥\n\n📌 B...,Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...
17,💥💥...................................💥💥\n\n📌Sm...,Smart Usb Ultrasonic Car And Home Air Humidifi...


In [3]:
# Split a cleaned message into word tokens
def tokenize_amharic(text):
    """Return the whitespace-separated tokens of ``text`` as a list.

    Any run of whitespace acts as a single separator, so an empty or
    whitespace-only string produces an empty list.
    """
    tokens = text.split()
    return tokens

# Build the token-list column from the cleaned text
df['Tokens'] = df['Cleaned_Message'].apply(tokenize_amharic)

# Show cleaned text next to its tokens (first five rows)
df[['Cleaned_Message', 'Tokens']].head()

Unnamed: 0,Cleaned_Message,Tokens
3,3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት ...,"[3pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠ..."
5,1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር ውስ...,"[1, pairs, Sneaker, Crease, Protector, ዋጋ፦, 40..."
8,Imitation Volcano Humidifier with LED Light በኤ...,"[Imitation, Volcano, Humidifier, with, LED, Li..."
9,Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...,"[Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ..."
17,Smart Usb Ultrasonic Car And Home Air Humidifi...,"[Smart, Usb, Ultrasonic, Car, And, Home, Air, ..."


In [4]:
# Keep only the fields needed downstream, in presentation order
structured_df = df[
    ['Channel Title', 'Channel Username', 'Date', 'Cleaned_Message', 'Tokens', 'Media Path']
]

# Preview the first five rows of the trimmed frame
structured_df.head(5)

Unnamed: 0,Channel Title,Channel Username,Date,Cleaned_Message,Tokens,Media Path
3,Zemen Express®,@ZemenExpress,2025-06-21 08:07:31+00:00,3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት ...,"[3pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠ...",photos\@ZemenExpress_6987.jpg
5,Zemen Express®,@ZemenExpress,2025-06-21 05:42:46+00:00,1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር ውስ...,"[1, pairs, Sneaker, Crease, Protector, ዋጋ፦, 40...",photos\@ZemenExpress_6985.jpg
8,Zemen Express®,@ZemenExpress,2025-06-18 06:01:10+00:00,Imitation Volcano Humidifier with LED Light በኤ...,"[Imitation, Volcano, Humidifier, with, LED, Li...",
9,Zemen Express®,@ZemenExpress,2025-06-16 12:21:00+00:00,Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...,"[Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ...",
17,Zemen Express®,@ZemenExpress,2025-06-16 05:11:57+00:00,Smart Usb Ultrasonic Car And Home Air Humidifi...,"[Smart, Usb, Ultrasonic, Car, And, Home, Air, ...",photos\@ZemenExpress_6973.jpg


In [None]:
# Export the tokenized messages to a raw CoNLL-style file for manual tagging:
# one "token O" line per token, and one blank line terminating each message.
import os

conll_path = "../data/clean/unlabeled_conll.txt"

# Create the output folder if it is missing so open() does not fail
os.makedirs(os.path.dirname(conll_path), exist_ok=True)

with open(conll_path, "w", encoding='utf-8') as f:
    for tokens in df['Tokens']:
        # A message that cleaned down to nothing has no tokens; writing only
        # its separator would leave a stray empty "sentence" in the file
        if not tokens:
            continue
        f.writelines(f"{token} O\n" for token in tokens)  # Default tag: O
        f.write("\n")  # Blank line between sentences/messages

In [7]:
# Clean and tokenize the Amharic Telegram messages
# to prepare them for Named Entity Recognition (NER) tasks
import pandas as pd
import re


def clean_text(text):
    """Return ``text`` with symbols removed and whitespace normalized.

    Anything that is not a string comes back as ''. A character is kept when
    it is a regex word character (letters and digits of any script, plus
    underscore), whitespace, or inside the range U+1200-U+137F; every other
    character is deleted. Whitespace runs are then collapsed to one space
    and the result is trimmed.
    """
    if not isinstance(text, str):
        return ''
    # Drop emojis, ASCII punctuation and other symbols outside the kept set
    without_symbols = re.sub(r'[^\w\s\u1200-\u137F]', '', text)
    # Collapse whitespace runs, then trim leading/trailing space
    return re.sub(r'\s+', ' ', without_symbols).strip()

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    pieces = text.split()
    return pieces

# Rebuild the cleaned-text and token columns with the helpers defined above
df['Cleaned_Message'] = df['Message'].apply(clean_text)
df['Tokens'] = df['Cleaned_Message'].apply(tokenize)

# Plain-text preview of the first five rows
print(df[['Message', 'Cleaned_Message', 'Tokens']].head(5))

# Persist the processed frame for the next steps.
# NOTE(review): the list-valued Tokens column is presumably written to CSV as
# its string form — confirm that downstream readers parse it back into lists.
df.to_csv('../data/clean/processed_telegram_data.csv', encoding='utf-8', index=False)

                                              Message  \
3   💥💥...................................💥💥\n\n3pc...   
5   💥💥...................................💥💥\n\n📌1 ...   
8   💥💥...................................💥💥\n\n📌Im...   
9   💥💥...................................💥💥\n\n📌 B...   
17  💥💥...................................💥💥\n\n📌Sm...   

                                      Cleaned_Message  \
3   3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት ...   
5   1 pairs Sneaker Crease Protector ዋጋ፦ 400 ብር ውስ...   
8   Imitation Volcano Humidifier with LED Light በኤ...   
9   Baby Carrier በፈለጉት አቅጣጫ ልጅዎን በምቾት ማዘል ያስችልዎታል ...   
17  Smart Usb Ultrasonic Car And Home Air Humidifi...   

                                               Tokens  
3   [3pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠ...  
5   [1, pairs, Sneaker, Crease, Protector, ዋጋ፦, 40...  
8   [Imitation, Volcano, Humidifier, with, LED, Li...  
9   [Baby, Carrier, በፈለጉት, አቅጣጫ, ልጅዎን, በምቾት, ማዘል, ...  
17  [Smart, Usb, Ultrasonic, Car, 