### Importing the Necessary Packages

In [4]:
import pandas as pd
import re

### Loading the Dataset

In [5]:
df = pd.read_csv(r'C:\Users\Blen\OneDrive\Desktop\10Academy\LLM\data\cleaned_V6.csv')

In [6]:
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Date,Media Path,Messages_cleaned
0,ምርጥ ዕቃ,@MerttEka,6811,2024-09-27 18:14:53+00:00,photos\@MerttEka_6811.jpg,የእናት ጡት ወተት ማጠራቀሚያ\n ከውስጥ ልብስዎ ጋር አብረው የሚለብሱትና...
1,ምርጥ ዕቃ,@MerttEka,6810,2024-09-27 14:19:03+00:00,,በቻርጅ የሚሰራ የጃር ውሃ ፓምፕ\n ለሁሉም ዓይነት የውሃ ጃሮች ይሆ...
2,ምርጥ ዕቃ,@MerttEka,6806,2024-09-27 11:42:32+00:00,photos\@MerttEka_6806.jpg,የልብስ መተኮሻ\n በእንፏሎት የሚሰራ ባለ 3 ወፍራም ምሶሶ መቆሚያ ያለ...
3,ምርጥ ዕቃ,@MerttEka,6802,2024-09-26 16:16:51+00:00,photos\@MerttEka_6802.jpg,የመኪና መዓዛ\n በፀሃይ ብርሃን ስለሚሰራ ቻርጅ ማድረግ አይፈልግም\n ሁ...
4,ምርጥ ዕቃ,@MerttEka,6801,2024-09-26 12:31:45+00:00,photos\@MerttEka_6801.jpg,የፀጉር ማድረቂያ ፎን\n6000 በጣም ፈጣን\nማበጠሪያ ያለው\n ሶኬቱ ...


### A function to label the messages with CoNLL format

In [7]:
def label_message_utf8_with_birr(message):
    # Split the message at the first occurrence of '\n'
    if '\n' in message:
        first_line, remaining_message = message.split('\n', 1)
    else:
        first_line, remaining_message = message, ""
    
    labeled_tokens = []
    
    # Tokenize the first line
    first_line_tokens = re.findall(r'\S+', first_line)
    
    # Label the first token as B-PRODUCT and the rest as I-PRODUCT
    if first_line_tokens:
        labeled_tokens.append(f"{first_line_tokens[0]} B-PRODUCT")  # First token as B-PRODUCT
        for token in first_line_tokens[1:]:
            labeled_tokens.append(f"{token} I-PRODUCT")  # Remaining tokens as I-PRODUCT
    
    # Process the remaining message normally
    if remaining_message:
        lines = remaining_message.split('\n')
        for line in lines:
            tokens = re.findall(r'\S+', line)  # Tokenize each line while considering non-ASCII characters
            
            for token in tokens:
                # Check if token is a price (e.g., 500 ETB, $100, or ብር)
                if re.match(r'^\d{10,}$', token):
                    labeled_tokens.append(f"{token} O")  # Label as O for "other" or outside of any entity
                elif re.match(r'^\d+(\.\d{1,2})?$', token) or 'ETB' in token or 'ዋጋ' in token or '$' in token or 'ብር' in token:
                    labeled_tokens.append(f"{token} I-PRICE")
                # Check if token could be a location (e.g., cities or general location names)
                elif any(loc in token for loc in ['Addis Ababa', 'ለቡ', 'ለቡ መዳህኒዓለም', 'መገናኛ', 'ቦሌ', 'ሜክሲኮ']):
                    labeled_tokens.append(f"{token} B-LOC")
                elif any(loc in token for loc in ['ዘፍመሽ', 'ግራንድ','ሞል','3ኛ','ፎቅ']):
                    labeled_tokens.append(f"{token} I-LOC")
                # Assume other tokens are part of a product name or general text
                else:
                    labeled_tokens.append(f"{token} O")
    
    return "\n".join(labeled_tokens)

In [8]:
# Apply the updated function to the non-null messages
df['Labeled_Message'] = df['Messages_cleaned'].apply(label_message_utf8_with_birr)

In [9]:

# Display the updated DataFrame
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Date,Media Path,Messages_cleaned,Labeled_Message
0,ምርጥ ዕቃ,@MerttEka,6811,2024-09-27 18:14:53+00:00,photos\@MerttEka_6811.jpg,የእናት ጡት ወተት ማጠራቀሚያ\n ከውስጥ ልብስዎ ጋር አብረው የሚለብሱትና...,የእናት B-PRODUCT\nጡት I-PRODUCT\nወተት I-PRODUCT\nማ...
1,ምርጥ ዕቃ,@MerttEka,6810,2024-09-27 14:19:03+00:00,,በቻርጅ የሚሰራ የጃር ውሃ ፓምፕ\n ለሁሉም ዓይነት የውሃ ጃሮች ይሆ...,በቻርጅ B-PRODUCT\nየሚሰራ I-PRODUCT\nየጃር I-PRODUCT\...
2,ምርጥ ዕቃ,@MerttEka,6806,2024-09-27 11:42:32+00:00,photos\@MerttEka_6806.jpg,የልብስ መተኮሻ\n በእንፏሎት የሚሰራ ባለ 3 ወፍራም ምሶሶ መቆሚያ ያለ...,የልብስ B-PRODUCT\nመተኮሻ I-PRODUCT\nበእንፏሎት O\nየሚሰራ...
3,ምርጥ ዕቃ,@MerttEka,6802,2024-09-26 16:16:51+00:00,photos\@MerttEka_6802.jpg,የመኪና መዓዛ\n በፀሃይ ብርሃን ስለሚሰራ ቻርጅ ማድረግ አይፈልግም\n ሁ...,የመኪና B-PRODUCT\nመዓዛ I-PRODUCT\nበፀሃይ O\nብርሃን I-...
4,ምርጥ ዕቃ,@MerttEka,6801,2024-09-26 12:31:45+00:00,photos\@MerttEka_6801.jpg,የፀጉር ማድረቂያ ፎን\n6000 በጣም ፈጣን\nማበጠሪያ ያለው\n ሶኬቱ ...,የፀጉር B-PRODUCT\nማድረቂያ I-PRODUCT\nፎን I-PRODUCT\...


### Saving the labeled data

In [10]:
labeled_data_birr_path = 'labeled_telegram_data.txt'
with open(labeled_data_birr_path, 'w', encoding='utf-8') as f:
    for index, row in df.iterrows():
        f.write(f"{row['Labeled_Message']}\n\n")