In [6]:
import pandas as pd
import re

In [9]:
# Load the CSV file into a DataFrame
cleaned_amharic_text = pd.read_csv('../../src/Data/clean_data/Leyueqa_clean_data.csv')

# Ensure the DataFrame is loaded correctly by displaying its columns
print("Columns in DataFrame:", cleaned_amharic_text.columns)

# Access the 'Cleaned_Message' column
lines = cleaned_amharic_text['Cleaned_Message']

# Normalize and process lines
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters (keep only Amharic characters)
    text = re.sub(r'[^ሀ-ሐ-መ-ዐ-ዔ-ፈ-ፌ\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply normalization to the first 40 messages
normalized_lines = [normalize_text(line) for line in lines[:40] if isinstance(line, str)]

# Process lines: separate tokens and labels
data = [line.strip().split() for line in normalized_lines if line.strip()]  # Split based on spaces
tokens = [item[0] for item in data]  # Extract tokens (first item of each line)
labels = [item[1] for item in data if len(item) > 1]  # Extract labels (second item if it exists)

# Print tokens and labels
print("Tokens:", tokens)
print("Labels:", labels)

Columns in DataFrame: Index(['Channel Title', 'Channel Username', 'ID', 'Message', 'Date',
       'Cleaned_Message'],
      dtype='object')
Tokens: ['ሞረድ', 'ቻናላችንን', 'ሊትር', 'ድስቶች', 'የሱሪ', 'ክል', 'የአኩሪ', 'ዋጋ', 'የመፀዳጃ', 'ቀነሰ', 'የችበስመጥበሻ', 'ድብርትን', 'መካኒካል', 'የንኬክ', 'ዘመናዊ', 'አነስተኛ', 'አነስተኛ', 'ክያዎንበሞባይልባንኪንግመፈፀምምይችላሉ', 'ሬ', 'የዉሃ', 'ባለብዙጥቅም', 'በቢምቢ', 'ዋጋ', 'የቡና', 'እና', 'ለ', 'ላስቲክ', 'ጉልበት', 'መብራት', 'ነፃ', 'የጁስ', 'ማራኪ', 'የሪጅዎን', 'ኪችንዎትን', 'የቤታችንን', 'ባለ', 'ዉስን', 'የተለያየ', 'ዘመናዊ', 'ነገ']
Labels: ['በተለያየ', 'ለጓደኛዎ', 'የሚይዙ', 'ወጥ', 'ማስቀመጫ', 'ያለዉ', 'አተር', 'ከነፃ', 'ቤት', 'ለፀጉሮ', 'የሚይዘው', 'የህመም', 'ቴላቴሊ', 'መስሪያ', 'የልብስ', 'የቤት', 'የቤት', 'በተጨማሪ', 'የዘይት', 'ጆክ', 'የመስታወት', 'አልተቸገሩም', 'ብር', 'ሪሪሆንግ', 'ሊትር', 'ተለያዩ', 'የምሳ', 'ያለው', 'ሲጠ', 'ዲሊቨሪ', 'የአትክልትሽንኩርት', 'እና', 'ቦታ', 'ውብ', 'ውበት', 'አንድ', 'የቢላ', 'መጠን', 'አዲስ', 'እሁድ']


In [None]:
class ProductCatalog:
    def __init__(self):
        # Product and location lists
        self.b_product = ['ሞረድ', 'ድስቶች', 'የሱሪ', 'የአኩሪ', 'የመፀዳጃ', 'የችበስመጥበሻ', 'የንኬክ', 'የዉሃ',  'የቡና', 'ላስቲክ', 'መብራት', 'የጁስ', 'ወጥ', 'ማስቀመጫ', 'ጆክ', 'የመስታወት', 'ሪሪሆንግ', 'የአትክልትሽንኩርት', 'የቢላ']

        self.b_location = []

    def search_product(self, query):
        """
        Search for products that match the query string using a regular expression.
        """
        pattern = re.compile(query)
        result = [product for product in self.b_product if pattern.search(product)]
        return result

    def search_location(self, query):
        """
        Search for locations that match the query string using a regular expression.
        """
        pattern = re.compile(query)
        result = [location for location in self.b_location if pattern.search(location)]
        return result

    def label_message_utf8_with_birr(self, messages):
        all_rows = []  # To hold rows of tokens and labels

        for message in messages:
            if pd.isna(message):  # Check for NaN values
                continue  # Skip NaN messages

            message = str(message)  # Ensure the message is a string

            # Split the message at the first occurrence of '\n'
            if '\n' in message:
                first_line, remaining_message = message.split('\n', 1)
            else:
                first_line, remaining_message = message, ""

            labeled_tokens = []

            # Tokenize the first line
            first_line_tokens = re.findall(r'\S+', first_line)

            # Label the first token as B-PRODUCT and the rest as I-PRODUCT
            if first_line_tokens:
                labeled_tokens.append((first_line_tokens[0], 'B-PRODUCT'))  # First token as B-PRODUCT
                for token in first_line_tokens[1:]:
                    labeled_tokens.append((token, 'I-PRODUCT'))  # Remaining tokens as I-PRODUCT

            # Process the remaining message normally
            if remaining_message:
                lines = remaining_message.split('\n')
                for line in lines:
                    tokens = re.findall(r'\S+', line)  # Tokenize each line

                    for token in tokens:
                        # Check if token is a price (e.g., 500 ETB, $100, or ብር)
                        if re.match(r'^\d{10,}$', token):
                            labeled_tokens.append((token, 'O'))  # Label as O for "other" or outside of any entity
                        elif re.match(r'^\d+(\.\d{1,2})?$', token) or 'ETB' in token or 'ዋጋ' in token or '$' in token or 'ብር' in token:
                            labeled_tokens.append((token, 'I-PRICE'))
                        # Check if token could be a location (e.g., cities or general location names)
                        elif any(loc in token for loc in ['ዲሊቨሪ']):
                            labeled_tokens.append((token, 'B-LOC'))
                        # Check if token could be a location (e.g., specific location names)
                        elif any(loc in token for loc in ['ደራርቱ', 'ህንፃ', 'ጎን', 'መሰረት', 'ደፋር', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ', 'ጊዮርጊስ', 'አደባባይ', 'ሱቅ', 'ራመት', 'ታቦር', 'ኦዳ', 'ህንፃ', 'ራመት_ታቦር_ኦዳ_ህንፃ', 'ራመት_ታቦር', 'ራመት_ታቦር_ኦዳ' 'ቅርንጫፍ']):
                            labeled_tokens.append((token, 'I-LOC'))
                        # Assume other tokens are part of a product name or general text
                        else:
                            labeled_tokens.append((token, 'O'))

            # Store tokens and their labels in a single row
            tokens_row = [token for token, label in labeled_tokens]
            labels_row = [label for token, label in labeled_tokens]
            all_rows.append({'Tokens': tokens_row, 'Labels': labels_row})

        # Create a DataFrame from all rows
        df = pd.DataFrame(all_rows)

        return df

# Example usage with your DataFrame
messages = cleaned_amharic_text['Cleaned_Message'].tolist()  # Convert the column to a list
catalog = ProductCatalog()
labeled_df = catalog.label_message_utf8_with_birr(messages)

# Print the output in table format
print(labeled_df)
# Display the DataFrame as a table
print("Tokens and Labels Table:")
for index, row in labeled_df.iterrows():
    print(f"Row {index + 1}:")
    print("Tokens:", row['Tokens'])
    print("Labels:", row['Labels'])
    print("-" * 50)  # Separator for better readability

                                                 Tokens  \
0     [ሞረድ, በተለያየ, መጠን, ቢላዎች, መቀስ, መከትከቻ, ከማስቀመጫው, ጋ...   
1            [ቻናላችንን, ለጓደኛዎ, ሸር, ማድረግዎን, አይርሱ, @, @, @]   
2     [ሊትር, የሚይዙ, ሁለት, ቋቶች, አንዱ, የብረት, አንዱ, ደግሞ, ወፍራ...   
3     [ድስቶች, +, ወጥ, ማቅረቢያ, +, መጥበሻ, የማይዙ, የማያሳርሩ, ለይ...   
4     [የሱሪ, ማስቀመጫ, የሱሪ, ማስቀመጫ, ሱሪዎ, ወይም, ልብስዎ, ባለበት,...   
...                                                 ...   
1274  [ኮድ, ዋጋ፦, ብር, ከነፃ, ዲሊቨሪ, ጋር, #ክፍያዎንበሞባይልባንኪንግመ...   
1275  [ኦሪጅናል, ላስቲክ, ዉስጥ, ያለዉን, አየር, ስቦ, ማሸግ, የሚችል, የ...   
1276  [ትክክለኛ, አድራሻችንን, በጎግል, ማፕ, ሊንኩን, ተጭነዉ, ሱቆቻችን, ...   
1277  [°, አድራሻዎቻችን, አአ, ቁጥር, ልደታ, ባልቻ, ሆስፒታል, ጀርባ, አ...   
1278                                                 []   

                                                 Labels  
0     [O, O, O, O, O, O, O, O, I-PRICE, I-PRICE, O, ...  
1                              [O, O, O, O, O, O, O, O]  
2     [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
3     [B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I...  
4

In [12]:
# Function to save the DataFrame in CoNLL format
def save_to_conll(df, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for index, row in df.iterrows():
            for token, label in zip(row['Tokens'], row['Labels']):
                f.write(f"{token} {label}\n")
            f.write("\n")  # Blank line to separate sentences/messages

# Example usage with your DataFrame
messages = cleaned_amharic_text['Cleaned_Message'].tolist()  # Convert the column to a list
catalog = ProductCatalog()
labeled_df = catalog.label_message_utf8_with_birr(messages)

# Save the labeled DataFrame to a CoNLL formatted file
conll_filename = 'LeyueqaLabeled_data.conll'
save_to_conll(labeled_df, conll_filename)

print(f"Data saved to {conll_filename} in CoNLL format.")

Data saved to LeyueqaLabeled_data.conll in CoNLL format.
