In [2]:
import re
import pandas as pd

# Read the Telegram dataset CSV (path is relative to the notebook's working directory)
df = pd.read_csv("../data/dataset_telegram.csv")

# Confirm the load and report the frame's dimensions and column names
print(
    "✅ Data loaded.",
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

# Show the dtype pandas inferred for each column, to decide which need converting
print("\n📊 Data types:", df.dtypes, sep="\n")

✅ Data loaded.
Dataset shape: (35168, 6)
Columns: ['Channel Title', 'Channel Username', 'ID', 'Message', 'Date', 'Media Path']

📊 Data types:
Channel Title       object
Channel Username    object
ID                   int64
Message             object
Date                object
Media Path          object
dtype: object


In [3]:
# Parse the timestamp column; errors='coerce' turns unparseable values into NaT
# instead of raising
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Cast the free-text columns to str so later string operations never see a
# non-string value.
# NOTE(review): with object columns a missing value presumably becomes the
# literal string 'nan' here - confirm; the cleaning step checks for that value.
for text_column in ('Message', 'Channel Title'):
    df[text_column] = df[text_column].astype(str)

# Report the conversion followed by the resulting dtypes
print(
    "✅ Data types converted.",
    "\n📊 Updated data types:",
    df.dtypes,
    sep="\n",
)

✅ Data types converted.

📊 Updated data types:
Channel Title                    object
Channel Username                 object
ID                                int64
Message                          object
Date                datetime64[ns, UTC]
Media Path                       object
dtype: object


In [4]:
# Define text cleaning function
def clean_text(text):
    """
    Reduce a raw message to word characters, digits and Ethiopic punctuation.

    Missing input (anything ``pd.isna`` reports as missing, or the literal
    string 'nan') yields an empty string. Otherwise the value is converted
    to ``str`` and three substitutions run in order:

    1. delete every character that is not a word character, whitespace, or
       one of the Ethiopic punctuation marks ።፡፣፤፥፦፧፨;
    2. delete ASCII letters (digits are kept, as they may be prices);
    3. collapse each run of whitespace into a single space.

    The result is returned with leading and trailing whitespace stripped.
    """
    if pd.isna(text) or text == 'nan':
        return ""

    # (pattern, replacement) pairs, applied top to bottom. Order matters:
    # deleting letters can leave adjacent spaces that step 3 then collapses.
    substitutions = (
        (r'[^\w\s።፡፣፤፥፦፧፨]', ''),
        (r'[a-zA-Z]', ''),
        (r'\s+', ' '),
    )

    result = str(text)
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# Apply text cleaning to every message
df['cleaned_text'] = df['Message'].apply(clean_text)

# Remove empty or very short messages (2 characters or fewer after cleaning).
# .copy() makes the filtered result an independent frame, so adding columns
# to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['cleaned_text'].str.len() > 2].copy()

print("✅ Text cleaning completed.")
print(f"Dataset shape after cleaning: {df.shape}")

# Show the first three rows, original next to cleaned, truncated to 100 characters
print("\n📝 Sample cleaned messages:")
for i, (_, row) in enumerate(df.head(3).iterrows(), start=1):
    print(f"{i}. Original: {row['Message'][:100]}...")
    print(f"   Cleaned:  {row['cleaned_text'][:100]}...")
    print()

✅ Text cleaning completed.
Dataset shape after cleaning: (19898, 7)

📝 Sample cleaned messages:
1. Original: 💥Miralux Hot plate
 ባለሁለት ምድጃ ስቶቭ

      💯orginal

⚡️ 2000 ዋት
⚡️ ፊውዝ የተገጠመለት
⚡️ ትልቅ ድስት መሸከም የሚችል
⚡️...
   Cleaned:  ባለሁለት ምድጃ ስቶቭ 2000 ዋት ፊውዝ የተገጠመለት ትልቅ ድስት መሸከም የሚችል አስተማማኝ ቴርሞስታት ባለ ፊውዝ ዋጋ፦ ትልቁ 2900ብር አድራሻ ቁ1 መገናኛ...

2. Original: 💥7pcs glass water set

✔️ አንድ ማራኪ ጆግና 6 መጠጫ ብርጭቆዎች
✔️ የፈሳሽ መጠጥ ማቅረቢያ
✔️ ከፍተኛ ሙቀት የሚቋቋም
✔️ ኳሊቲ ወፍራም

...
   Cleaned:  7 አንድ ማራኪ ጆግና 6 መጠጫ ብርጭቆዎች የፈሳሽ መጠጥ ማቅረቢያ ከፍተኛ ሙቀት የሚቋቋም ኳሊቲ ወፍራም ዋጋ፦ 3400 ብር ውስን ፍሬ ነው የቀረው አድራሻ ቁ1...

3. Original: 🎯 Universal water-saving dishwasher head

🔰Increase water outlet pressure and rinse efficiently.
🔰36...
   Cleaned:  360 100 ዋጋ፦ 400 ብር ውስን ፍሬ ነው ያለው አድራሻ ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢሮ ቁ 05ከ ሊፍቱ ፊት ለ ፊት ...



In [5]:
# Create the data directory if it doesn't exist. This has to run before the
# write below: to_csv does not create missing parent directories.
import os
os.makedirs("../data", exist_ok=True)

# Save cleaned data (index=False leaves the DataFrame index out of the file)
df.to_csv("../data/telegram_messages_cleaned.csv", index=False)
print("✅ Cleaned data saved to '../data/telegram_messages_cleaned.csv'")

✅ Cleaned data saved to '../data/telegram_messages_cleaned.csv'


In [6]:
# Fill df['tokenized_text'] from df['cleaned_text']: use ETNLTK's normalizer
# when the package can be imported, otherwise a regex-based split/re-join.
try:
    from etnltk.tokenize.am import word_tokenize
    from etnltk.lang.am import normalize

    print("✅ ETNLTK imported successfully")

    def normalize_amharic(text):
        """Run ETNLTK's ``normalize`` on ``text``.

        Missing, empty or 'nan' input yields ''. If ``normalize`` raises,
        the error is printed and the un-normalized string is returned, so a
        single bad row does not abort the whole column.
        """
        is_blank = pd.isna(text) or text in ('', 'nan')
        if is_blank:
            return ""
        raw = str(text)
        try:
            normalized = normalize(raw)
        except Exception as e:
            print(f"Error normalizing text: {text[:50]}... Error: {e}")
            return raw
        return normalized

    df['tokenized_text'] = df['cleaned_text'].apply(normalize_amharic)

    print("✅ Amharic normalization completed using ETNLTK")

except ImportError as e:
    print(f"⚠️  ETNLTK not available: {e}")
    print("Using basic tokenization fallback...")

    def basic_amharic_tokenize(text):
        """Split ``text`` on whitespace and Ethiopic punctuation, then re-join with spaces.

        Missing, empty or 'nan' input yields ''. The punctuation marks used
        as separators are dropped from the result.
        """
        if pd.isna(text) or text in ('', 'nan'):
            return ""

        # A run of separators counts as one delimiter; leading or trailing
        # separators leave empty pieces, which are filtered out below.
        pieces = re.split(r'[\s።፡፣፤፥፦፧፨]+', str(text))
        trimmed = (piece.strip() for piece in pieces)
        return ' '.join(piece for piece in trimmed if piece)

    df['tokenized_text'] = df['cleaned_text'].apply(basic_amharic_tokenize)
    print("✅ Basic tokenization completed")

# Collapse whitespace runs to one space, trim both ends, then make sure the
# column holds plain str values
df['tokenized_text'] = (
    df['tokenized_text']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .astype(str)
)

print(f"✅ Tokenization process completed")
print(f"Final dataset shape: {df.shape}")

# Preview the first three rows, cleaned next to tokenized, truncated to 80 characters
print("\n📝 Sample tokenized messages:")
preview = df.head(3)
sample_pairs = zip(preview['cleaned_text'], preview['tokenized_text'])
for number, (cleaned, tokenized) in enumerate(sample_pairs, start=1):
    print(f"{number}. Cleaned:   {cleaned[:80]}...")
    print(f"   Tokenized: {tokenized[:80]}...")
    print()

✅ ETNLTK imported successfully
✅ Amharic normalization completed using ETNLTK
✅ Amharic normalization completed using ETNLTK
✅ Tokenization process completed
Final dataset shape: (19898, 8)

📝 Sample tokenized messages:
1. Cleaned:   ባለሁለት ምድጃ ስቶቭ 2000 ዋት ፊውዝ የተገጠመለት ትልቅ ድስት መሸከም የሚችል አስተማማኝ ቴርሞስታት ባለ ፊውዝ ዋጋ፦ ትልቁ...
   Tokenized: ባለሁለት ምድጃ ስቶቭ 2000 ዋት ፊውዝ የተገጠመለት ትልቅ ድስት መሸከም የሚችል አስተማማኝ ቴርሞስታት ባለ ፊውዝ ዋጋ፦ ትልቁ...

2. Cleaned:   7 አንድ ማራኪ ጆግና 6 መጠጫ ብርጭቆዎች የፈሳሽ መጠጥ ማቅረቢያ ከፍተኛ ሙቀት የሚቋቋም ኳሊቲ ወፍራም ዋጋ፦ 3400 ብር ውስ...
   Tokenized: 7 አንድ ማራኪ ጆግና 6 መጠጫ ብርጭቆዎች የፈሳሽ መጠጥ ማቅረቢያ ከፍተኛ ሙቀት የሚቋቋም ኳሊቲ ወፍራም ዋጋ፦ 3400 ብር ውስ...

3. Cleaned:   360 100 ዋጋ፦ 400 ብር ውስን ፍሬ ነው ያለው አድራሻ ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢ...
   Tokenized: 360 100 ዋጋ፦ 400 ብር ውስን ፍሬ ነው ያለው አድራሻ ቁ1 መገናኛ ታሜ ጋስ ህንፃ ጎን ስሪ ኤም ሲቲ ሞል ሁለተኛ ፎቅ ቢ...

✅ Tokenization process completed
Final dataset shape: (19898, 8)

📝 Sample tokenized messages:
1. Cleaned:   ባለሁለት ምድጃ ስቶቭ 2000 ዋት ፊውዝ የተገጠመለት ትልቅ ድስት መሸከም የሚችል አስተማማኝ ቴርሞስታት ባለ ፊውዝ ዋጋ፦ ትልቁ

In [7]:
# Save tokenized data (index=False leaves the DataFrame index out of the file)
df.to_csv("../data/telegram_messages_tokenized.csv", index=False)
print("✅ Tokenized data saved to '../data/telegram_messages_tokenized.csv'")

# Create JSON output for further processing: keep only the text columns and
# rename the derived ones to *_message
json_data = df[['Channel Title', 'Message', 'cleaned_text', 'tokenized_text']].rename(
    columns={'cleaned_text': 'cleaned_message', 'tokenized_text': 'tokenized_message'}
)

# orient='records' with lines=True writes one JSON object per line (JSON Lines),
# not a single JSON array; force_ascii=False keeps the Amharic text unescaped.
json_data.to_json('../data/telegram_data.json', orient='records', force_ascii=False, lines=True)
print("✅ JSON data saved to '../data/telegram_data.json'")

# Final summary.
# df was already filtered during cleaning, so len(df) is the number of messages
# that survived cleaning, not the number originally loaded. The label says so.
processed_count = int((df['tokenized_text'].str.len() > 0).sum())
print("\n📊 Processing Summary:")
print(f"- Messages kept after cleaning: {len(df)}")
print(f"- Processed messages: {processed_count}")
print("- Files created:")
print("  • telegram_messages_cleaned.csv")
print("  • telegram_messages_tokenized.csv")
print("  • telegram_data.json")
print("\n🎉 Data preprocessing completed successfully!")

✅ Tokenized data saved to '../data/telegram_messages_tokenized.csv'
✅ JSON data saved to '../data/telegram_data.json'

📊 Processing Summary:
- Original messages: 19898
- Processed messages: 19898
- Files created:
  • telegram_messages_cleaned.csv
  • telegram_messages_tokenized.csv
  • telegram_data.json

🎉 Data preprocessing completed successfully!
✅ JSON data saved to '../data/telegram_data.json'

📊 Processing Summary:
- Original messages: 19898
- Processed messages: 19898
- Files created:
  • telegram_messages_cleaned.csv
  • telegram_messages_tokenized.csv
  • telegram_data.json

🎉 Data preprocessing completed successfully!
