### Import necessary libraries


In [1]:
import pandas as pd
import re
from transformers import AutoTokenizer
import os

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Load the data


In [2]:
# Load the data
data_path = '../data/telegram_data.csv'
df = pd.read_csv(data_path)

# Display the first few rows
df.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4192,የፊታችን አርብ ጥር 9 ከምንፈታቸው የጂም ታይት መካከል ለሳምፕል,2025-01-14 19:21:55+00:00,
1,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4190,የፊታችን አርብ ጥር 9 ከምንፈታቸው የበጋ ክር ሹራብ መካከል ለሳምፕል,2025-01-14 19:15:26+00:00,
2,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4189,የፊታችን 🔥ዓርብ🔥 ጥር 9 👌\n\nውድ የሀይሚ ቦንዳ ቤተሰቦች የዛሬ ሳም...,2025-01-13 14:36:42+00:00,
3,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4188,💪ውድ የሀይሚ ቦንዳ ቤተሰቦች የቲክቶክ አድራሻችን ላይ ለሳምፕል ብዙ ቩድ...,2025-01-10 11:26:52+00:00,
4,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4187,,2025-01-08 18:06:52+00:00,


In [3]:
# Initialize the pre-trained tokenizer (multilingual, supports Amharic)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [4]:
# Tokenization function for Amharic text
def tokenize_amharic(text):
    return tokenizer.tokenize(text)

In [5]:
def clean_amharic_text(text):
    if not isinstance(text, str):
        text = ''  # Convert non-string values (e.g., NaN, float) to empty string
    # Replace unwanted characters, normalize spaces, etc.
    text = re.sub(r'[^\\w\\s]', '', text)  # Removing special characters
    text = re.sub(r'\\s+', ' ', text).strip()  # Normalize whitespace
    return text

###  Preprocess Amharic text

In [6]:
def preprocess_data(df):
    df['Message'] = df['Message'].fillna('')  ## Handle NaN values
    df['cleaned_text'] = df['Message'].apply(clean_amharic_text)
    df['tokens'] = df['cleaned_text'].apply(tokenize_amharic)
    return df


### Run the preprocessing


In [7]:
df_preprocessed = preprocess_data(df)
df_preprocessed.head()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,cleaned_text,tokens
0,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4192,የፊታችን አርብ ጥር 9 ከምንፈታቸው የጂም ታይት መካከል ለሳምፕል,2025-01-14 19:21:55+00:00,,,[]
1,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4190,የፊታችን አርብ ጥር 9 ከምንፈታቸው የበጋ ክር ሹራብ መካከል ለሳምፕል,2025-01-14 19:15:26+00:00,,,[]
2,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4189,የፊታችን 🔥ዓርብ🔥 ጥር 9 👌\n\nውድ የሀይሚ ቦንዳ ቤተሰቦች የዛሬ ሳም...,2025-01-13 14:36:42+00:00,,,[]
3,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4188,💪ውድ የሀይሚ ቦንዳ ቤተሰቦች የቲክቶክ አድራሻችን ላይ ለሳምፕል ብዙ ቩድ...,2025-01-10 11:26:52+00:00,,swwwsws,"[▁s, www, s, ws]"
4,ሀይሚ ቦንዳ/Haymi Bonda,Yebonda_libsoch,4187,,2025-01-08 18:06:52+00:00,,,[]


### Save the preprocessed data for later tasks

In [8]:
# Save the preprocessed data for later tasks
output_path = '../data/preprocessed_telegram_data.csv'
df_preprocessed.to_csv(output_path, index=False)

print("Preprocessing complete. Data saved to", output_path)

Preprocessing complete. Data saved to ../data/preprocessed_telegram_data.csv
