### Import necessary libraries


In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer
import os

### Load the data


In [None]:
# Read the CSV at `data_path` into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the running process.
data_path = '../data/telegram_data.csv'
df = pd.read_csv(data_path)

# Preview the first rows; as the last expression of the cell, the result is
# what gets displayed.
df.head()

In [None]:
# Initialize the pre-trained tokenizer for the "xlm-roberta-base" checkpoint
# (chosen as a multilingual model that supports Amharic). It is kept at module
# level because tokenize_amharic() looks it up by this name.
# NOTE(review): presumably downloads the tokenizer files on first use and
# reads them from a local cache afterwards — confirm for offline runs.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
def tokenize_amharic(text):
    """Tokenize ``text`` with the module-level ``tokenizer``.

    Args:
        text: The string to split into tokens.

    Returns:
        The result of ``tokenizer.tokenize(text)``, unchanged.
    """
    tokens = tokenizer.tokenize(text)
    return tokens

In [10]:
def clean_amharic_text(text):
    """Strip punctuation/symbols from ``text`` and normalize its whitespace.

    Args:
        text: The raw message. Any non-string value (e.g. ``None`` or a float
            NaN coming from a DataFrame cell) is treated as an empty message.

    Returns:
        A string containing only word characters and single spaces, with no
        leading or trailing whitespace. ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # `\w` is Unicode-aware for str patterns, so Ethiopic (Amharic) letters and
    # digits are kept; everything that is neither a word character nor
    # whitespace is deleted. The backslashes must be single inside a raw
    # string: r'\\w' would match a literal backslash followed by 'w'.
    # NOTE: removed characters are deleted, not replaced by a space, so words
    # separated only by punctuation are joined together.
    without_symbols = re.sub(r'[^\w\s]', '', text)

    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    return re.sub(r'\s+', ' ', without_symbols).strip()

### Preprocess Amharic text

In [None]:
def preprocess_data(df):
    """Add cleaned-text and token columns to ``df``.

    The DataFrame is modified in place: ``Message`` has its missing values
    replaced by ``''``, ``cleaned_text`` holds ``clean_amharic_text`` applied
    to each message, and ``tokens`` holds ``tokenize_amharic`` applied to each
    cleaned message.

    Args:
        df: DataFrame with a ``Message`` column.

    Returns:
        The same ``df`` object, with the columns described above.
    """
    # Replace missing messages first so every later step sees a value.
    df['Message'] = df['Message'].fillna('')

    cleaned = df['Message'].apply(clean_amharic_text)
    df['cleaned_text'] = cleaned

    tokenized = df['cleaned_text'].apply(tokenize_amharic)
    df['tokens'] = tokenized

    return df


### Run the preprocessing


In [None]:
# Run the preprocessing pipeline. preprocess_data() adds its columns to `df`
# itself and returns that same object, so `df_preprocessed` and `df` refer to
# one DataFrame.
df_preprocessed = preprocess_data(df)
# Preview the first rows, now including `cleaned_text` and `tokens`.
df_preprocessed.head()

### Save the preprocessed data for later tasks

In [None]:
# Save the preprocessed data for later tasks.
# index=False leaves the DataFrame's row index out of the file.
# NOTE(review): the `tokens` column presumably holds Python lists, which CSV
# stores as their string representation — readers would need to parse them
# back; confirm this is acceptable for the later tasks.
output_path = '../data/preprocessed_telegram_data.csv'
df_preprocessed.to_csv(output_path, index=False)

print("Preprocessing complete. Data saved to", output_path)