In [29]:
! pip install pandas




[notice] A new release of pip is available: 24.1.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Importing Required Libraries

In [30]:
import sys
import os
import pandas as pd
import re

### Loading Abbreviations and Stopwords

In [31]:
sys.path.append(os.path.join(os.getcwd(), 'data'))

import amharic_preprocessing_data

abbreviations = amharic_preprocessing_data.abbreviations_dictionary_data

stopwords = amharic_preprocessing_data.amharic_stopwords

### Displaying Sample Abbreviations

In [32]:
for key, value in list(abbreviations.items())[:10]:
    print(f"{key}: {value}")

ዶ/ር: ዶክተር
ዶር: ዶክተር
ጠ/ሚ: ጠቅላይ ሚኒስቴር
ጠ/ሚኒስትር: ጠቅላይ ሚኒስትር
ጠ/ሚኒስትርነት: ጠቅላይ ሚኒስትርነት
ጠ/ፍ/ቤት: ጠቅላይ ፍርድ ቤት
ፌ/ም/ቤት: ፌደራል ምክር ቤት
ፍ/ቤቱ: ፍርድ ቤቱ
ፍ/ቤት: ፍርድ ቤት
ፕ/ት: ፕሬዚዳንት


### Displaying Sample Stopwords

In [33]:
stopwords[:10]

['ሰሞን', 'ወዲያ', 'ጋር', 'የእሷ', 'ወይስ', 'እናንተ', 'እኔ', 'እየኖርኩ', 'እንዴት', 'ት']

### Loading Dataset

In [34]:
data = pd.read_csv(os.path.join('data', 'dataset.csv'))

for tweet in data['tweet'].head(10):
    print(tweet.ljust(50))

rows, columns = data.shape

print(f"\n Number of rows in the dataset: {rows}")

ልዩ የተፈጥሮ ገፅታ *****የምስራቅ አፍሪካ የውሃ ማማ ጮቄ            
@bobomaheder Global citizenንነቴ እና ኢትዮጵያዊነቴ ተምታተውብኝ አያውቁም የሚል ይታከልበት 😉
RT @TechinEthiopia: ሐሴትን ከበጎነት !!! የሰው ልጅ በምድር ሲመላለስ ታላቅ የመንፈስ እርካታን ከሚያጎናፅፉት ተግባራት መካከል አንዱ ለተቸገረ መድህን፤ ለወገንም አለኝታ መሆን ሲችል እንደሆነ ብዙዎቻችንን…
ለቅዳሚታችሁ 💚💛❤ አሁን በምትሰሩት ማንኛውም ነገር ምሉእነትና እርካታ ይሰማችሁ! የአሁንነት ሀይል -ለመንፈሳዊ የእውቀት ብርሃን መመሪያ በኤክሀርት ቶሌየተዘጋጀ መጽሐፍ ነው። https://t.co/RTi8kKKrd0 https://t.co/4YTikE0Amc
@tesfamaryam21 40/60 ኮንዶሚኒየም ከሆነ እመልሳለሁ ካልሆነ መልሴን አላባክንም!
@Jeberara1 ስንታየሁም ሆነ ኤልያስ ፓርላማ ለመግባት ምን ያንሳቸዋል??? ደግሞ ሁለት ሰው ብቻ ይዞ ፓርቲ ይመሰረታል ያለህ ማነው? መቸም በእስክንድር ላይ በየምክኒያቱ ዱላ የማያነሳ የለም?! በመርህ የሚመራ አንድ ፅኑ ሰው ቢኖር አልቆም አላስቀምጥ አላችሁት።
@woldeyes_t Tesfaye ለካስ ጭብል ለብሰሽ የፕሮፌሰርን ፎቶ ለጥፈክ እልም ያልክ ባዳ ነክ እፈር ትንሽ
ታኅሣሥ 10 ቀን 2012 በሞጣ ከተማ አስተዳደር በመስጅዶችና ሱቆች ላይ የደረሰውን የእሳት ቃጠሎ በማውገዝ የእስልምና ሃይማኖት ተከታዮች በባሕር ዳር ሠላማዊ ሠልፍ እያካሄዱ ይገኛሉ፡፡ #Ethiopia #EthioMuslim https://t.co/bd5u0Rutn4
አሜሪካ ሱሌይማኒን በመግደል ቀይ መስመሩን ሳታልፍ አይቀርም። ዜጎቿ አገሪቱን ለቀው እንዲወጡ አዝዛለች። ጀለሶቿ ደግሞ restrain እና de-escalate እያሉ መለማመጥ ጀምረዋል።
@has

### Normalizing Character-Level Mismatches

In [35]:
def normalize_char_level_missmatch(input_token):
    """
    Normalizes character-level mismatches in Amharic text.

    Parameters:
        input_token (str): The Amharic text to normalize.

    Returns:
        str: The normalized text.
    """
    rep1 = re.sub('[ሃኅኃሐሓኻ]', 'ሀ', input_token)
    rep2 = re.sub('[ሑኁዅ]', 'ሁ', rep1)
    rep3 = re.sub('[ኂሒኺ]', 'ሂ', rep2)
    rep4 = re.sub('[ኌሔዄ]', 'ሄ', rep3)
    rep5 = re.sub('[ሕኅ]', 'ህ', rep4)
    rep6 = re.sub('[ኆሖኾ]', 'ሆ', rep5)
    rep7 = re.sub('[ሠ]', 'ሰ', rep6)
    rep8 = re.sub('[ሡ]', 'ሱ', rep7)
    rep9 = re.sub('[ሢ]', 'ሲ', rep8)
    rep10 = re.sub('[ሣ]', 'ሳ', rep9)
    rep11 = re.sub('[ሤ]', 'ሴ', rep10)
    rep12 = re.sub('[ሥ]', 'ስ', rep11)
    rep13 = re.sub('[ሦ]', 'ሶ', rep12)
    rep14 = re.sub('[ዓኣዐ]', 'አ', rep13)
    rep15 = re.sub('[ዑ]', 'ኡ', rep14)
    rep16 = re.sub('[ዒ]', 'ኢ', rep15)
    rep17 = re.sub('[ዔ]', 'ኤ', rep16)
    rep18 = re.sub('[ዕ]', 'እ', rep17)
    rep19 = re.sub('[ዖ]', 'ኦ', rep18)
    rep20 = re.sub('[ጸ]', 'ፀ', rep19)
    rep21 = re.sub('[ጹ]', 'ፁ', rep20)
    rep22 = re.sub('[ጺ]', 'ፂ', rep21)
    rep23 = re.sub('[ጻ]', 'ፃ', rep22)
    rep24 = re.sub('[ጼ]', 'ፄ', rep23)
    rep25 = re.sub('[ጽ]', 'ፅ', rep24)
    rep26 = re.sub('[ጾ]', 'ፆ', rep25)
    rep27 = re.sub('(ሉ[ዋአ])', 'ሏ', rep26)
    rep28 = re.sub('(ሙ[ዋአ])', 'ሟ', rep27)
    rep29 = re.sub('(ቱ[ዋአ])', 'ቷ', rep28)
    rep30 = re.sub('(ሩ[ዋአ])', 'ሯ', rep29)
    rep31 = re.sub('(ሱ[ዋአ])', 'ሷ', rep30)
    rep32 = re.sub('(ሹ[ዋአ])', 'ሿ', rep31)
    rep33 = re.sub('(ቁ[ዋአ])', 'ቋ', rep32)
    rep34 = re.sub('(ቡ[ዋአ])', 'ቧ', rep33)
    rep35 = re.sub('(ቹ[ዋአ])', 'ቿ', rep34)
    rep36 = re.sub('(ሁ[ዋአ])', 'ኋ', rep35)
    rep37 = re.sub('(ኑ[ዋአ])', 'ኗ', rep36)
    rep38 = re.sub('(ኙ[ዋአ])', 'ኟ', rep37)
    rep39 = re.sub('(ኩ[ዋአ])', 'ኳ', rep38)
    rep40 = re.sub('(ዙ[ዋአ])', 'ዟ', rep39)
    rep41 = re.sub('(ጉ[ዋአ])', 'ጓ', rep40)
    rep42 = re.sub('(ደ[ዋአ])', 'ዷ', rep41)
    rep43 = re.sub('(ጡ[ዋአ])', 'ጧ', rep42)
    rep44 = re.sub('(ጩ[ዋአ])', 'ጯ', rep43)
    rep45 = re.sub('(ጹ[ዋአ])', 'ጿ', rep44)
    rep46 = re.sub('(ፉ[ዋአ])', 'ፏ', rep45)
    rep47 = re.sub('[ቊ]', 'ቁ', rep46)
    rep48 = re.sub('[ኵ]', 'ኩ', rep47)
    return rep48

### Expanding Short Forms

In [36]:
def expand_short_form(input_short_word):
    """
    Expands abbreviations in Amharic text using a provided dictionary.

    Parameters:
        input_short_word (str): The word to expand.
        short_form_dict (dict): A dictionary with abbreviations as keys and their expansions as values.

    Returns:
        str: The expanded word if found in the dictionary, otherwise the original word.
    """
    return abbreviations.get(input_short_word, input_short_word)

### Removing English Characters

In [37]:
def remove_english(text):
    """
    Removes English characters from a given text.

    Parameters:
        text (str): The input text.

    Returns:
        str: The text with English characters removed.
    """
    if isinstance(text, str):
        return re.sub(r'[a-zA-Z]', '', text)
    return text

### Removing Punctuation and Special Characters

In [38]:
def remove_punc_and_special_chars(text): 
    """
    Removes punctuation and special characters from a given text.

    Parameters:
        text (str): The input text.

    Returns:
        str: The text with punctuation and special characters removed.
    """
    normalized_text = re.sub(r'[!@#\$%\^\«\»&\*\(\)…\[\]\{\};“”›’‘"\':,.\‹/\<\>\?\\|\`\´~\-=+\፡።፤;፦፥፧፨፠፣_]', '', text)
    return normalized_text

### Removing ASCII Characters and Numbers

In [39]:
def remove_ascii_and_numbers(text_input):
    """
    Removes all ASCII characters, English numbers, and Amharic/Arabic numbers from the input text.

    Parameters:
        text_input (str): The input text to process.

    Returns:
        str: The text with ASCII characters and numbers removed.
    
    Notes:
        - ASCII characters include all English letters (A-Z, a-z) and digits (0-9).
        - Amharic/Arabic numbers are removed based on their Unicode range (U+1369 to U+137C).
        - This function is helpful for preprocessing text where only specific non-numeric and non-ASCII 
          characters (e.g., Amharic script) are required.
    """
    rm_num_and_ascii = re.sub('[A-Za-z0-9]', '', text_input)
    
    cleaned_text = re.sub('[\u1369-\u137C]+', '', rm_num_and_ascii)
    
    return cleaned_text

### Trimming Whitespace

In [40]:
def remove_whitespace(text):
    """
    Removes whitespace from the start and end of the given text.

    Parameters:
        text (str): The input text.

    Returns:
        str: The trimmed text with no leading or trailing whitespace.
    """
    return text.strip()

### Removing URLs

In [41]:
def remove_urls(text):
    """
    Removes URLs, including shortened links like 'https://t.co/...', from the input text.

    Parameters:
        text (str): The input text to process.

    Returns:
        str: The text with URLs removed.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    
    cleaned_text = re.sub(url_pattern, '', text)
    
    return cleaned_text

### Removing Stopwords

In [42]:
def remove_stopwords(text):
    """
    Removes stopwords from the text.

    Parameters:
        text (str): The input text.
        stopwords (list): A list of stopwords.

    Returns:
        str: The text without stopwords.
    """
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stopwords]
    return ' '.join(filtered_tokens)

### Removing Emojis

In [43]:
def remove_emojis(text):
    """
    Removes emojis from the input text.

    Parameters:
        text (str): The input text.

    Returns:
        str: The text with emojis removed.
    """
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # emotions
                                 "\U0001F300-\U0001F5FF"  # symbols & pictographs
                                 "\U0001F680-\U0001F6FF"  # transport & map symbols
                                 "\U0001F700-\U0001F77F"  # alchemical symbols
                                 "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                                 "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                                 "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                                 "\U0001FA00-\U0001FA6F"  # Chess Symbols
                                 "\U00002702-\U000027B0"  # Dingbats
                                 "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

### Cleaning Amharic Text

In [44]:
def clean_amharic_text(text):
    """
    Cleans the given Amharic text by applying various preprocessing steps.

    Parameters:
        text (str): The input Amharic text to clean.

    Returns:
        str: The cleaned text after applying all preprocessing steps.
    """
    text = remove_urls(text)
    text = remove_english(text)
    text = remove_emojis(text)
    text = remove_ascii_and_numbers(text)
    text = remove_punc_and_special_chars(text)
    text = normalize_char_level_missmatch(text)
    text = expand_short_form(text)
    text = remove_stopwords(text)
    text = remove_whitespace(text)
    return text

### Applying Cleaning Function to Dataset

In [45]:
data['cleaned_tweet'] = data['tweet'].apply(clean_amharic_text)

### Displaying Cleaned Tweets

In [46]:
print("\nFirst 10 cleaned tweets:")
for index, tweet in enumerate(data['cleaned_tweet'].head(10)):
    print(f"{index}. {tweet.ljust(50)}")


First 10 cleaned tweets:
0. ልዩ የተፈጥሮ ገፅታ የምስራቅ አፍሪካ የውሀ ማማ ጮቄ                 
1. ኢትዮጵያዊነቴ ተምታተውብኝ አያውቁም የሚል ይታከልበት                 
2. ሀሴትን ከበጎነት የሰው ልጅ በምድር ሲመላለስ ታላቅ የመንፈስ እርካታን ከሚያጎናፅፉት ተግባራት አንዱ ለተቸገረ መድህን ለወገንም አለኝታ መሆን ሲችል ብዙዎቻችንን
3. ለቅዳሚታችሁ በምትሰሩት ማንኛውም ነገር ምሉእነትና እርካታ ይሰማችሁ የአሁንነት ሀይል ለመንፈሳዊ የእውቀት ብርሀን መመሪያ በኤክሀርት ቶሌየተዘጋጀ መፅሀፍ
4. ኮንዶሚኒየም ከሆነ እመልሳለሁ ካልሆነ መልሴን አላባክንም               
5. ስንታየሁም ኤልያስ ፓርላማ ለመግባት ያንሳቸዋል ደግሞ ሁለት ሰው ይዞ ፓርቲ ይመሰረታል ያለህ ማነው መቸም በእስክንድር በየምክኒያቱ ዱላ የማያነሳ የለም በመርህ የሚመራ ፅኑ ሰው ቢኖር አልቆም አላስቀምጥ አላችሁት
6. ለካስ ጭብል ለብሰሽ የፕሮፌሰርን ፎቶ ለጥፈክ እልም ያልክ ባዳ ነክ እፈር    
7. ታሀሳስ ቀን በሞጣ ከተማ አስተዳደር በመስጅዶችና ሱቆች የደረሰውን የእሳት ቃጠሎ በማውገዝ የእስልምና ሀይማኖት ተከታዮች በባህር ሰላማዊ ሰልፍ እያካሄዱ ይገኛሉ
8. አሜሪካ ሱሌይማኒን በመግደል ቀይ መስመሩን ሳታልፍ አይቀርም ዜጎቿ አገሪቱን ለቀው እንዲወጡ አዝዛለች ጀለሶቿ ደግሞ እያሉ መለማመጥ ጀምረዋል
9. ይሄው አይደል የእውቀትሽ ጥግበሰሚ ሰሚ ከምትናገሪ ታሪክ አታነቢምደሞ ራስሽን አታስገምቺ
