Exercise 1: Build a Combined English + Amharic Pipeline. Use paper-based stop words, tokenizers, and stemmers. Include the resources and papers you used in the Colab notebook.

1.1 Detect language automatically

1.2 Apply correct pipeline

1.3 Output value

In [None]:
!pip install nltk



In [None]:
import nltk, re, unicodedata
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## return type definition

In [None]:
class PreProcessedResult:
  """Plain container for the outputs of one preprocessing run.

  Attributes:
    lang: language tag assigned to the text (e.g. "en", "am", "unknown").
    original_text: the text associated with this result.
    stemmed_text: list of tokens after stemming.
    lemmatized_text: list of tokens after lemmatization.
  """

  def __init__(self, lang, original_text, stemmed_text, lemmatized_text):
    self.lang = lang
    self.original_text = original_text
    self.stemmed_text = stemmed_text
    self.lemmatized_text = lemmatized_text

  def __repr__(self):
    # Readable representation so results can be inspected directly in a
    # notebook cell instead of showing an opaque object address.
    return (
      f"{type(self).__name__}("
      f"lang={self.lang!r}, "
      f"original_text={self.original_text!r}, "
      f"stemmed_text={self.stemmed_text!r}, "
      f"lemmatized_text={self.lemmatized_text!r})"
    )

## English Text processing and Normalization

In [None]:
def english_preprocessing(text):
  """Run the English pipeline: lowercase, strip punctuation, tokenize,
  drop stop words, then stem and lemmatize the remaining tokens.

  Args:
    text: raw English input string.

  Returns:
    PreProcessedResult tagged "en" holding the untouched input text, the
    Porter-stemmed tokens and the WordNet-lemmatized tokens. Both token
    lists are derived from the same stop-word-filtered token list.
  """
  # Work on a lowercased copy; `text` itself is kept intact so that
  # original_text really is the original (the Amharic pipeline does the same).
  lowered = text.lower()

  # Remove punctuation: every character that is neither a word character nor
  # whitespace is deleted (so "full-time" becomes "fulltime").
  no_punc_text = re.sub(r'[^\w\s]', '', lowered)

  # Tokenize into words.
  tokens = word_tokenize(no_punc_text)

  # Stop word removal.
  stop_words = set(stopwords.words("english"))
  content_tokens = [w for w in tokens if w not in stop_words]

  # Stemming.
  stemmer = PorterStemmer()
  stemmed_text = [stemmer.stem(w) for w in content_tokens]

  # Lemmatization.
  lemmatizer = WordNetLemmatizer()
  lemmatized_text = [lemmatizer.lemmatize(w) for w in content_tokens]

  return PreProcessedResult("en", text, stemmed_text, lemmatized_text)

# Run the English pipeline on a sample sentence. The result is stored in
# `result` but not printed in this cell.
eng_text = "Hello! This is nahom, an incoming full-time software engineer at bloomberg!"
result = english_preprocessing(eng_text)


## Amharic Text Processing and Normalization

In [None]:
# remove punctuation
def remove_punc(text):
  """Strip ASCII, typographic-quote and Ethiopic punctuation from ``text``.

  All punctuation characters are deleted, except the Ethiopic wordspace
  (U+1361), which is replaced by a regular space because it separates words;
  deleting it would fuse the neighbouring words into one token.

  Args:
    text: input string.

  Returns:
    The string with punctuation removed. Whitespace is otherwise untouched.
  """
  # Local import: only `re`, `nltk` and `unicodedata` are imported at file level.
  import string

  wordspace = "፡"
  # Remaining Ethiopic punctuation (U+1360, U+1362..U+1368).
  ethiopic_punct = "፠።፣፤፥፦፧፨"
  # Curly / angle quotes are not part of string.punctuation.
  typographic_quotes = "“”‘’«»‹›"

  # One translation table does the whole job in a single pass: characters in
  # the third argument are deleted, the wordspace is mapped to a space.
  table = str.maketrans("", "", string.punctuation + ethiopic_punct + typographic_quotes)
  table[ord(wordspace)] = " "
  return text.translate(table)


# stop word removal
def remove_stop_words(text, stop_word_drive_path="/content/drive/MyDrive/datasets/amstopwords.txt"):
  """Filter stop words out of a sequence of tokens.

  Args:
    text: iterable of tokens (despite the name, not a raw string).
    stop_word_drive_path: path to a text file with one stop word per line.

  Returns:
    List of the tokens that are not listed in the stop-word file, in their
    original order.

  Raises:
    OSError: if the stop-word file cannot be opened.
  """
  # Explicit encoding: the file holds Ethiopic text, so do not depend on the
  # platform default. "utf-8-sig" also tolerates a leading byte-order mark.
  with open(stop_word_drive_path, "r", encoding="utf-8-sig") as f:
    # Strip each line so trailing spaces / "\r" do not prevent a match, and
    # use a set for O(1) membership tests.
    stop_words = {line.strip() for line in f}
  stop_words.discard("")  # blank lines are not stop words
  return [w for w in text if w not in stop_words]



In [None]:
# Amharic stemming
# Single-character suffixes removed by the simple stemmer, tried in this order.
stemming_suffixes = ["ን", "ው", "ች", "ት"]


def simple_am_stem(word):
    """Strip the first matching suffix from ``stemming_suffixes`` off ``word``.

    At most one suffix is removed. A word that consists of nothing but the
    suffix is returned unchanged, so the stemmer never yields an empty token.

    Args:
        word: a single token.

    Returns:
        The token without its suffix, or the token itself if no suffix matched.
    """
    for s in stemming_suffixes:
        if word.endswith(s):
            # `or word`: stripping would leave "", keep the token instead.
            return word[:-len(s)] or word
    return word
def amahric_stemming(text):
  """Apply ``simple_am_stem`` to every token in ``text`` and return the
  results as a list, in the same order."""
  return list(map(simple_am_stem, text))

In [None]:
# Amharic lemmatization
amharic_suffixes = [
  "ነው", "ሁሉ", "ች", "ዎች", "ን", "ው", "ም", "ኩ", "ህ", "ሽ",
  "ኛ", "ኝ", "ት", "ያ", "ዋ", "ሁ",
]

# Longest suffixes first, so that e.g. "ዎች" is tried before "ች". Sorted once
# here instead of on every lemmatize_word call.
_amharic_suffixes_longest_first = sorted(amharic_suffixes, key=len, reverse=True)


def lemmatize_word(word):
  """Strip the longest matching suffix from ``amharic_suffixes`` off ``word``.

  At most one suffix is removed. A word that is itself exactly one of the
  suffixes (e.g. "ነው") is returned unchanged, so the result is never an
  empty token.

  Args:
    word: a single token.

  Returns:
    The token without its suffix, or the token itself if no suffix matched.
  """
  for suffix in _amharic_suffixes_longest_first:
      if word.endswith(suffix):
          # `or word`: stripping would leave "", keep the token instead.
          return word[:-len(suffix)] or word
  return word
def amahric_lemmatizer(text):
  """Apply ``lemmatize_word`` to every token in ``text`` and return the
  results as a list, in the same order."""
  return list(map(lemmatize_word, text))

In [None]:
def amaharic_preprocessing(text):
  """Run the Amharic pipeline on ``text``.

  Steps: punctuation removal (``remove_punc``), whitespace tokenization,
  stop-word removal (``remove_stop_words`` with its default file path),
  then stemming and lemmatization of the same filtered token list.

  Returns:
    PreProcessedResult tagged "am" with the input text, the stemmed tokens
    and the lemmatized tokens.
  """
  # No lowercasing step: Amharic script has no upper/lower case.

  # Strip punctuation, then split on whitespace to get word tokens.
  words = remove_punc(text).split()

  # Filter stop words once; both the stemmer and the lemmatizer start from
  # this list.
  content_words = remove_stop_words(words)

  return PreProcessedResult(
    "am",
    text,
    amahric_stemming(content_words),
    amahric_lemmatizer(content_words),
  )

## language detection

In [None]:
!pip install langdetect
from langdetect import detect



In [None]:

def detect_language(text):
  """Guess the language of ``text`` from the script of its letters.

  Characters in the Unicode Ethiopic block (U+1200..U+137F) vote for "am";
  ASCII letters vote for "en"; everything else (digits, spaces, ASCII
  punctuation, other scripts) is ignored. The script with more votes wins,
  so a single leading foreign word (a brand name, a quoted term) no longer
  decides the language of the whole text. On an exact tie the script that
  appears first in the text wins.

  Args:
    text: input string.

  Returns:
    "am", "en", or "unknown" when the text has no Ethiopic or ASCII letters.
  """
  counts = {"am": 0, "en": 0}
  first_seen = None
  for c in text:
    if 0x1200 <= ord(c) <= 0x137F:
      lang = "am"
    elif c.isascii() and c.isalpha():
      lang = "en"
    else:
      continue
    counts[lang] += 1
    if first_seen is None:
      first_seen = lang

  if first_seen is None:
    return "unknown"
  if counts["am"] == counts["en"]:
    return first_seen
  return "am" if counts["am"] > counts["en"] else "en"

def detect_language_library(text):
  """Library-based alternative to ``detect_language``: hand ``text`` to
  langdetect's ``detect`` and return its result unchanged."""
  detected = detect(text)
  return detected

In [None]:
def CombinedPipeline(text):
  """Detect the language of ``text`` and run the matching pipeline.

  Returns:
    The PreProcessedResult of the Amharic or English pipeline. For any other
    detected language, a PreProcessedResult carrying that language tag, the
    input text and two empty token lists.
  """
  detected = detect_language(text)
  if detected == "am":
    return amaharic_preprocessing(text)
  if detected == "en":
    return english_preprocessing(text)
  # No pipeline for this language: return an empty result.
  return PreProcessedResult(detected, text, [], [])

In [None]:
# End-to-end demo of the combined pipeline.
# NOTE: despite its name, `eng_text` holds an Amharic sample here; it also
# rebinds the `eng_text` / `result` globals set by the earlier English demo.
eng_text = '''ከአየር ጤና ወለቴ ያለው መንገድ ግንባታ ላይ እንደሆነ ይታወቃል።
የአዲስ አበባ ከተማ መንገዶች ባለሥልጣን፤ " በመገንባት ላይ የሚገኘውን የመንገድ ፕሮጀክት በፍጥነት ለማጠናቀቅ በከፍተኛ ትኩረት እየሰራው ነው " ብሏል።
'''
result = CombinedPipeline(eng_text)

# Show the detected language, the text, and both token lists.
print(result.lang)
print(result.original_text)
print("stemmed text: ", result.stemmed_text)
print("lemmatized_text: ", result.lemmatized_text)

am
ከአየር ጤና ወለቴ ያለው መንገድ ግንባታ ላይ እንደሆነ ይታወቃል።
የአዲስ አበባ ከተማ መንገዶች ባለሥልጣን፤ " በመገንባት ላይ የሚገኘውን የመንገድ ፕሮጀክት በፍጥነት ለማጠናቀቅ በከፍተኛ ትኩረት እየሰራው ነው " ብሏል።

stemmed text:  ['ከአየር', 'ጤና', 'ወለቴ', 'መንገድ', 'ግንባታ', 'እንደሆነ', 'ይታወቃል', 'የአዲስ', 'አበባ', 'ከተማ', 'መንገዶ', 'ባለሥልጣ', '"', 'በመገንባ', 'የሚገኘው', 'የመንገድ', 'ፕሮጀክ', 'በፍጥነ', 'ለማጠናቀቅ', 'በከፍተኛ', 'ትኩረ', 'እየሰራ', '"', 'ብሏል']
lemmatized_text:  ['ከአየር', 'ጤና', 'ወለቴ', 'መንገድ', 'ግንባታ', 'እንደሆነ', 'ይታወቃል', 'የአዲስ', 'አበባ', 'ከተማ', 'መንገዶ', 'ባለሥልጣ', '"', 'በመገንባ', 'የሚገኘው', 'የመንገድ', 'ፕሮጀክ', 'በፍጥነ', 'ለማጠናቀቅ', 'በከፍተ', 'ትኩረ', 'እየሰራ', '"', 'ብሏል']
