#### Classes

In [19]:
import numpy as np

class Semitic:
  '''
  Predicts the three-letter root (stem) of a word written with the letters in
  `vocabulary`.

  Three pickled classifiers are used, one per root letter. Each receives the
  word as a one-hot matrix with one row per character position and one column
  per vocabulary letter, and the letter with the highest score is returned.
  '''

  # Number of character positions (rows) in the one-hot word matrix.
  MAX_WORD_LEN = 8

  # Default directory that holds the pickled classifiers.
  DEFAULT_MODELS_DIR = '/content/Semitic/trained_models'

  # Attribute name -> (sub-directory, file name) of each pickled classifier.
  MODEL_FILES = {
    'p_model': ('first_char', 'mlp_first_char_2024-03-25 11_48.pkl'),
    'h_model': ('second_char', 'mlp_second_char_2024-03-25 12_04.pkl'),
    'l_model': ('third_char', 'mlp_third_char_2024-03-25 12_18.pkl'),
  }

  # Diacritic marks removed from a word before it is vectorized.
  DIACRITICS = frozenset(['ِ', 'ُ', 'ٓ', 'ٰ', 'ْ', 'ٌ', 'ٍ', 'ً', 'ّ', 'َ'])

  def __init__(self):
    # Classifiers for the first, second and third root letter; they stay None
    # until load_classifiers() is called (or they are assigned directly).
    self.p_model = None
    self.h_model = None
    self.l_model = None
    # Column order of the one-hot encoding; a predicted index maps back to a
    # letter through this list.
    self.vocabulary = ['ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ',
                       'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك',
                       'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي']

  def load_classifiers(self, models_dir: str = None) -> None:
    '''
    Loads the trained models into the system

    models_dir: directory containing the first_char/second_char/third_char
                sub-directories; defaults to DEFAULT_MODELS_DIR.
    Raises whatever open()/pickle.load() raise for a missing or corrupt file;
    in that case none of the three model attributes is modified.
    '''
    import os
    import pickle

    if models_dir is None:
      models_dir = self.DEFAULT_MODELS_DIR

    # Load everything first so a failure cannot leave a half-loaded instance.
    loaded = {}
    for attribute, (subdir, filename) in self.MODEL_FILES.items():
      with open(os.path.join(models_dir, subdir, filename), 'rb') as file:
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only load model files that come from a trusted source.
        loaded[attribute] = pickle.load(file)

    for attribute, model in loaded.items():
      setattr(self, attribute, model)

    print("Loaded models into the system.")


  def get_stem(self, word: str) -> str:
    '''
    Returns the root of the word

    Diacritics are stripped first, then the three predicted root letters are
    concatenated in order.
    '''
    word = Semitic.remove_diacritic(word)
    return self.predict_p(word) + self.predict_h(word) + self.predict_l(word)


  def predict_p(self, word) -> str:
    '''
    Returns the first stem letter of the word
    '''
    return self._predict_letter(self.p_model, word)


  def predict_h(self, word) -> str:
    '''
    Returns the second stem letter of the word
    '''
    return self._predict_letter(self.h_model, word)


  def predict_l(self, word) -> str:
    '''
    Returns the third stem letter of the word
    '''
    return self._predict_letter(self.l_model, word)


  def _predict_letter(self, model, word) -> str:
    '''
    Returns the vocabulary letter that `model` scores highest for `word`

    Raises RuntimeError when `model` is None (classifiers not loaded yet).
    '''
    # The model is given a batch containing one sample: (1, positions, letters).
    matrix = self.string_vectorizer(word, self.vocabulary, self.MAX_WORD_LEN)
    batch = np.stack([matrix])
    if model is None:
      raise RuntimeError("Classifier is not loaded; call load_classifiers() first.")
    scores = model.predict(batch, verbose = 0)
    return self.vocabulary[np.argmax(scores)]


  @staticmethod
  def string_vectorizer(string, vocabulary, max_len) -> np.ndarray:
    '''
    Returns a matrix representation of the string

    Row i has a single 1 in the column of the i-th character; rows past the
    end of the string stay all-zero.
    Raises IndexError when the string has more than max_len characters and
    ValueError when a character is not in the vocabulary.
    '''
    matrix = Semitic.empty_matrix(max_len, vocabulary)
    for position, letter in enumerate(string):
      matrix[position, vocabulary.index(letter)] = 1
    return matrix


  @staticmethod
  def empty_matrix(max_len: int, vocabulary: list) -> np.ndarray:
    '''
    Returns an all-zero integer matrix of shape (max_len, len(vocabulary))
    '''
    return np.zeros((max_len, len(vocabulary)), dtype=int)


  @staticmethod
  def remove_diacritic(text: str) -> str:
    '''
    Returns the text without the characters listed in DIACRITICS
    '''
    return ''.join(char for char in text if char not in Semitic.DIACRITICS)


In [20]:
class Test_Semitic:
  @staticmethod
  def get_stem_df(x):
    try:
      print("checked")
      return semitic.get_stem(x)
    except:
      print("error")
      return "error"

  @staticmethod
  def test_full_equality(row):
    return 1 if row['predicted_root'] == row['root'] else 0

  @staticmethod
  def test_first(row):
    return 1 if row['first_root_letter'] == row['first_pred_letter'] else 0

  @staticmethod
  def test_second(row):
    return 1 if row['second_root_letter'] == row['second_pred_letter'] else 0

  @staticmethod
  def test_third(row):
    return 1 if row['third_root_letter'] == row['third_pred_letter'] else 0


  @staticmethod
  def at_least_one_not_in_word(row):
    '''
    Returns 1 if the word has at least one letter that is not in the word
    '''
    root = row['root']
    word = row['word']
    sum = 0
    for i in range(len(root)):
      if root[i] in word:
        sum += 1
    return 0 if sum == 3 else 1


#### Test

In [2]:
!git clone https://github.com/delmedigo88/Semitic.git

fatal: destination path 'Semitic' already exists and is not an empty directory.


In [3]:
# Create the stemmer and load its three pickled letter classifiers; the
# helper functions in the test cells read this global `semitic` instance.
semitic  = Semitic()
semitic.load_classifiers()

Loaded models into the system.


In [4]:
# Smoke check: predict the root of a single word with the loaded models.
semitic.get_stem('مكتوب')

'كتب'

In [5]:
# Load the word/root test table from the cloned repository.
# The following cells read its `word` and `root` columns.
import pandas as pd
test_df = pd.read_excel('Semitic/data/test_data/word-root-table_side.xlsx')


In [None]:
# Evaluation sample: words whose reference root has exactly three letters.
SAMPLE_SIZE = 10000   # upper bound on the number of evaluated words
RANDOM_SEED = 42      # fixed seed so a re-run scores the same words

three_letter_df = test_df[test_df.root.str.len() == 3]
# Cap the sample at the population size; sample() raises when asked for more
# rows than exist.
test_df = three_letter_df.sample(min(SAMPLE_SIZE, len(three_letter_df)), random_state=RANDOM_SEED).copy()
count = 0  # not read anywhere in the visible cells; kept in case another cell uses it

def get_stem_df(x):
  '''
  Returns the predicted root of word x, or "error" when the prediction raises
  '''
  try:
    print("checked")
    return semitic.get_stem(x)
  # Exception (not a bare except) so a kernel interrupt still stops the run.
  except Exception:
    print("error")
    return "error"

test_df['predicted_root'] = test_df['word'].apply(get_stem_df)

def test(row):
  '''
  Returns 1 if the predicted root equals the reference root, else 0
  '''
  return 1 if row['predicted_root'] == row['root'] else 0

test_df['check'] =  test_df.apply(test, axis=1)
# Exact-match accuracy over the sample.
test_df.check.mean()

In [42]:
df_metrics = test_df[['word','root','predicted_root','check']].copy()

# Split the reference and the predicted root into per-position letter columns
# (first_root_letter, first_pred_letter, second_root_letter, ...).
for position, ordinal in enumerate(['first', 'second', 'third']):
  df_metrics[f'{ordinal}_root_letter'] = df_metrics['root'].str[position]
  df_metrics[f'{ordinal}_pred_letter'] = df_metrics['predicted_root'].str[position]

# Per-position hit flags: 1 when the predicted letter equals the root letter.
df_metrics['t1'] = df_metrics.apply(Test_Semitic.test_first, axis=1)
df_metrics['t2'] = df_metrics.apply(Test_Semitic.test_second, axis=1)
df_metrics['t3'] = df_metrics.apply(Test_Semitic.test_third, axis=1)

# Pairwise accuracy: share of the three root letters predicted correctly;
# `correct` is the same quantity as a count (0..3).
df_metrics['pairwise_acc'] = (df_metrics['t1'] + df_metrics['t2'] + df_metrics['t3']) / 3
df_metrics['correct'] = df_metrics['t1'] + df_metrics['t2'] + df_metrics['t3']

# 1 when some root letter does not occur in the word, 0 when all of them do.
df_metrics['at_least_one'] = df_metrics.apply(Test_Semitic.at_least_one_not_in_word, axis=1)

# Exact-match accuracy for each at_least_one group.
df_metrics.groupby('at_least_one').check.mean()



at_least_one
0    0.977234
1    0.903415
Name: check, dtype: float64

In [35]:
# Inspect the per-word metrics table (pandas truncates the rendered rows).
df_metrics

Unnamed: 0,word,root,predicted_root,first_root_letter,first_pred_letter,second_root_letter,second_pred_letter,third_root_letter,third_pred_letter,t1,t2,t3,pairwise_acc,correct,at_least_one
7203,افنع,فنع,فنع,ف,ف,ن,ن,ع,ع,1,1,1,1.0,3,0
14259,تهاربنا,هرب,هرب,ه,ه,ر,ر,ب,ب,1,1,1,1.0,3,0
4840,يستأو,ءوي,ءوي,ء,ء,و,و,ي,ي,1,1,1,1.0,3,1
25116,متدفأة,دفء,دفء,د,د,ف,ف,ء,ء,1,1,1,1.0,3,1
41124,ألخصتم,لخص,لخص,ل,ل,خ,خ,ص,ص,1,1,1,1.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31569,جايتوا,جيت,جيت,ج,ج,ي,ي,ت,ت,1,1,1,1.0,3,0
12585,يحتلجن,حلج,حلج,ح,ح,ل,ل,ج,ج,1,1,1,1.0,3,0
39842,مماجنتان,مجن,مجن,م,م,ج,ج,ن,ن,1,1,1,1.0,3,0
42843,مترصعان,رصع,رصع,ر,ر,ص,ص,ع,ع,1,1,1,1.0,3,0


In [65]:
# Generate test report:
# Builds a plain-text summary of the evaluation, saves it together with the
# per-word metrics table, and offers both files for download when run in Colab.

from datetime import datetime

now = datetime.now()

current_date = now.strftime("%Y-%m-%d")
current_hour = now.hour
current_minute = now.minute

# Number of words per count of correctly predicted root letters. Reindexing
# makes an empty bucket read as 0 instead of raising KeyError.
correct_counts = df_metrics.correct.value_counts().reindex([0, 1, 2, 3], fill_value=0)

# Exact-match accuracy per at_least_one group; a missing group becomes NaN
# instead of raising KeyError.
accuracy_by_group = df_metrics.groupby('at_least_one').check.mean().reindex([0, 1])

# Number of words in which at least one root letter does not occur.
partial_root_words = (df_metrics['at_least_one'] == 1).sum()

test_summary = f"Report Time: {current_date} {current_hour:02d}:{current_minute:02d}\n"
test_summary += f"Number of words: {len(test_df)}\n"
test_summary += f"Number of correct words: {test_df.check.sum()}\n"
test_summary += f"Number of incorrect words: {test_df.check.count() - test_df.check.sum()}\n"
test_summary += f"Accuracy: {test_df.check.mean():.2%}\n"
test_summary += f"Pairwise Accuracy: {df_metrics['pairwise_acc'].mean():.2%}\n"
# These lines are word counts, ordered from 0 wrong letters (3 correct) to
# 3 wrong letters (0 correct).
for wrong_letters in range(4):
    test_summary += f"Words with {wrong_letters} wrong letters: {correct_counts.loc[3 - wrong_letters]}\n"
test_summary += f"Only some root letteres in word percent: {partial_root_words / len(test_df):.2%}\n"
test_summary += f"Only some root letteres in word accuracy: {accuracy_by_group.loc[1]:.2%}\n"
test_summary += f"All root letters in word accuracy: {accuracy_by_group.loc[0]:.2%}\n"
print(test_summary)

# One shared file stem for both outputs. The time uses "_" (as the model file
# names do) because ":" is not a valid file-name character on every platform.
report_stem = f'test_summary{current_date} {current_hour:02d}_{current_minute:02d}'
report_txt_path = f'{report_stem}.txt'
report_xlsx_path = f'{report_stem}.xlsx'

with open(report_txt_path, 'w') as f:
    f.write(test_summary)

df_metrics.to_excel(report_xlsx_path)

# download files

try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the files stay in the working directory.
    print("google.colab is not available; report files were saved locally.")
else:
    files.download(report_xlsx_path)
    files.download(report_txt_path)

Report Time: 2024-03-25 15:58
Number of words: 10000
Number of correct words: 9634
Number of incorrect words: 366
Accuracy: 96.34%
Pairwise Accuracy: 98.44%
0 letters accuracy: 9634
1 letters accuracy: 284
2 letters accuracy: 62
3 letters accuracy: 20
Only some root letteres in word percent: 18.74%
Only some root letteres in word accuracy: 90.34%
All root letters in word accuracy: 97.72%



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>