**Load the data**

In [1]:
from Data.HMM import load_data, load_pickle

train_data = load_data('train.txt')
val_data = load_data('val.txt')
test_data = load_data('test.txt')

arabic_letters = load_pickle('constants/ARABIC_LETTERS_LIST.pickle')
classes_list = load_pickle('constants/CLASSES_LIST.pickle')
diacritics_list = load_pickle('constants/DIACRITICS_LIST.pickle')

**Clean th data**

In [2]:
from Data.HMM import clean_text

In [3]:
train_data=clean_text(train_data,arabic_letters,diacritics_list)

In [4]:
test_data=clean_text(test_data,arabic_letters,diacritics_list)

**Prepare for training**

In [5]:
from Data.HMM import split_train
train_data=split_train(train_data,arabic_letters)

In [6]:
train_data[1]

[('ق', 'َ'),
 ('ا', ''),
 ('ل', 'َ'),
 (' ', ''),
 ('أ', 'َ'),
 ('ب', 'ُ'),
 ('و', ''),
 (' ', ''),
 ('ز', 'َ'),
 ('ي', 'ْ'),
 ('د', 'ٍ'),
 (' ', ''),
 ('أ', 'َ'),
 ('ه', 'ْ'),
 ('ل', 'ُ'),
 (' ', ''),
 ('ت', 'ِ'),
 ('ه', 'َ'),
 ('ا', ''),
 ('م', 'َ'),
 ('ة', 'َ'),
 (' ', ''),
 ('ي', 'ُ'),
 ('ؤ', 'َ'),
 ('ن', 'ِّ'),
 ('ث', 'ُ'),
 ('و', ''),
 ('ن', 'َ'),
 (' ', ''),
 ('ا', ''),
 ('ل', 'ْ'),
 ('ع', 'َ'),
 ('ض', 'ُ'),
 ('د', 'َ'),
 (' ', ''),
 ('و', 'َ'),
 ('ب', 'َ'),
 ('ن', 'ُ'),
 ('و', ''),
 (' ', ''),
 ('ت', 'َ'),
 ('م', 'ِ'),
 ('ي', ''),
 ('م', 'ٍ'),
 (' ', ''),
 ('ي', 'ُ'),
 ('ذ', 'َ'),
 ('ك', 'ِّ'),
 ('ر', 'ُ'),
 ('و', ''),
 ('ن', 'َ'),
 (' ', ''),
 (' ', ''),
 ('و', 'َ'),
 ('ا', ''),
 ('ل', 'ْ'),
 ('ج', 'َ'),
 ('م', 'ْ'),
 ('ع', 'ُ'),
 (' ', ''),
 ('أ', 'َ'),
 ('ع', 'ْ'),
 ('ض', 'ُ'),
 ('د', 'ٌ'),
 (' ', ''),
 ('و', 'َ'),
 ('أ', 'َ'),
 ('ع', 'ْ'),
 ('ض', 'َ'),
 ('ا', ''),
 ('د', 'ٌ'),
 (' ', ''),
 ('م', 'ِ'),
 ('ث', 'ْ'),
 ('ل', 'ُ'),
 (' ', ''),
 ('أ', 'َ'),
 ('ف', 'ْ'),
 ('ل', 'ُ

**Train the model**

In [7]:
from Train.HMM import DiacriticHMM
from os import getcwd
hmm = DiacriticHMM()
hmm.train(train_data)

model_path = f"{getcwd()}/models/arabic_diacritization_hmm.pkl"
hmm.save_model(model_path)

Training HMM model...
Training completed. States: 43, Observations: 37
Model saved to: /home/assioui/Mushakkil/models/arabic_diacritization_hmm.pkl


# **Inference**

**Prepare for testing**

In [8]:
test_data

[' قَوْلُهُ  وَلَوْ ادَّعَى وَلَدَ أَمَةٍ مُشْتَرَكَةٍ ثَبَتَ نَسَبُهُ  وَهِيَ أُمُّ وَلَدِهِ  وَلَزِمَهُ نِصْفُ قِيمَتِهَا وَنِصْفُ عُقْرِهَا لَا قِيمَتُهُ  أَمَّا ثُبُوتُ النَّسَبِ فَلِأَنَّهُ لَمَّا ثَبَتَ فِي نِصْفِهِ لِمُصَادَفَتِهِ مِلْكَهُ ثَبَتَ فِي الْبَاقِي ضَرُورَةَ أَنَّهُ لَا يَتَجَزَّأُ لِمَا أَنَّ سَبَبَهُ لَا يَتَجَزَّأُ وَهُوَ الْعُلُوقُ إذْ الْوَلَدُ الْوَاحِدُ لَا يَعْلَقُ مِنْ مَاءَيْنِ  وَأَمَّا صَيْرُورَتُهَا أُمَّ وَلَدٍ فَلِأَنَّ الِاسْتِيلَادَ لَا يَتَجَزَّأُ عِنْدَهُ وَعِنْدَهُمَا يَصِيرُ نَصِيبُهُ أُمَّ وَلَدٍ لَهُ  ثُمَّ يَتَمَلَّكُ نَصِيبَ صَاحِبِهِ إذْ هُوَ قَابِلٌ لِلْمِلْكِ  وَأَمَّا ضَمَانُ نِصْفِ الْقِيمَةِ فَلِأَنَّهُ تَمَلَّكَ نَصِيبَ صَاحِبِهِ لَمَّا اسْتَكْمَلَ الِاسْتِيلَادَ  وَأَمَّا ضَمَانُ نِصْفِ الْعُقْرِ فَلِأَنَّهُ وَطِئَ جَارِيَةً مُشْتَرَكَةً إذْ الْمِلْكُ ثَبَتَ حُكْمًا لِلِاسْتِيلَادِ فَيَعْقُبُهُ الْمِلْكُ فِي نَصِيبِ صَاحِبِهِ بِخِلَافِ الْأَبِ  إذَا اسْتَوْلَدَ جَارِيَةَ ابْنِهِ  لِأَنَّ الْمِلْكَ هُنَاكَ ثَبَتَ شَرْطًا لِلِاسْتِيلَاد

In [15]:
from Data.HMM import extract_observation
test_observations=extract_observation(test_data,arabic_letters)
test_data=split_train(test_data,arabic_letters)

In [16]:
test_data[2]

[('و', 'َ'),
 ('ا', ''),
 ('ل', 'ْ'),
 ('ه', 'َ'),
 ('ا', ''),
 ('و', 'َ'),
 ('ن', 'ُ'),
 (' ', ''),
 ('م', 'ِ'),
 ('ث', 'َ'),
 ('ا', ''),
 ('ل', 'ٌ'),
 (' ', ''),
 (' ', ''),
 ('ف', 'َ'),
 ('م', 'ِ'),
 ('ث', 'ْ'),
 ('ل', 'ُ'),
 ('ه', 'ُ'),
 (' ', ''),
 ('ك', 'ُ'),
 ('ل', 'ُّ'),
 (' ', ''),
 ('م', 'َ'),
 ('ا', ''),
 (' ', ''),
 ('ي', 'َ'),
 ('ت', 'َ'),
 ('ع', 'َ'),
 ('ذ', 'َّ'),
 ('ر', 'ُ'),
 (' ', ''),
 ('ك', 'َ'),
 ('س', 'ْ'),
 ('ر', 'ُ'),
 ('ه', 'ُ'),
 (' ', ''),
 ('ع', 'َ'),
 ('ل', 'َ'),
 ('ى', ''),
 (' ', ''),
 ('ر', 'َ'),
 ('أ', 'ْ'),
 ('س', 'ِ'),
 ('ه', 'َ'),
 ('ا', ''),
 (' ', '')]

In [17]:
test_observations[2]

['و',
 'ا',
 'ل',
 'ه',
 'ا',
 'و',
 'ن',
 ' ',
 'م',
 'ث',
 'ا',
 'ل',
 ' ',
 ' ',
 'ف',
 'م',
 'ث',
 'ل',
 'ه',
 ' ',
 'ك',
 'ل',
 ' ',
 'م',
 'ا',
 ' ',
 'ي',
 'ت',
 'ع',
 'ذ',
 'ر',
 ' ',
 'ك',
 'س',
 'ر',
 'ه',
 ' ',
 'ع',
 'ل',
 'ى',
 ' ',
 'ر',
 'أ',
 'س',
 'ه',
 'ا',
 ' ']

In [18]:
test_data

[[(' ', ''),
  ('ق', 'َ'),
  ('و', 'ْ'),
  ('ل', 'ُ'),
  ('ه', 'ُ'),
  (' ', ''),
  (' ', ''),
  ('و', 'َ'),
  ('ل', 'َ'),
  ('و', 'ْ'),
  (' ', ''),
  ('ا', ''),
  ('د', 'َّ'),
  ('ع', 'َ'),
  ('ى', ''),
  (' ', ''),
  ('و', 'َ'),
  ('ل', 'َ'),
  ('د', 'َ'),
  (' ', ''),
  ('أ', 'َ'),
  ('م', 'َ'),
  ('ة', 'ٍ'),
  (' ', ''),
  ('م', 'ُ'),
  ('ش', 'ْ'),
  ('ت', 'َ'),
  ('ر', 'َ'),
  ('ك', 'َ'),
  ('ة', 'ٍ'),
  (' ', ''),
  ('ث', 'َ'),
  ('ب', 'َ'),
  ('ت', 'َ'),
  (' ', ''),
  ('ن', 'َ'),
  ('س', 'َ'),
  ('ب', 'ُ'),
  ('ه', 'ُ'),
  (' ', ''),
  (' ', ''),
  ('و', 'َ'),
  ('ه', 'ِ'),
  ('ي', 'َ'),
  (' ', ''),
  ('أ', 'ُ'),
  ('م', 'ُّ'),
  (' ', ''),
  ('و', 'َ'),
  ('ل', 'َ'),
  ('د', 'ِ'),
  ('ه', 'ِ'),
  (' ', ''),
  (' ', ''),
  ('و', 'َ'),
  ('ل', 'َ'),
  ('ز', 'ِ'),
  ('م', 'َ'),
  ('ه', 'ُ'),
  (' ', ''),
  ('ن', 'ِ'),
  ('ص', 'ْ'),
  ('ف', 'ُ'),
  (' ', ''),
  ('ق', 'ِ'),
  ('ي', ''),
  ('م', 'َ'),
  ('ت', 'ِ'),
  ('ه', 'َ'),
  ('ا', ''),
  (' ', ''),
  ('و', 'َ'),
  ('ن', 'ِ')

In [19]:
hmm_from_file = DiacriticHMM.load_from_file(model_path)

Model loaded from: /home/assioui/Mushakkil/models/arabic_diacritization_hmm.pkl
Model info - States: 43, Observations: 37


In [20]:
predictions = hmm_from_file.predict(test_observations)

In [22]:
print(predictions[1])

[('ق', 'َ'), ('و', 'َ'), ('ل', 'ْ'), ('ه', 'ُ'), (' ', ''), (' ', ''), (' ', ''), ('و', 'َ'), ('ب', 'َ'), ('ح', 'ْ'), ('ث', 'ُ'), (' ', ''), ('ا', ''), ('ل', 'َ'), ('ر', 'َ'), ('ا', ''), ('ف', 'َ'), ('ع', 'َ'), ('ي', 'ْ'), (' ', ''), ('ص', 'َ'), ('ح', 'ْ'), ('ت', 'َ'), ('ه', 'ُ'), ('ا', ''), (' ', ''), (' ', ''), ('و', 'َ'), ('إ', ''), ('ن', 'َ'), (' ', ''), ('ق', 'َ'), ('ص', 'ْ'), ('د', 'ِ'), (' ', ''), ('ت', 'َ'), ('م', 'ْ'), ('ل', 'َ'), ('ي', 'ْ'), ('ك', 'َ'), (' ', ''), ('ا', ''), ('ل', 'َ'), ('م', 'َ'), ('س', 'ْ'), ('ج', 'َ'), ('د', 'َ'), (' ', ''), ('و', 'َ'), ('ه', 'ُ'), ('و', 'َ'), (' ', ''), ('ا', ''), ('ل', 'َ'), ('م', 'ْ'), ('ع', 'َ'), ('ت', 'َ'), ('م', 'ْ'), ('د', 'ِ'), (' ', ''), (' ', ''), ('و', 'َ'), ('ع', 'َ'), ('ل', 'ْ'), ('م', 'ُ'), (' ', ''), ('م', 'َ'), ('ن', 'ْ'), (' ', ''), ('ت', 'َ'), ('ع', 'َ'), ('ل', 'ْ'), ('ي', 'َ'), ('ل', 'ْ'), ('ه', 'ُ'), (' ', ''), ('ب', 'ِ'), ('أ', 'َ'), ('ن', 'ْ'), (' ', ''), (' ', ''), (' ', ''), (' ', '')]


In [21]:
der = hmm_from_file.calculate_diacritic_error_rate(predictions, test_data)
print(f"\nDiacritic Error Rate: {der:.4f} ({der*100:.2f}%)")


Diacritic Error Rate: 0.3975 (39.75%)
