In [1]:
# Folder the raw dataset text files are read from
DATASET_FOLDER = 'dataset'
# Folder the cleaned outputs are written to
CLEAN_OUT_FOLDER = 'clean_out'
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Autoreload
%load_ext autoreload


In [2]:
# Read a dataset file (train.txt by default)
def read_dataset(file_name='train.txt', verbose=False):
    """
    Load one dataset file from DATASET_FOLDER.

    The file is parsed by pandas as tab-separated text without a header row and
    the resulting frame is squeezed along the column axis, so a file with a
    single column comes back as a pandas Series (one entry per parsed row).

    file_name: name of the file inside DATASET_FOLDER
    verbose: when True, print the shape and the first 5 examples
    return: the squeezed pandas object
    """
    file_path = os.path.join(DATASET_FOLDER, file_name)
    examples = pd.read_csv(file_path, sep='\t', header=None).squeeze('columns')

    if not verbose:
        return examples

    print('dataset shape: ', examples.shape)
    print('first 5 examples: ')
    print(examples[:5])
    return examples


# Sanity check: load the default file and print its shape and first examples
# (the trailing semicolon suppresses the notebook's display of the return value)
read_dataset(verbose=True);

dataset shape:  (55000,)
first 5 examples: 
0    قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَ...
1    ابْنُ عَرَفَةَ : قَوْلُهُ : بِلَفْظٍ يَقْتَضِي...
2    ( قَوْلُهُ لِعَدَمِ مَا تَتَعَلَّقُ إلَخْ ) أَ...
3                       وَحَيَوَانٌ غَيْرُ مَوْجُودٍ .
4    فَائِدَةٌ : قَالَ بَعْضُهُمْ : يُؤْخَذُ مِنْ ش...
Name: 0, dtype: object


In [3]:
# Each diacritic (shadda included) is its own character in the string, so
# len() counts letters and diacritics alike
text = 'تَغْتَرَّ'
print(len(text))

# Print every character next to its code point
for ch in text:
    print(ch, ord(ch))

print('--------------------')

# A word with repeated diacritics typed after one letter
print("يّيلعبُُُ")


9
ت 1578
َ 1614
غ 1594
ْ 1618
ت 1578
َ 1614
ر 1585
ّ 1617
َ 1614
--------------------
يّيلعبُُُ


In [4]:
# Collect every character of the training data that is not an Arabic letter
# (tashkeel, digits, punctuation, whitespace, ...)
from constants import ARABIC_LETTERS

All_NON_Letters = set()
for text in read_dataset():
    All_NON_Letters.update(c for c in text if c not in ARABIC_LETTERS)

print(All_NON_Letters)

{'2', '«', '؟', 'ُ', 'ٌ', '~', '(', '3', '-', '»', 'ْ', '9', 'ٍ', '؛', ':', '1', '6', '!', '4', ';', '0', '[', '7', '*', '–', '{', '5', '\u200f', '`', 'ً', ']', '8', '}', '.', ',', ')', '"', 'َ', ' ', '/', 'ّ', '،', "'", 'ِ'}


In [5]:
from regex import P
from constants.arabic import HARAKAT
from constants import PUNCTUATIONS
from constants.arabic import ARABIC_NUMBERS

import re
import re
from constants.arabic import ARABIC_NUMBERS
from constants import PUNCTUATIONS

# Pre-compiled patterns used by the text cleaning step.
# The character lists are passed through re.escape so that entries such as
# ']', '\\', '^' or '-' are matched literally instead of being interpreted as
# character-class syntax.
number_regex = re.compile(r'[0-9]')
arabic_number_regex = re.compile('[' + re.escape("".join(ARABIC_NUMBERS)) + ']')
punctuation_regex = re.compile('[' + re.escape("".join(PUNCTUATIONS)) + ']')
spaces_regex = re.compile(r'\s+')


def clean_text(text):
    """
    Strip digits from `text` and normalise its whitespace.

    ASCII digits and the characters matched by `arabic_number_regex` are
    removed, every run of whitespace becomes a single space, and leading and
    trailing whitespace is dropped. Punctuation is left untouched.
    """
    for digit_pattern in (number_regex, arabic_number_regex):
        text = digit_pattern.sub('', text)
    return spaces_regex.sub(' ', text).strip()


# Smoke-test clean_text on a hand-written string mixing digits, punctuation,
# diacritics and letters
text = '   0110ز؟:.+_@#$.زتَغْتَرَّ2//،،312)()( جداى آيئءؤرلاىةوزظلآج)'
print(clean_text(text))

dataset = read_dataset()

# Show the first training example before and after cleaning
for t in dataset[:1]:
    print(t, clean_text(t), '---------------------', sep='\n')

# Run the cleaner over all collected non-letter characters: the digits are
# removed, while tashkeel and punctuation are kept
non_letters = ' '.join(All_NON_Letters)
print(non_letters)
cleaned = clean_text(non_letters)
print(cleaned)

ز؟:.+_@#$.زتَغْتَرَّ//،،)()( جداى آيئءؤرلاىةوزظلآج)
قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ( 14 / 123 )
قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ( / )
---------------------
2 « ؟ ُ ٌ ~ ( 3 - » ْ 9 ٍ ؛ : 1 6 ! 4 ; 0 [ 7 * – { 5 ‏ ` ً ] 8 } . , ) " َ   / ّ ، ' ِ
« ؟ ُ ٌ ~ ( - » ْ ٍ ؛ : ! ; [ * – { ‏ ` ً ] } . , ) " َ / ّ ، ' ِ


In [6]:
# Create X and Y for the model
# X is the input and Y is the output (the target)
# X is a list of character lists, one per text segment of the dataset
# Y is a list of diacritic lists aligned with X (the target)
# If a character has no diacritic, its label is set to $ (empty diacritic)
from constants.arabic import HARAKAT, SHADDA, ARABIC_LETTERS
from train_collections import harakat2id
from tqdm import tqdm
import pandas as pd

# NOTE(review): `train` is not referenced again in this cell; the same file is
# loaded a second time as `train_set` further down.
train = read_dataset()


def xy_dataset(dataset, is_test=False):
    """
    Split every example into punctuation-delimited segments and separate each
    segment into base characters and their diacritics.

    dataset: iterable of strings (e.g. the pandas Series from read_dataset)
    is_test: when False, segments shorter than 4 characters are dropped;
             when True only empty segments are dropped
    return: (X_words, X, Y) where, for every kept segment,
            X_words holds the whitespace-separated words of the base characters,
            X holds the list of base characters and
            Y holds the diacritic of each character ('$' when it has none)
    """
    # Built once up front: the delimiter set does not change between lines
    delimiters = [",", "|", ";", "؛", ".", ":", "(", ")", "<", ">", "[", "]", "{", "}"]
    splitter = re.compile('[' + ''.join(map(re.escape, delimiters)) + ']')

    X = []
    X_words = []
    Y = []

    for line in tqdm(dataset):
        for segment in splitter.split(clean_text(line)):

            if len(segment) < 1 or (len(segment) < 4 and not is_test):
                continue

            line_x = []
            line_y = []
            last = len(segment) - 1
            i = 0
            while i < len(segment):
                # NOTE: whatever sits at position i is taken as a base character,
                # so a diacritic that does not directly follow a base character
                # (or a shadda pair) ends up in line_x as well
                line_x.append(segment[i])

                # No following character, or the following character is not a
                # tashkeel: this character gets the empty label
                if i == last or segment[i + 1] not in HARAKAT:
                    line_y.append('$')
                    i += 1
                    continue

                i += 1
                tashkeel = segment[i]
                # A shadda may be followed by a second diacritic; the two are
                # kept together as a single label
                if tashkeel == SHADDA and i < last and segment[i + 1] in HARAKAT:
                    i += 1
                    tashkeel += segment[i]

                line_y.append(tashkeel)
                i += 1

            # Words are rebuilt from the base characters only
            X_words.append(''.join(line_x).split())
            X.append(line_x)
            Y.append(line_y)

    return X_words, X, Y


# Build the character / diacritic sequences for the training file
train_set = read_dataset()
X_words, X, Y = xy_dataset(train_set)

# Every character sequence must line up one-to-one with its label sequence
assert all(len(x) == len(y) for x, y in zip(X, Y))


def convert_to_gold_standard_format(X, Y, name='train'):
    """
    Write the (letter, label id) pairs of a dataset as a gold-standard csv.

    Only characters found in ARABIC_LETTERS are exported; the '$' placeholder
    is looked up in harakat2id as the empty string.

    X: list of lists of characters
    Y: list of lists of diacritics, aligned with X
    name: prefix of the output file, written as <CLEAN_OUT_FOLDER>/<name>_gold.csv
    The csv has the columns ID,letter,label where ID is the running index of
    the exported letter.
    raises KeyError: if a diacritic is not a key of harakat2id
    """
    letters = []
    labels = []
    for sent, tags in zip(X, Y):
        for c, t in zip(sent, tags):
            if c not in ARABIC_LETTERS:
                continue
            letters.append(c)
            labels.append(harakat2id['' if t == '$' else t])

    pfile = pd.DataFrame({
        'ID': list(range(len(letters))),
        'letter': letters,
        'label': labels,
    })

    # Use the shared output-folder constant and make sure the folder exists, so
    # the function also works when nothing has created it yet
    os.makedirs(CLEAN_OUT_FOLDER, exist_ok=True)
    pfile.to_csv(os.path.join(CLEAN_OUT_FOLDER, f"{name}_gold.csv"), index=False)


# Save the outputs into 3 files in CLEAN_OUT_FOLDER

def _write_joined_lines(path, rows, separator):
    """Write every row of `rows` to `path` as one line, its items joined by `separator`."""
    with open(path, 'w', encoding="utf8") as f:
        f.writelines(separator.join(row) + '\n' for row in rows)


def save_dataset(X, Y, X_words, x_file='X.csv', y_file='Y.csv', x_words_file='X_words.txt'):
    """
    Save X, Y and X_words into 3 files inside CLEAN_OUT_FOLDER.

    X: list of lists of characters, written one list per line
    Y: list of lists of diacritics, written one list per line
    X_words: list of lists of words, written one list per line
    x_file, y_file, x_words_file: file names inside CLEAN_OUT_FOLDER

    The items of X and Y are joined with the literal letter 's'; the words of
    X_words are joined with a single space.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(CLEAN_OUT_FOLDER, exist_ok=True)

    _write_joined_lines(os.path.join(CLEAN_OUT_FOLDER, x_file), X, 's')
    _write_joined_lines(os.path.join(CLEAN_OUT_FOLDER, y_file), Y, 's')
    _write_joined_lines(os.path.join(CLEAN_OUT_FOLDER, x_words_file), X_words, ' ')


# Persist the training sequences under the default file names, then export
# the matching gold-standard csv
save_dataset(X, Y, X_words)

convert_to_gold_standard_format(X, Y)




100%|██████████| 55000/55000 [00:13<00:00, 4086.31it/s]


In [7]:
# Show the characters of one segment next to their diacritic labels
print(len(X[544]), len(Y[544]), sep='\n')
for pair in zip(X[544], Y[544]):
    print(*pair)

194
194
  $
ب َ
ي َّ
ن َ
  $
ف ِ
ي $
  $
ش َ
ر ْ
ح ِ
  $
ا $
ل $
ر َّ
و ْ
ض ِ
  $
أ َ
ن َّ
  $
إ $
ج ْ
ز َ
ا $
ء َ
  $
ذ َ
ب ْ
ح ِ
ه ِ
  $
ف ِ
ي $
  $
س َ
ن َ
ة ِ
  $
ا $
ل ْ
ق َ
ض َ
ا $
ء ِ
  $
ب َ
ع ْ
د َ
  $
د ُ
خ ُ
و $
ل ِ
  $
و َ
ق ْ
ت ِ
ه ِ
  $
و َ
ق َ
ب ْ
ل َ
  $
ا $
ل ْ
إ ِ
ح ْ
ر َ
ا $
م ِ
  $
ب ِ
ه ِ
  $
ه ُ
و َ
  $
م َ
ا $
  $
د َ
ل َّ
  $
ع َ
ل َ
ي ْ
ه ِ
  $
ك َ
ل َ
ا $
م ُ
  $
أ َ
ص ْ
ل ِ
ه ِ
  $
ت َ
ب َ
ع ً
ا $
  $
ل ِ
ل ْ
ع ِ
ر َ
ا $
ق ِ
ي ِّ
ي $
ن َ
  $
، $
  $
و َ
أ َ
ن َّ
  $
م َ
ا $
  $
و َ
ق َ
ع َ
  $
ف ِ
ي $
  $
ا $
ل $
ر َّ
و ْ
ض ِ
  $
م ِ
م َّ
ا $
  $
ي ُ
خ َ
ا $
ل ِ
ف ُ
  $
ذ َ
ل ِ
ك َ
  $
م ِ
ن ْ
  $
ت َ
ص َ
ر ُّ
ف ِ
ه ِ
  $
ق َ
ا $
ل َ
  $
ه َ
ك َ
ذ َ
ا $
  $
أ َ
ف ْ
ه َ
م ُ
  $
و َ
ل َ
ا $
  $
ت َ
غ ْ
ت َ
ر َّ
  $
ب ِ
م َ
ا $
  $
ي ُ
خ َ
ا $
ل ِ
ف ُ
ه ُ
  $


In [8]:
# Process test_with_diacritics.txt: build the sequences and export them under
# the test_with_diacritics names.
# NOTE(review): the variable is called val_dataset although it holds
# test_with_diacritics.txt.
val_dataset = read_dataset('test_with_diacritics.txt', verbose=True)
X_words, X, Y = xy_dataset(val_dataset)
convert_to_gold_standard_format(X, Y, name='test_with_diacritics')
save_dataset(
    X,
    Y,
    X_words,
    x_file='X_test_with_diacritics.csv',
    y_file='Y_test_with_diacritics.csv',
    x_words_file='X_words_test_with_diacritics.txt',
)


dataset shape:  (2494,)
first 5 examples: 
0    لَيْسَ لِلْوَكِيلِ بِالْقَبْضِ أَنْ يُبَرِّأَ ...
1    ( قَوْلُهُ وَيَقَعُ فِي بَعْضِ النُّسَخِ بِمَن...
2    وَمَا ثَبَتَ بِظَنِّيٍّ سَاقِطٍ مِنْ قِسْمِ ال...
3    26093 - حَدَّثَنَا عُثْمَانُ بْنُ عُمَرَ قَالَ...
4    لِأَنَّ الْحَمْلَ فِيهَا لَا يُفِيدُ أُمَيَّةَ...
Name: 0, dtype: object


100%|██████████| 2494/2494 [00:00<00:00, 4108.51it/s]


In [9]:
# Process val.txt: build the sequences and export them under the val names
val_dataset = read_dataset('val.txt', verbose=True)
X_words, X, Y = xy_dataset(val_dataset)
convert_to_gold_standard_format(X, Y, name='val')
save_dataset(
    X,
    Y,
    X_words,
    x_file='X_val.csv',
    y_file='Y_val.csv',
    x_words_file='X_words_val.txt',
)


dataset shape:  (2500,)
first 5 examples: 
0       ( 27 ) قَوْلُهُ : وَلَا تُكْرَهُ ضِيَافَتُهُ .
1    ( الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ ...
2    ( قَوْلُهُ : وَهُوَ ) أَيْ : الْبَيْعُ بِالْمَ...
3    وَالْعَفْوُ قَبْلَ الْإِمَامِ ، أَوْ بَعْدَهُ ...
4    ( قَوْلُهُ : وَرِبْحُهُ ) أَيْ الْقِرَاضِ وَقَ...
Name: 0, dtype: object


100%|██████████| 2500/2500 [00:00<00:00, 4721.53it/s]


In [10]:
# Process test.txt in test mode (segments shorter than 4 characters are kept)
test_dataset = read_dataset('test.txt', verbose=True)
X_words, X, Y = xy_dataset(test_dataset, is_test=True)
convert_to_gold_standard_format(X, Y, name='test')
save_dataset(
    X,
    Y,
    X_words,
    x_file='X_test.csv',
    y_file='Y_test.csv',
    x_words_file='X_words_test.txt',
)

dataset shape:  (2500,)
first 5 examples: 
0    ( قَوْلُهُ : وَلَوْ ادَّعَى وَلَدَ أَمَةٍ مُشْ...
1    قَوْلُهُ : ( وَبَحَثَ الرَّافِعِيُّ صِحَّتَهَا...
2    وَالْهَاوَنُ مِثَالٌ ، فَمِثْلُهُ كُلُّ مَا يَ...
3    وَيَرُدُّ عَلَيْهِ أَنَّ فَاقِدَ الطَّهُورَيْن...
4    كُهَيْلٍ قَالَ سَمِعْتُ أَبِي يُحَدِّثُ عَنْ ح...
Name: 0, dtype: object


100%|██████████| 2500/2500 [00:00<00:00, 4667.90it/s]


In [11]:
# Process test_no_diacritics.txt in test mode (segments shorter than 4
# characters are kept).
# NOTE(review): unlike the other cells, the words file name has no "X_words_"
# prefix here; confirm this is what downstream readers expect.
test_dataset = read_dataset('./test_no_diacritics.txt', verbose=True)
X_words, X, Y = xy_dataset(test_dataset, is_test=True)
convert_to_gold_standard_format(X, Y, name='test_no_diacritics')
save_dataset(
    X,
    Y,
    X_words,
    x_file='X_test_no_diacritics.csv',
    y_file='Y_test_no_diacritics.csv',
    x_words_file='test_no_diacritics.txt',
)

dataset shape:  (2500,)
first 5 examples: 
0    ليس للوكيل بالقبض أن يبرأ المدين أو يهب الدين ...
1    ( قوله ويقع في بعض النسخ بمنفعة ومعين ) أي : أ...
2                   وما ثبت بظني ساقط من قسم المعلوم .
3    26093 - حدثنا عثمان بن عمر قال حدثنا مالك عن ا...
4    لأن الحمل فيها لا يفيد أمية الولد ( فإن كانت ا...
Name: 0, dtype: object


100%|██████████| 2500/2500 [00:00<00:00, 5601.57it/s]


In [12]:


# Compare test_no_harakat.txt with test.txt: once the diacritics are split
# off, both are expected to yield the same character sequences.
# NOTE(review): despite the names, test_dataset holds test_no_harakat.txt and
# test_no_dataset holds test.txt.
test_dataset = read_dataset('test_no_harakat.txt', verbose=True)
test_no_dataset = read_dataset('test.txt', verbose=True)
print(len(test_dataset))

print(len(test_no_dataset))
X_words, X, Y = xy_dataset(test_dataset, is_test=True)
# The second result gets its own words name so that X_words still belongs to
# test_no_harakat.txt when it is saved below (it used to be overwritten here)
XX_words, XX, YY = xy_dataset(test_no_dataset, is_test=True)
print(XX[22] == X[22])
print(len(X))
print(len(XX))
print(XX[22])
print(X[22])
convert_to_gold_standard_format(X, Y, name='test_no_harakat')
convert_to_gold_standard_format(XX, YY, name='test')

save_dataset(X, Y, X_words, x_file='X_test_no_harakat.csv', y_file='Y_test_no_harakat.csv', x_words_file='X_words_test_no_harkat.txt')

dataset shape:  (2500,)
first 5 examples: 
0    ( قوله : ولو ادعى ولد أمة مشتركة ثبت نسبه ، وه...
1    قوله : ( وبحث الرافعي صحتها ) وإن قصد تمليك ال...
2    والهاون مثال ، فمثله كل ما يتعذر كسره على رأسها .
3    ويرد عليه أن فاقد الطهورين ونحوه ليس له صلاة إ...
4    كهيل قال سمعت أبي يحدث عن حبة العرني قال رأيت ...
Name: 0, dtype: object
dataset shape:  (2500,)
first 5 examples: 
0    ( قَوْلُهُ : وَلَوْ ادَّعَى وَلَدَ أَمَةٍ مُشْ...
1    قَوْلُهُ : ( وَبَحَثَ الرَّافِعِيُّ صِحَّتَهَا...
2    وَالْهَاوَنُ مِثَالٌ ، فَمِثْلُهُ كُلُّ مَا يَ...
3    وَيَرُدُّ عَلَيْهِ أَنَّ فَاقِدَ الطَّهُورَيْن...
4    كُهَيْلٍ قَالَ سَمِعْتُ أَبِي يُحَدِّثُ عَنْ ح...
Name: 0, dtype: object
2500
2500


100%|██████████| 2500/2500 [00:00<00:00, 5112.93it/s]
100%|██████████| 2500/2500 [00:00<00:00, 3935.78it/s]


True
11262
11262
[' ', 'أ', 'ي', ' ', 'ا', 'ل', 'ن', 'ا', 'ص', 'ي', 'ة', ' ', 'و', 'ذ', 'ك', 'ر', 'ه', ' ', 'م', 'ر', 'ا', 'ع', 'ا', 'ة', ' ', 'ل', 'ل', 'خ', 'ب', 'ر', ' ', 'و', 'ه', 'و', ' ', 'ق', 'و', 'ل', 'ه', ' ', 'م', 'ق', 'د', 'م', ' ', '،', ' ', 'و', 'ي', 'ج', 'و', 'ز', ' ', 'ت', 'أ', 'ن', 'ي', 'ث', 'ه', ' ', 'أ', 'ي', 'ض', 'ا', ' ', '،', ' ', 'و', 'ا', 'ل', 'ت', 'ذ', 'ك', 'ي', 'ر', ' ', 'ه', 'ن', 'ا', ' ', 'أ', 'و', 'ل', 'ى', ' ']
[' ', 'أ', 'ي', ' ', 'ا', 'ل', 'ن', 'ا', 'ص', 'ي', 'ة', ' ', 'و', 'ذ', 'ك', 'ر', 'ه', ' ', 'م', 'ر', 'ا', 'ع', 'ا', 'ة', ' ', 'ل', 'ل', 'خ', 'ب', 'ر', ' ', 'و', 'ه', 'و', ' ', 'ق', 'و', 'ل', 'ه', ' ', 'م', 'ق', 'د', 'م', ' ', '،', ' ', 'و', 'ي', 'ج', 'و', 'ز', ' ', 'ت', 'أ', 'ن', 'ي', 'ث', 'ه', ' ', 'أ', 'ي', 'ض', 'ا', ' ', '،', ' ', 'و', 'ا', 'ل', 'ت', 'ذ', 'ك', 'ي', 'ر', ' ', 'ه', 'ن', 'ا', ' ', 'أ', 'و', 'ل', 'ى', ' ']


In [13]:
def merge_all_text(file_names: list[str]):
    """
    Concatenate the given text files, in the order listed, into
    CLEAN_OUT_FOLDER/merged.txt (the target file is overwritten).
    """
    merged_path = f'{CLEAN_OUT_FOLDER}/merged.txt'
    with open(merged_path, 'w', encoding="utf8") as outfile:
        for fname in file_names:
            with open(fname, encoding="utf8") as infile:
                outfile.writelines(infile)


# Merge the X_words.txt and X_words_val.txt files of CLEAN_OUT_FOLDER into merged.txt
cln_train = os.path.join(CLEAN_OUT_FOLDER, 'X_words.txt')
cln_val = os.path.join(CLEAN_OUT_FOLDER, 'X_words_val.txt')
merge_all_text([cln_train, cln_val])

In [14]:
def _letters_and_spaces_only(line):
    """Keep only Arabic letters and spaces, collapse whitespace runs and append a newline."""
    kept = ''.join([c for c in line if c in ARABIC_LETTERS or c == ' '])
    return re.sub(r'\s+', ' ', kept) + '\n'


train = read_dataset()
val = read_dataset('val.txt')
test = read_dataset('test.txt')

# Write every split to its own <split>_unsplited.txt and, in train/val/test
# order, to the combined merged_unsplited.txt. Each file is opened in a `with`
# block so that it is flushed and closed once its data has been written.
with open(f'{CLEAN_OUT_FOLDER}/merged_unsplited.txt', 'w', encoding="utf8") as merged_file:
    for split_name, split in (('train', train), ('val', val), ('test', test)):
        with open(f'{CLEAN_OUT_FOLDER}/{split_name}_unsplited.txt', 'w', encoding="utf8") as split_file:
            for t in split:
                ll = _letters_and_spaces_only(t)
                split_file.write(ll)
                merged_file.write(ll)



In [15]:
print(harakat2id)

# Copy dataset test.txt into CLEAN_OUT_FOLDER/test_no_harakat.txt, dropping
# every character that is a key of harakat2id
with open(f'{DATASET_FOLDER}/test.txt', 'r', encoding="utf8") as infile, \
        open(f'{CLEAN_OUT_FOLDER}/test_no_harakat.txt', 'w', encoding="utf8") as outfile:
    for line in infile:
        outfile.write(''.join(c for c in line if c not in harakat2id))


{'َ': 0, 'ً': 1, 'ُ': 2, 'ٌ': 3, 'ِ': 4, 'ٍ': 5, 'ْ': 6, 'ّ': 7, 'َّ': 8, 'ًّ': 9, 'ُّ': 10, 'ٌّ': 11, 'ِّ': 12, 'ٍّ': 13, '': 14}


In [16]:
from nltk.stem.arlstem import ARLSTem

# Quick check of NLTK's ARLSTem stemmer on a single word; the notebook
# displays the returned stem
stemmer = ARLSTem()
stemmer.stem('المدرسة')




'مدرس'