In [67]:
DATASET_FOLDER = 'dataset'
CLEAN_OUT_FOLDER = 'clean_out'
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Autoreload
%load_ext autoreload


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
# read dataset train 
def read_dataset(file_name = 'train.txt' ,verbose=False):
    """
    Load a dataset file as a pandas Series holding one text example per line.

    file_name: name of the file inside DATASET_FOLDER (default 'train.txt')
    verbose: when True, print the shape and the first 5 examples
    return: the single text column of the file, squeezed to a pandas Series
    """
    import csv  # local import: only needed for the quoting constant below

    dataset_path = os.path.join(DATASET_FOLDER, file_name)
    # Each line of the file is one example. Quote handling is disabled because
    # the corpus contains literal '"' characters: with the default quoting a
    # line that starts with '"' is parsed as a quoted field, which drops the
    # quote characters and can merge several lines into a single example.
    dataset = pd.read_csv(
        dataset_path,
        sep='\t',
        header=None,
        quoting=csv.QUOTE_NONE,
        encoding='utf-8',
    ).squeeze('columns')

    if verbose:
        print('dataset shape: ', dataset.shape)
        print('first 5 examples: ')
        print(dataset[:5])

    return dataset

# Quick check of the loader on the default training file; the trailing ';'
# keeps the returned Series from being echoed as the cell output.
read_dataset(verbose=True);

dataset shape:  (50000,)
first 5 examples: 
0    قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَ...
1    ابْنُ عَرَفَةَ : قَوْلُهُ : بِلَفْظٍ يَقْتَضِي...
2    ( قَوْلُهُ لِعَدَمِ مَا تَتَعَلَّقُ إلَخْ ) أَ...
3                       وَحَيَوَانٌ غَيْرُ مَوْجُودٍ .
4    فَائِدَةٌ : قَالَ بَعْضُهُمْ : يُؤْخَذُ مِنْ ش...
Name: 0, dtype: object


In [69]:
text = 'تَغْتَرَّ'
print(len(text))

# shadda and other diacritics are single characters
for i in range(len(text)):
    print(text[i], ord(text[i]))
    
print('--------------------')

print("يّيلعبُُُ")


9
ت 1578
َ 1614
غ 1594
ْ 1618
ت 1578
َ 1614
ر 1585
ّ 1617
َ 1614
--------------------
يّيلعبُُُ


In [70]:
# Collect every character of the training set that is not an Arabic letter
# (diacritics, digits, punctuation, whitespace, ...).
from constants import ARABIC_LETTERS 
All_NON_Letters = {
    char
    for example in read_dataset()
    for char in example
    if char not in ARABIC_LETTERS
}

print(All_NON_Letters)

{':', '8', '،', '»', '7', '~', '.', ']', 'ً', '!', '\u200f', 'ٍ', '؟', '[', ')', '5', '4', '3', '9', 'ُ', '*', '«', '(', '"', '–', '`', '0', '1', ',', '{', '}', 'ٌ', 'ْ', '2', ';', "'", '/', '-', '؛', 'ِ', '6', ' ', 'ّ', 'َ'}


In [71]:
from regex import P
from constants.arabic import HARAKAT
from constants import PUNCTUATIONS
from constants.arabic import ARABIC_NUMBERS

import re
import re
from constants.arabic import ARABIC_NUMBERS
from constants import PUNCTUATIONS
# Compile the regular expressions once so clean_text can reuse them.
# The character classes are built from project constants, so their content is
# passed through re.escape: characters such as ']', '\', '^' or '-' would
# otherwise be read as character-class syntax instead of literal characters.
number_regex = re.compile(r'[0-9]')
arabic_number_regex = re.compile('[' + re.escape("".join(ARABIC_NUMBERS)) + ']')
punctuation_regex = re.compile('[' + re.escape("".join(PUNCTUATIONS)) + ']')
spaces_regex = re.compile(r'\s+')

def clean_text(text):
    """
    Strip digits from a string and normalise its whitespace.

    Western digits (0-9) and the characters listed in ARABIC_NUMBERS are
    removed, every run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is trimmed. Punctuation is left untouched.
    """
    digits_removed = arabic_number_regex.sub('', number_regex.sub('', text))
    single_spaced = spaces_regex.sub(' ', digits_removed)
    return single_spaced.strip()

# Sanity check on a string that mixes digits, punctuation, diacritics and padding spaces.
text = '   0110ز؟:.+_@#$.زتَغْتَرَّ2//،،312)()( جداى آيئءؤرلاىةوزظلآج)'
print(clean_text(text))

dataset = read_dataset()

# Show the first example before and after cleaning
# (nothing is printed when the dataset is empty).
for t in dataset[:1]:
    print(t, clean_text(t), '---------------------', sep='\n')

# Clean the collected non-letter characters: the digits disappear, while the
# diacritics, punctuation and spaces are still there for now.
non_letters = ' '.join(All_NON_Letters)
print(non_letters)
cleaned = clean_text(non_letters)
print(cleaned)

ز؟:.+_@#$.زتَغْتَرَّ//،،)()( جداى آيئءؤرلاىةوزظلآج)
قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ( 14 / 123 )
قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ( / )
---------------------
: 8 ، » 7 ~ . ] ً ! ‏ ٍ ؟ [ ) 5 4 3 9 ُ * « ( " – ` 0 1 , { } ٌ ْ 2 ; ' / - ؛ ِ 6   ّ َ
: ، » ~ . ] ً ! ‏ ٍ ؟ [ ) ُ * « ( " – ` , { } ٌ ْ ; ' / - ؛ ِ ّ َ


In [72]:
# Create X and Y to the model
# X is the input and Y is the output (the target)
# X is a list of all the characters in the dataset
# Y is a list of all Diacritics in the dataset (the target)
# if the character has no diacritic, the diacritic is set to be $ (empty diacritic)
from constants.arabic import HARAKAT,SHADDA
from tqdm import tqdm
train = read_dataset()


def _label_segment(segment):
    """
    Split one text segment into parallel lists of characters and diacritic labels.

    segment: string to label
    return: (chars, labels) lists of equal length; labels[k] is the diacritic
        that directly follows chars[k] in the segment, or '$' when none does.
        A SHADDA followed by another mark is kept as one two-character label.
    """
    chars = []
    labels = []
    length = len(segment)
    i = 0
    while i < length:
        chars.append(segment[i])
        i += 1

        # end of the segment, or the next character is not a tashkeel:
        # the character carries the empty diacritic
        if i >= length or segment[i] not in HARAKAT:
            labels.append('$')
            continue

        tashkeel = segment[i]
        i += 1
        # a shadda does not come alone: merge it with the mark that follows it
        if tashkeel == SHADDA and i < length and segment[i] in HARAKAT:
            tashkeel += segment[i]
            i += 1

        # NOTE(review): any further mark after this point is emitted as an
        # input character of its own on the next iteration (e.g. the second of
        # two consecutive non-shadda marks) - confirm this is intended.
        labels.append(tashkeel)

    return chars, labels


def xy_dataset(dataset):
    """
    Build the model inputs and targets from raw text examples.

    Each example is cleaned with clean_text and split on the delimiters below;
    segments shorter than 4 characters are dropped. Segments are not stripped,
    so they may start or end with a space.

    dataset: iterable of strings (e.g. the pandas Series from read_dataset)
    return: (X_words, X, Y) as lists with one entry per kept segment
        X_words: the segment's words (its characters joined, split on whitespace)
        X: the segment's characters
        Y: the diacritic label of each character in X ('$' = no diacritic)
    """
    delimiters = [",", "|", ";","؛",".", "(" , ")","<",">","[","]","{","}"]
    # Built once: the pattern does not depend on the line being processed.
    splitter = re.compile('|'.join(map(re.escape, delimiters)))

    X = []
    X_words = []
    Y = []

    for line in tqdm(dataset):
        for segment in splitter.split(clean_text(line)):
            if len(segment) < 4:
                continue

            line_x, line_y = _label_segment(segment)
            X_words.append(''.join(line_x).split())
            X.append(line_x)
            Y.append(line_y)

    return X_words, X, Y
# Build the character / label sequences for the training split.
train_set = read_dataset()
X_words, X, Y = xy_dataset(train_set)

# Every input sequence must be paired with a label sequence of equal length.
assert all(len(chars) == len(labels) for chars, labels in zip(X, Y))

# Inspect one segment: its words, its characters and its labels.
for view in (X_words[544], ''.join(X[544]), Y[544]):
    print(view)


                
            
# save outs into 3 files in CLEAN_OUT_FOLDER   

def save_dataset(X, Y, X_words, x_file='X.csv', y_file='Y.csv', x_words_file='X_words.txt'):
    """
    Write X, Y and X_words to three text files inside CLEAN_OUT_FOLDER.

    X, Y: lists of sequences of strings; each sequence becomes one line with
        its items joined by the letter 's'
    X_words: list of word lists; each list becomes one space-separated line
    x_file, y_file, x_words_file: output file names inside CLEAN_OUT_FOLDER
    """
    # exist_ok replaces the separate exists() check and its check-then-create race
    os.makedirs(CLEAN_OUT_FOLDER, exist_ok=True)

    # (file name, sequences, item separator) for each output, in write order.
    # NOTE(review): the 's' separator is only unambiguous while no item is or
    # contains a Latin 's' - verify against the code that reads these files.
    outputs = (
        (x_file, X, 's'),
        (y_file, Y, 's'),
        (x_words_file, X_words, ' '),
    )
    for file_name, sequences, separator in outputs:
        # Explicit UTF-8: the default encoding is locale-dependent and may be
        # unable to represent Arabic text on some systems.
        with open(os.path.join(CLEAN_OUT_FOLDER, file_name), 'w', encoding='utf-8') as f:
            for sequence in sequences:
                f.write(separator.join(sequence) + '\n')
    
# Persist the training split under the default file names (X.csv, Y.csv, X_words.txt).
save_dataset(X, Y, X_words)


    



100%|██████████| 50000/50000 [00:15<00:00, 3295.89it/s]


['أي', ':', 'وبعد', 'ارتفاع', 'العقد', 'يملك', 'العامل', 'الربح', 'المشروط', 'له']
 أي : وبعد ارتفاع العقد يملك العامل الربح المشروط له 
['$', 'َ', 'ْ', '$', '$', '$', 'َ', 'َ', 'ْ', 'َ', '$', '$', 'ْ', 'ِ', 'َ', '$', 'ِ', '$', '$', 'ْ', 'َ', 'ْ', 'ِ', '$', 'َ', 'ْ', 'ِ', 'ُ', '$', '$', 'ْ', 'َ', '$', 'ِ', 'ُ', '$', '$', '$', 'ِّ', 'ْ', 'َ', '$', '$', 'ْ', 'َ', 'ْ', 'ُ', '$', 'َ', '$', 'َ', 'ُ', '$']


In [73]:
# Show the sample's characters next to their diacritic labels, one pair per line.
sample_chars = X[544]
sample_labels = Y[544]
print(len(sample_chars))
print(len(sample_labels))
for pair in zip(sample_chars, sample_labels):
    print(*pair)

53
53
  $
أ َ
ي ْ
  $
: $
  $
و َ
ب َ
ع ْ
د َ
  $
ا $
ر ْ
ت ِ
ف َ
ا $
ع ِ
  $
ا $
ل ْ
ع َ
ق ْ
د ِ
  $
ي َ
م ْ
ل ِ
ك ُ
  $
ا $
ل ْ
ع َ
ا $
م ِ
ل ُ
  $
ا $
ل $
ر ِّ
ب ْ
ح َ
  $
ا $
ل ْ
م َ
ش ْ
ر ُ
و $
ط َ
  $
ل َ
ه ُ
  $


In [74]:
# Run the same pipeline on the validation split and save it under the *_val names.
# NOTE: this rebinds X_words, X and Y, which held the training data until here.
val_dataset = read_dataset(file_name='val.txt', verbose=True)
X_words, X, Y = xy_dataset(val_dataset)
save_dataset(
    X,
    Y,
    X_words,
    x_file='X_val.csv',
    y_file='Y_val.csv',
    x_words_file='X_words_val.txt',
)


dataset shape:  (2500,)
first 5 examples: 
0       ( 27 ) قَوْلُهُ : وَلَا تُكْرَهُ ضِيَافَتُهُ .
1    ( الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ ...
2    ( قَوْلُهُ : وَهُوَ ) أَيْ : الْبَيْعُ بِالْمَ...
3    وَالْعَفْوُ قَبْلَ الْإِمَامِ ، أَوْ بَعْدَهُ ...
4    ( قَوْلُهُ : وَرِبْحُهُ ) أَيْ الْقِرَاضِ وَقَ...
Name: 0, dtype: object


 26%|██▌       | 643/2500 [00:00<00:00, 3230.37it/s]

100%|██████████| 2500/2500 [00:00<00:00, 3246.05it/s]


In [75]:
def merge_all_text(file_names: list[str]):
    """
    Concatenate text files into CLEAN_OUT_FOLDER/merged.txt.

    file_names: paths of the files to append, in the given order
    The output file is overwritten on every call and the lines are copied
    unchanged.
    """
    merged_path = os.path.join(CLEAN_OUT_FOLDER, 'merged.txt')
    # Explicit UTF-8 on both sides: the default encoding is locale-dependent
    # and may be unable to read or write Arabic text on some systems.
    with open(merged_path, 'w', encoding='utf-8') as outfile:
        for fname in file_names:
            with open(fname, encoding='utf-8') as infile:
                outfile.writelines(infile)
                    
# Merge the cleaned word files of the training and validation splits.
cln_train, cln_val = (
    os.path.join(CLEAN_OUT_FOLDER, name)
    for name in ('X_words.txt', 'X_words_val.txt')
)
merge_all_text([cln_train, cln_val])

In [76]:
from nltk.stem.arlstem import ARLSTem

# Try NLTK's ARLSTem stemmer on a single word; the stem is shown as the cell output.
stemmer = ARLSTem()
stemmer.stem('المدرسة')




'مدرس'