### First we read the data and clean it

Imports

In [73]:
# !pip install pyarabic
# !pip install qalsadi
# !pip install farasapy
# !pip install diacritization_evaluation
# !pip install nltk
# !pip install tensorflow

In [74]:
# !pip install torch

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pyarabic.araby import strip_tashkeel


Helper Functions

In [76]:
def replace_unicode_sequences(arabic_words: list) -> list:
    """Replace every shadda + vowel-mark pair with a single placeholder digit.

    Each two-code-point sequence made of a shadda (U+0651) immediately
    followed by one of the marks U+064B..U+0650 is collapsed into one
    Arabic-Indic digit (١ .. ٦).  A shadda that is not followed by one of
    those marks, and any pair written in the opposite order, is left as is.

    Args:
        arabic_words: words (strings) to process; the list is not modified.

    Returns:
        A new list with the substituted words, in the same order.
    """
    # (regex for the shadda+mark pair, placeholder digit that replaces it)
    substitutions = (
        (r'\u0651\u064B', '١'),  # shadda + fathatan
        (r'\u0651\u064C', '٢'),  # shadda + dammatan
        (r'\u0651\u064D', '٣'),  # shadda + kasratan
        (r'\u0651\u064E', '٤'),  # shadda + fatha
        (r'\u0651\u064F', '٥'),  # shadda + damma
        (r'\u0651\u0650', '٦'),  # shadda + kasra
    )
    converted = []
    for original in arabic_words:
        current = original
        for pattern, digit in substitutions:
            current = re.sub(pattern, digit, current)
        converted.append(current)
    return converted

def replace_unicode_sequences_in_sentence(sentence: str):
    """Apply ``replace_unicode_sequences`` to every word of a sentence.

    The sentence is split on whitespace, each word is converted, and the
    words are re-joined with single spaces, so runs of whitespace and
    leading/trailing whitespace are not preserved.

    Args:
        sentence: the text to convert.

    Returns:
        The converted sentence as one space-separated string.
    """
    return ' '.join(replace_unicode_sequences(sentence.split()))


In [77]:
def segment_words(words: list) -> list:
    """Build a per-character position-tag string for every word.

    Each word is mapped to a string with one tag per character:
    ``'S'`` for a single-character word, otherwise ``'B'`` for the first
    character, ``'I'`` for every middle character and ``'E'`` for the last.
    An empty word maps to an empty tag string, so the tag string always has
    the same length as the word it describes.

    Args:
        words: the words to tag; the list is not modified.

    Returns:
        A new list of tag strings, aligned index-by-index with ``words``.
    """
    res = []
    for word in words:
        length = len(word)
        if length == 0:
            # Previously an empty word produced 'BE' (two tags for zero
            # characters); keep tags aligned with the characters instead.
            res.append('')
        elif length == 1:
            res.append('S')
        else:
            res.append('B' + 'I' * (length - 2) + 'E')
    return res


Setup

In [78]:
# file_path = './Dataset/train.txt'

# # Read Arabic text from the file
# with open(file_path, 'r', encoding='utf-8') as file:
#     arabic_text_from_file = file.read()

# #print(arabic_text_from_file)

# #Split the text into words
# arabic_words = arabic_text_from_file.split()

# #remove brackets, commas, dots, numbers using regex
# arabic_words = [re.sub(r'[^\u0600-\u0660\.]+', '', word) for word in arabic_words]
# stop_symbols = ['.', '،', '؟', '؛', '!', ':','...', '?.']
# arabic_words = [word for word in arabic_words if word and (len(word) > 1 or word in stop_symbols)]

In [79]:
# with open('./Dataset/words.txt', 'w', encoding='utf-8') as file:
#     for word in arabic_words:
#         file.write(word + '\n')

# #join the words into a string and split it into sentences at each dot, Arabic semicolon (؛), or Arabic question mark (؟)
# arabic_sentences = ' '.join(arabic_words)
# arabic_sentences = re.split(r'[\.\u061B\u061F]', arabic_sentences) 
# arabic_sentences = [sentence for sentence in arabic_sentences if sentence]

# with open('./Dataset/sentences.txt', 'w', encoding='utf-8') as file:
#     for sentence in arabic_sentences:
#         file.write(sentence + '\n')


Words without tashkeel

In [80]:
# arabic_words_without_punc = [re.sub(r'([^\u0600-\u0660\.]+)|[،\.\u061A-\u061F]+', '', word) for word in arabic_words]
# arabic_words_without_punc = [word for word in arabic_words_without_punc if word and (len(word) > 1 or word in stop_symbols)]
# arabic_words_without_punc = list((strip_tashkeel(word) for word in arabic_words_without_punc)) 

# with open('./Dataset/WordsWithoutTashkeel.txt', 'w', encoding='utf-8') as output_file:
#     for word in arabic_words_without_punc:
#         output_file.write((word) + '\n')

# with open('./Dataset/SentencesWithoutTashkeel.txt', 'w', encoding='utf-8') as output_file:
#     for sentence in arabic_sentences:
#         output_file.write(strip_tashkeel(sentence) + '\n')


In [81]:
# res = replace_unicode_sequences(arabic_words)
# print(len(res))
# with open('./Dataset/words_new_approach.txt', 'w', encoding='utf-8') as file:
#     for word in res:
#         file.write(word + '\n')

In [82]:
# #join the words into a string and split it into sentences at each dot, Arabic semicolon (؛), or Arabic question mark (؟)
# arabic_sentences = ' '.join(res)
# arabic_sentences = re.split(r'[\.\u061B\u061F]', arabic_sentences) 
# arabic_sentences = [sentence for sentence in arabic_sentences if sentence]

# with open('./Dataset/sentences_new_approach.txt', 'w', encoding='utf-8') as file:
#     for sentence in arabic_sentences:
#         file.write(sentence + '\n')


### REFACTORED

Cleaning Training Data

In [83]:
# Read the whole raw training corpus; it is processed line by line below.
with open('./Dataset/training/train.txt', 'r', encoding='utf-8') as file:
    train_txt = file.read()

# Every non-empty, punctuation-free word of the corpus, in reading order.
train_words = []

# One pass over the corpus produces four line-aligned / word-level files:
#   train_cleaned.txt  - cleaned lines
#   train_stripped.txt - cleaned lines passed through strip_tashkeel
#   train_replace.txt  - cleaned lines with shadda pairs replaced by digits
#   train_words.txt    - one word per line
with open('./Dataset/training/train_cleaned.txt', 'w', encoding='utf-8') as cleaned_file, \
     open('./Dataset/training/train_stripped.txt', 'w', encoding='utf-8') as output_file, \
     open('./Dataset/training/train_replace.txt', 'w', encoding='utf-8') as replace_file, \
     open('./Dataset/training/train_words.txt', 'w', encoding='utf-8') as words_file:
    for sentence in train_txt.split('\n'):
        # Drop everything outside U+0600-U+0660, space and dot, then
        # squeeze runs of spaces and trim both ends.
        sentence = re.sub(r' +', ' ', re.sub(r'[^\u0600-\u0660 \.]+', '', sentence)).strip()
        print(sentence, file=cleaned_file)
        print(strip_tashkeel(sentence), file=output_file)
        print(replace_unicode_sequences_in_sentence(sentence), file=replace_file)
        for word in sentence.split():
            # Remove dots, Arabic semicolons and Arabic commas from the word.
            word = re.sub(r'[\.؛،]', '', word).strip()
            if not word:
                continue  # the token was punctuation only
            train_words.append(word)
            print(word, file=words_file)

# Word list again, this time with shadda pairs replaced by placeholder digits.
with open('./Dataset/training/train_words_replaced.txt', 'w', encoding='utf-8') as output_file:
    for word in replace_unicode_sequences(train_words):
        print(word, file=output_file)

Cleaning Validation Data

In [84]:
# Read the whole raw validation corpus; it is processed line by line below.
with open('./Dataset/val/val.txt', 'r', encoding='utf-8') as file:
    val_txt = file.read()

# Every non-empty, punctuation-free word of the corpus, in reading order.
val_words = []

# One pass over the corpus produces four line-aligned / word-level files:
#   val_cleaned.txt  - cleaned lines
#   val_stripped.txt - cleaned lines passed through strip_tashkeel
#   val_replaced.txt - cleaned lines with shadda pairs replaced by digits
#   val_words.txt    - one word per line
with open('./Dataset/val/val_cleaned.txt', 'w', encoding='utf-8') as cleaned_file, \
     open('./Dataset/val/val_stripped.txt', 'w', encoding='utf-8') as output_file, \
     open('./Dataset/val/val_replaced.txt', 'w', encoding='utf-8') as replaced_file, \
     open('./Dataset/val/val_words.txt', 'w', encoding='utf-8') as words_file:
    for sentence in val_txt.split('\n'):
        # Drop everything outside U+0600-U+0660, space and dot, then
        # squeeze runs of spaces and trim both ends.
        sentence = re.sub(r' +', ' ', re.sub(r'[^\u0600-\u0660 \.]+', '', sentence)).strip()
        print(sentence, file=cleaned_file)
        print(strip_tashkeel(sentence), file=output_file)
        print(replace_unicode_sequences_in_sentence(sentence), file=replaced_file)
        for word in sentence.split():
            # Remove dots, Arabic semicolons and Arabic commas from the word.
            word = re.sub(r'[\.؛،]', '', word).strip()
            if not word:
                continue  # the token was punctuation only
            val_words.append(word)
            print(word, file=words_file)

# Word list again, this time with shadda pairs replaced by placeholder digits.
with open('./Dataset/val/val_words_replaced.txt', 'w', encoding='utf-8') as output_file:
    for word in replace_unicode_sequences(val_words):
        print(word, file=output_file)

Segmenting Data

In [88]:
# Training words with strip_tashkeel applied, saved one per line.
train_words_stripped = [strip_tashkeel(word) for word in train_words]
with open('./Dataset/training/train_words_stripped.txt', 'w', encoding='utf-8') as output_file:
    for word in train_words_stripped:
        print(word, file=output_file)

# Same for the validation words.
val_words_stripped = [strip_tashkeel(word) for word in val_words]
with open('./Dataset/val/val_words_stripped.txt', 'w', encoding='utf-8') as output_file:
    for word in val_words_stripped:
        print(word, file=output_file)

# Position tags are computed on the stripped words.
segmented_train_words = segment_words(train_words_stripped)
segmented_val_words = segment_words(val_words_stripped)

Testing Segmentation Functions

In [86]:
# Show the first ten validation words and their tag strings.
print(val_words[0:10])
# Tags are printed in reverse order so that the left-to-right English tags
# line up visually with the right-to-left Arabic words (display only).
print(list(reversed(segmented_val_words[0:10])))


['قَوْلُهُ', 'وَلَا', 'تُكْرَهُ', 'ضِيَافَتُهُ', 'الْفَرْقُ', 'الثَّالِثُ', 'وَالثَّلَاثُونَ', 'بَيْنَ', 'قَاعِدَةِ', 'تَقَدُّمِ']
['BIIE', 'BIIIE', 'BIE', 'BIIIIIIIE', 'BIIIIE', 'BIIIE', 'BIIIIE', 'BIIE', 'BIE', 'BIIE']


In [87]:
# Show the first ten stripped training words, then their tag strings in
# reverse order.
print(train_words_stripped[0:10])
print(list(reversed(segmented_train_words[0:10])))

['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']
['BIIE', 'BIE', 'BIIIIIE', 'BIE', 'BIE', 'BIE', 'BIIIE', 'BIE', 'BE', 'BIIE']
