### Quaran Form Analysis

In [None]:
#!pip install lang-trans

In [1]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [2]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quranic Corpus Morphology.txt", "r", encoding="utf8") as file:
    text = file.read()

#### Form Tag

In [3]:
form_tag_list = re.findall("[\(\d*:\d*:\d*:\d*\)][\t](\D*)[\t][\D*]", text)

In [4]:
form_tag_num = re.findall("(\d*:\d*:\d*:\d*)", text)

In [5]:
df_stem = pd.DataFrame(form_tag_num)
df_stem.rename(columns={0:"loc_num"}, inplace=True)
df_stem

Unnamed: 0,loc_num
0,1:1:1:1
1,1:1:1:2
2,1:1:2:1
3,1:1:3:1
4,1:1:3:2
...,...
128214,114:6:2:1
128215,114:6:2:2
128216,114:6:3:1
128217,114:6:3:2


In [6]:
df_stem = df_stem["loc_num"].str.split(":", n=-1, expand=True)
df_stem.rename(columns={0:"num_1", 1:"num_2", 2:"num_3", 3:"num_4"}, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4
0,1,1,1,1
1,1,1,1,2
2,1,1,2,1
3,1,1,3,1
4,1,1,3,2
...,...,...,...,...
128214,114,6,2,1
128215,114,6,2,2
128216,114,6,3,1
128217,114,6,3,2


In [7]:
df_stem['num_1'] = df_stem['num_1'].astype(int)
df_stem['num_2'] = df_stem['num_2'].astype(int)
df_stem['num_3'] = df_stem['num_3'].astype(int)
df_stem['num_4'] = df_stem['num_4'].astype(int)

In [8]:
df_stem["form_tag"] = pd.DataFrame(form_tag_list)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag
0,1,1,1,1,bi\tP
1,1,1,1,2,somi\tN
2,1,1,2,1,{ll~ahi\tPN
3,1,1,3,1,{l\tDET
4,1,1,3,2,r~aHoma`ni\tADJ
...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET
128215,114,6,2,2,jin~api\tN
128216,114,6,3,1,wa\tCONJ
128217,114,6,3,2,{l\tDET


In [9]:
df_stem[["form","tag"]] = df_stem["form_tag"].str.split("\t", n=-1, expand=True)
df_stem.drop(["form_tag"], axis=1, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [10]:
df_stem.drop_duplicates(inplace=True)
df_stem.reset_index(drop=True, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [11]:
#df_word.to_excel("Buckwalter_Form_Tag.xlsx", sheet_name="Form_Tag", index=False)

In [12]:
df_word = pd.DataFrame(df_stem.groupby(["num_1","num_2","num_3"])["form"].sum())
df_word.reset_index(inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [13]:
#df_word.to_excel("Buckwalter_Form_Concat.xlsx", sheet_name="Concat_Form", index=False)

#### Buckwalter Text

In [14]:
#df_word_concat = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [15]:
df_word

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [16]:
concat_list = df_word.iloc[:,3].to_list()
#concat_list = df_word_concat.iloc[:,3].head(500).to_list()

In [17]:
buckwalter_text = " ".join(concat_list)
#buckwalter_text

#### Arabic Text

##### Camel Tools

In [18]:
bw2ar = CharMapper.builtin_mapper('bw2ar')

In [19]:
arabic_text = bw2ar(buckwalter_text)
#arabic_text

In [20]:
#with open("Arabic_With_Extended_Characters.txt", "w", encoding="utf8") as file:
#    file.write(arabic_text)

In [21]:
arabic_text2 = re.sub(r"\[", "", arabic_text)

In [22]:
arabic_text3 = re.sub(r"\]", "", arabic_text2)
#arabic_text3

In [23]:
arabic_text4 = re.sub(r'''([@#:;,.!-+%"])''', "", arabic_text3)
#arabic_text4

In [24]:
arabic_text5 = re.sub(r"\^", " ", arabic_text4)
#arabic_text5

In [25]:
#with open("Arabic_Without_Extended_Characters.txt", "w", encoding="utf8") as file:
#    file.write(arabic_text5)

##### Def Func

In [26]:
# 2-Way Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            "P": u"\u067E", # arabicletterpeh
            "J": u"\u0686", 
            "V": u"\u06A4", # arabicletterveh
            "G": u"\u06AF", # arabiclettergaf
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem           
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [27]:
# Simple Technique
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2unisimple = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "{": u"\u0671", # waSla         
}

def transStringSimple(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2unisimple.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [28]:
def clean(text):
    arabic_text2 = re.sub(r"\[", "", text)
    arabic_text3 = re.sub(r"\]", "", arabic_text2)
    arabic_text4 = re.sub(r'''([@#:;,.!-+%"FNKaui~o`}PJVG])''', "", arabic_text3)
    arabic_text5 = re.sub(r"\^", " ", arabic_text4)
    return arabic_text5

In [None]:
# Simple Technique2
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2unisimple = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "{": u"\u0671", # waSla         
}

def transStringSimple2(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2unisimple.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [None]:
def clean2(text):
    arabic_text2 = re.sub(r"\[", "", text)
    arabic_text3 = re.sub(r"\]", "", arabic_text2)
    arabic_text4 = re.sub(r'''([@#:;,.!-+%"FNKaui~o`}PJVG])''', "", arabic_text3)
    arabic_text5 = re.sub(r"\^", " ", arabic_text4)
    return arabic_text5

In [29]:
arabic_func_text = transString(buckwalter_text, 1)
#arabic_func_text

In [30]:
#with open("Arabic_Converted_Extended_Characters.txt", "w", encoding="utf8") as file:
#    file.write(arabic_func_text)

##### DF Word Concat Form Convert Arabic

In [31]:
#df_word_concat = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [32]:
df_word

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [33]:
df_word["arabic"] = df_word.iloc[:,3].apply(lambda x: transString(x, 1))
df_word

Unnamed: 0,num_1,num_2,num_3,form,arabic
0,1,1,1,bisomi,بِسْمِ
1,1,1,2,{ll~ahi,ٱللَّهِ
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ
4,1,2,1,{loHamodu,ٱلْحَمْدُ
...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ
77426,114,6,1,mina,مِنَ
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ


In [34]:
df_word["arabic_simple"] = df_word.iloc[:,3].apply(lambda x : transStringSimple(x, 1))
df_word["arabic_simple"] = df_word.iloc[:,5].apply(lambda x : clean(x))
df_word

Unnamed: 0,num_1,num_2,num_3,form,arabic,arabic_simple
0,1,1,1,bisomi,بِسْمِ,بسم
1,1,1,2,{ll~ahi,ٱللَّهِ,ٱلله
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ,ٱلرحمن
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ,ٱلرحيم
4,1,2,1,{loHamodu,ٱلْحَمْدُ,ٱلحمد
...,...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ,صدور
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ,ٱلناس
77426,114,6,1,mina,مِنَ,من
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ,ٱلجنة


In [35]:
df_word = df_word[["num_1","num_2","num_3","arabic_simple","arabic","form"]]
df_word.rename(columns={"form":"buckwalter"}, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,arabic_simple,arabic,buckwalter
0,1,1,1,بسم,بِسْمِ,bisomi
1,1,1,2,ٱلله,ٱللَّهِ,{ll~ahi
2,1,1,3,ٱلرحمن,ٱلرَّحْمَٰنِ,{lr~aHoma`ni
3,1,1,4,ٱلرحيم,ٱلرَّحِيمِ,{lr~aHiymi
4,1,2,1,ٱلحمد,ٱلْحَمْدُ,{loHamodu
...,...,...,...,...,...,...
77424,114,5,4,صدور,صُدُورِ,Suduwri
77425,114,5,5,ٱلناس,ٱلنَّاسِ,{ln~aAsi
77426,114,6,1,من,مِنَ,mina
77427,114,6,2,ٱلجنة,ٱلْجِنَّةِ,{lojin~api


In [36]:
#df_word.to_excel("Arabic_Simple_Word.xlsx", sheet_name="Form_Arabic_Simple", index=False)

In [37]:
df_word_count = pd.DataFrame(df_word.iloc[:,4].value_counts(ascending=False))
df_word_count.reset_index(inplace=True)
df_word_count.rename(columns={"index":"arabic","arabic":"frequency"}, inplace=True)
df_word_count

Unnamed: 0,arabic,frequency
0,فِى,1098
1,ٱللَّهِ,828
2,ٱلَّذِينَ,810
3,ٱللَّهُ,733
4,مِن,728
...,...,...
18988,يُنكِرُ,1
18989,أُشْرِكَ,1
18990,مَـَٔابِ,1
18991,وَذُرِّيَّةً,1


In [39]:
#df_word_count.to_excel("Arabic_Word_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Frequency", index=False)

##### Camal Tools Word Tokenize 

In [40]:
import camel_tools.tokenizers.word

In [41]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quran Simple Clean.txt", "r", encoding="utf8") as file:
    arabic_text_simple =file.read()

In [42]:
#arabic_text_simple

In [43]:
#re.findall("(\d*\|\d*\|\D*)", arabic_text_simple)
arabic_simple_list = re.findall("[\d*\d*][\|](\D*)[\n]", arabic_text_simple)
arabic_simple_list

['بسم الله الرحمن الرحيم',
 'الحمد لله رب العالمين',
 'الرحمن الرحيم',
 'مالك يوم الدين',
 'إياك نعبد وإياك نستعين',
 'اهدنا الصراط المستقيم',
 'صراط الذين أنعمت عليهم غير المغضوب عليهم ولا الضالين',
 'بسم الله الرحمن الرحيم الم',
 'ذلك الكتاب لا ريب فيه هدى للمتقين',
 'الذين يؤمنون بالغيب ويقيمون الصلاة ومما رزقناهم ينفقون',
 'والذين يؤمنون بما أنزل إليك وما أنزل من قبلك وبالآخرة هم يوقنون',
 'أولئك على هدى من ربهم وأولئك هم المفلحون',
 'إن الذين كفروا سواء عليهم أأنذرتهم أم لم تنذرهم لا يؤمنون',
 'ختم الله على قلوبهم وعلى سمعهم وعلى أبصارهم غشاوة ولهم عذاب عظيم',
 'ومن الناس من يقول آمنا بالله وباليوم الآخر وما هم بمؤمنين',
 'يخادعون الله والذين آمنوا وما يخدعون إلا أنفسهم وما يشعرون',
 'في قلوبهم مرض فزادهم الله مرضا ولهم عذاب أليم بما كانوا يكذبون',
 'وإذا قيل لهم لا تفسدوا في الأرض قالوا إنما نحن مصلحون',
 'ألا إنهم هم المفسدون ولكن لا يشعرون',
 'وإذا قيل لهم آمنوا كما آمن الناس قالوا أنؤمن كما آمن السفهاء ألا إنهم هم السفهاء ولكن لا يعلمون',
 'وإذا لقوا الذين آمنوا قالوا آمنا وإذ

In [44]:
ar2bw = CharMapper.builtin_mapper('ar2bw')

In [45]:
buckwalter_simple_list = []
for i in arabic_simple_list:
    buckwalter_word = ar2bw(i)
    buckwalter_simple_list.append(buckwalter_word)

In [46]:
buckwalter_simple_list

['bsm Allh AlrHmn AlrHym',
 'AlHmd llh rb AlEAlmyn',
 'AlrHmn AlrHym',
 'mAlk ywm Aldyn',
 '<yAk nEbd w<yAk nstEyn',
 'AhdnA AlSrAT Almstqym',
 'SrAT Al*yn >nEmt Elyhm gyr AlmgDwb Elyhm wlA AlDAlyn',
 'bsm Allh AlrHmn AlrHym Alm',
 '*lk AlktAb lA ryb fyh hdY llmtqyn',
 'Al*yn y&mnwn bAlgyb wyqymwn AlSlAp wmmA rzqnAhm ynfqwn',
 'wAl*yn y&mnwn bmA >nzl <lyk wmA >nzl mn qblk wbAl|xrp hm ywqnwn',
 '>wl}k ElY hdY mn rbhm w>wl}k hm AlmflHwn',
 "<n Al*yn kfrwA swA' Elyhm >>n*rthm >m lm tn*rhm lA y&mnwn",
 'xtm Allh ElY qlwbhm wElY smEhm wElY >bSArhm g$Awp wlhm E*Ab EZym',
 'wmn AlnAs mn yqwl |mnA bAllh wbAlywm Al|xr wmA hm bm&mnyn',
 'yxAdEwn Allh wAl*yn |mnwA wmA yxdEwn <lA >nfshm wmA y$Erwn',
 'fy qlwbhm mrD fzAdhm Allh mrDA wlhm E*Ab >lym bmA kAnwA yk*bwn',
 'w<*A qyl lhm lA tfsdwA fy Al>rD qAlwA <nmA nHn mSlHwn',
 '>lA <nhm hm Almfsdwn wlkn lA y$Erwn',
 "w<*A qyl lhm |mnwA kmA |mn AlnAs qAlwA >n&mn kmA |mn AlsfhA' >lA <nhm hm AlsfhA' wlkn lA yElmwn",
 'w<*A lqwA Al*yn |mnwA qAlwA |mnA w<*

In [47]:
buckwalter_simple_text = " ".join(buckwalter_simple_list)

In [48]:
#with open("Buckwalter_Simple_Text.txt", "w", encoding="utf8") as file:
#    file.write(buckwalter_simple_text)

In [49]:
bw2ar = CharMapper.builtin_mapper('bw2ar')

In [50]:
arabic_simple_text = bw2ar(buckwalter_simple_text)
#arabic_simple_text

In [51]:
#arabic_simple_text2=transString(buckwalter_simple_text, 1)

In [52]:
#with open("Arabic_Simple_Text.txt", "w", encoding="utf8") as file:
#    file.write(arabic_simple_text)

In [53]:
simple_words = camel_tools.tokenizers.word.simple_word_tokenize(arabic_simple_text)

In [54]:
df_simple = pd.DataFrame(simple_words, columns=["arabic_simple"])
df_simple

Unnamed: 0,arabic_simple
0,بسم
1,الله
2,الرحمن
3,الرحيم
4,الحمد
...,...
78323,=
78324,=
78325,=
78326,=


In [55]:
df_simple_count = pd.DataFrame(df_simple.iloc[:,0].value_counts(ascending=False))
df_simple_count.reset_index(inplace=True)
df_simple_count.rename(columns={"index":"arabic_simple","arabic_simple":"frequency"}, inplace=True)
df_simple_count

Unnamed: 0,arabic_simple,frequency
0,من,2763
1,الله,2265
2,في,1185
3,ما,1013
4,إن,966
...,...,...
14874,غربت,1
14875,وشاورهم,1
14876,تزاور,1
14877,طلعت,1


In [56]:
#df_simple_count.to_excel("Arabic_Word_Frequency_From_Simple.xlsx", sheet_name="Arabic_Simple_Frequency", index=False)

In [57]:
extended_words = camel_tools.tokenizers.word.simple_word_tokenize(arabic_func_text)

In [58]:
df_extended = pd.DataFrame(extended_words, columns=["arabic_extended"])
df_extended

Unnamed: 0,arabic_extended
0,بِسْمِ
1,ٱللَّهِ
2,ٱلرَّحْمَٰنِ
3,ٱلرَّحِيمِ
4,ٱلْحَمْدُ
...,...
77425,صُدُورِ
77426,ٱلنَّاسِ
77427,مِنَ
77428,ٱلْجِنَّةِ


In [59]:
df_extended_count = pd.DataFrame(df_extended.iloc[:,0].value_counts(ascending=False))
df_extended_count.reset_index(inplace=True)
df_extended_count.rename(columns={"index":"arabic_extended","arabic_extended":"frequency"}, inplace=True)
df_extended_count

Unnamed: 0,arabic_extended,frequency
0,فِى,1098
1,ٱللَّهِ,828
2,ٱلَّذِينَ,810
3,ٱللَّهُ,733
4,مِن,728
...,...,...
18989,أَسْمَآءً,1
18990,أَمَّآ,1
18991,أَحَدُكُمَا,1
18992,فَيَسْقِى,1


In [60]:
#df_extended_count.to_excel("Arabic_Word_Frequency_From_Simple.xlsx", sheet_name="Arabic_Extended_Frequency", index=False)

In [61]:
set_extend = set(df_extended_count.iloc[:,0])

In [62]:
set_word = set(df_word_count.iloc[:,0])

In [63]:
set_extend.difference(set_word)

{'إِلْ', 'يَاسِينَ'}

In [64]:
set_word.difference(set_extend)

{'إِلْ يَاسِينَ'}