### Quaran Form Analysis

In [104]:
#!pip install lang-trans

In [105]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [106]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quranic Corpus Morphology.txt", "r", encoding="utf8") as file:
    text = file.read()

#### Form Tag

In [107]:
form_tag_list = re.findall("[\(\d*:\d*:\d*:\d*\)][\t](\D*)[\t][\D*]", text)

In [108]:
form_tag_num = re.findall("(\d*:\d*:\d*:\d*)", text)

In [109]:
df_stem = pd.DataFrame(form_tag_num)
df_stem.rename(columns={0:"loc_num"}, inplace=True)
df_stem

Unnamed: 0,loc_num
0,1:1:1:1
1,1:1:1:2
2,1:1:2:1
3,1:1:3:1
4,1:1:3:2
...,...
128214,114:6:2:1
128215,114:6:2:2
128216,114:6:3:1
128217,114:6:3:2


In [110]:
df_stem = df_stem["loc_num"].str.split(":", n=-1, expand=True)
df_stem.rename(columns={0:"num_1", 1:"num_2", 2:"num_3", 3:"num_4"}, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4
0,1,1,1,1
1,1,1,1,2
2,1,1,2,1
3,1,1,3,1
4,1,1,3,2
...,...,...,...,...
128214,114,6,2,1
128215,114,6,2,2
128216,114,6,3,1
128217,114,6,3,2


In [111]:
df_stem['num_1'] = df_stem['num_1'].astype(int)
df_stem['num_2'] = df_stem['num_2'].astype(int)
df_stem['num_3'] = df_stem['num_3'].astype(int)
df_stem['num_4'] = df_stem['num_4'].astype(int)

In [112]:
df_stem["form_tag"] = pd.DataFrame(form_tag_list)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag
0,1,1,1,1,bi\tP
1,1,1,1,2,somi\tN
2,1,1,2,1,{ll~ahi\tPN
3,1,1,3,1,{l\tDET
4,1,1,3,2,r~aHoma`ni\tADJ
...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET
128215,114,6,2,2,jin~api\tN
128216,114,6,3,1,wa\tCONJ
128217,114,6,3,2,{l\tDET


In [113]:
df_stem[["form","tag"]] = df_stem["form_tag"].str.split("\t", n=-1, expand=True)
df_stem.drop(["form_tag"], axis=1, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [114]:
df_stem.drop_duplicates(inplace=True)
df_stem.reset_index(drop=True, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [115]:
#df_stem.to_excel("Buckwalter_Form_Tag.xlsx", sheet_name="Form_Tag", index=False)

In [116]:
df_word = pd.DataFrame(df_stem.groupby(["num_1","num_2","num_3"])["form"].sum())
df_word.rename(columns={"form":"form_concat"}, inplace=True)
df_word.reset_index(inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [117]:
#df_word.to_excel("Buckwalter_Form_Concat.xlsx", sheet_name="Concat_Form", index=False)

#### Buckwalter Text

In [118]:
#df_word = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [119]:
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [120]:
concat_list = df_word.iloc[:,3].to_list()
#concat_list = df_word_concat.iloc[:,3].head(500).to_list()

In [121]:
buckwalter_text = " ".join(concat_list)
#buckwalter_text

In [122]:
#with open("Buckwalter_Form_Concat.txt", "w", encoding="utf8") as file:
#    file.write(buckwalter_text)

#### Arabic Text

In [123]:
#with open("Buckwalter_Form_Concat.txt", "r", encoding="utf8") as file:
#    buckwalter_text = file.read()

In [124]:
# buckwalter text clean for simple before transString
def clean_ex(text):
    arabic_text2 = re.sub(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio}`~"%-])''', "", text) # difference from clean
    return arabic_text2

In [125]:
# buckwalter text clean for simple before transString
def clean_ex2(text):
    #arabic_text2 = re.sub(r"`", "A", text)
    arabic_text2 = re.sub(r"{", "A", text)
    arabic_text3 = re.sub(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio`~"%-])''', "", arabic_text2) # difference from clean
    return arabic_text3

In [126]:
# Fail
## buckwalter text clean for simple before transString
#def clean_ex(text):
#    arabic_text2 = re.sub(r"\[", "", text)
#    arabic_text3 = re.sub(r"\]", "", arabic_text2)
#    arabic_text4 = re.sub(r'''([@#:;,.!-+%"FNKaui~o`}PJVG])''', "", arabic_text3, re.UNICODE)
#    arabic_text5 = re.sub(r"\^", "", arabic_text4) # difference from clean
#    return arabic_text5

In [127]:
# Arabic Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            "P": u"\u067E", # arabicletterpeh
            "J": u"\u0686", 
            "V": u"\u06A4", # arabicletterveh
            "G": u"\u06AF", # arabiclettergaf
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem           
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [128]:
# Simple Encoding Technique
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2unisimple = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "{": u"\u0671", # waSla         
}

def transStringSimple(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2unisimple.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [129]:
# character clean for simple after transStringSimple 
def clean(text):
    arabic_text2 = re.sub(r'''([PJVG\.:;,!\+\]\[@#FNKauio}`~"%-])''', "", text) # .$^*+
    arabic_text3 = re.sub(r"\^", " ", arabic_text2)
    return arabic_text3

In [130]:
# character clean for simple after transStringSimple 
def clean2(text):
    #arabic_text2 = re.sub(r"`", "A", text)
    arabic_text2 = re.sub(r"{", "A", text)
    arabic_text3 = re.sub(r'''([PJVG\.:;,!\+\]\[@#FNKauio`~"%-])''', "", arabic_text2) # .$^*+
    arabic_text4 = re.sub(r"\^", " ", arabic_text3)
    return arabic_text4

In [131]:
# Fail
## character clean for simple after transStringSimple 
#def clean(text):
#    arabic_text2 = re.sub(r"\[", "", text)
#    arabic_text3 = re.sub(r"\]", "", arabic_text2)
#    arabic_text4 = re.sub(r'''([@#:;,\.!-\+%"FNKaui~o`}PJVG])''', "", arabic_text3) # .$^*+
#    arabic_text5 = re.sub(r"\^", " ", arabic_text4)
#    return arabic_text5

In [132]:
test = '@SrAT# A-l*yn >nE!mt (Ely.hm gy|r A`lm^gD.wb) Pkl~JAVLGi El:yo,hm; wlA ["+Al+DAly}n,"] $ Pa`t ^wel? a%l+ya'''

In [133]:
re.sub(r'''([PJVGFNKauio@#:;,!\.\+\^\]\[}`~"%-])''', "", test, re.UNICODE) # %- must be end for accurancy 

'SrAT Al*yn >nEmt (Elyhm gy|r AlmgDwb) klAL Elyhm wlA AlDAlyn $ t wel? l+ya'

In [134]:
re.sub(r'''([PJVG\.:;,!\+@#FNKauio}`~"%-])''', "", test) # %"-

'SrAT Al*yn >nEmt (Elyhm gy|r Alm^gDwb) klAL Elyhm wlA [AlDAlyn] $ t ^wel? ly'

In [135]:
test2 = "Sira`Ta {l~a*iyna >anoEamota Ealayohimo gayori {lomagoDuwbi Ealayohimo walaA {lD~aA^l~iyna"

In [136]:
clean_ex(test2)

'SrT {l*yn >nEmt Elyhm gyr {lmgDwb Elyhm wlA {lDAlyn'

In [137]:
clean(test2)

'SrT {l*yn >nEmt Elyhm gyr {lmgDwb Elyhm wlA {lDA lyn'

In [138]:
re.sub(r'''([PJVGFNKauio@#:;,!\.\+\^\]\[}`~"%-])''', "", test2)

'SrT {l*yn >nEmt Elyhm gyr {lmgDwb Elyhm wlA {lDAlyn'

In [139]:
re.sub(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio}`~"%-])''', "", test2) 

'SrT {l*yn >nEmt Elyhm gyr {lmgDwb Elyhm wlA {lDAlyn'

In [140]:
transString(test2, 1)

'صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُوبِ عَلَيْهِمْ وَلَا ٱلضَّآلِّينَ'

In [141]:
transStringSimple(test2, 1)

'صiرa`طa ٱل~aذiينa أaنoعaمoتa عaلaيoهiمo غaيoرi ٱلoمaغoضuوبi عaلaيoهiمo وaلaا ٱلض~aا^ل~iينa'

In [142]:
arabic_func_text = transString(buckwalter_text, 1)
#arabic_func_text

In [143]:
with open("Arabic_Converted_Extended_Characters.txt", "w", encoding="utf8") as file:
    file.write(arabic_func_text)

##### DF Form Word Concat Convert Arabic

In [144]:
#df_word = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [145]:
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [146]:
df_stem["arabic_form"] = df_stem.iloc[:,4].apply(lambda x: transString(x, 1))
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag,arabic_form
0,1,1,1,1,bi,P,بِ
1,1,1,1,2,somi,N,سْمِ
2,1,1,2,1,{ll~ahi,PN,ٱللَّهِ
3,1,1,3,1,{l,DET,ٱل
4,1,1,3,2,r~aHoma`ni,ADJ,رَّحْمَٰنِ
...,...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET,ٱلْ
128215,114,6,2,2,jin~api,N,جِنَّةِ
128216,114,6,3,1,wa,CONJ,وَ
128217,114,6,3,2,{l,DET,ٱل


In [147]:
# 1 Way
df_stem["arabic_form_simple"] = df_stem.iloc[:,4].apply(lambda x: clean_ex2(x))
df_stem["arabic_form_simple"] = df_stem.iloc[:,7].apply(lambda x: transString(x, 1))
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag,arabic_form,arabic_form_simple
0,1,1,1,1,bi,P,بِ,ب
1,1,1,1,2,somi,N,سْمِ,سم
2,1,1,2,1,{ll~ahi,PN,ٱللَّهِ,الله
3,1,1,3,1,{l,DET,ٱل,ال
4,1,1,3,2,r~aHoma`ni,ADJ,رَّحْمَٰنِ,رحمن
...,...,...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET,ٱلْ,ال
128215,114,6,2,2,jin~api,N,جِنَّةِ,جنة
128216,114,6,3,1,wa,CONJ,وَ,و
128217,114,6,3,2,{l,DET,ٱل,ال


In [148]:
## 2 Way
#df_stem["arabic_form_simple"] = df_stem.iloc[:,4].apply(lambda x: transStringSimple(x, 1))
#df_stem["arabic_form_simple"] = df_stem.iloc[:,7].apply(lambda x: clean(x))
#df_stem

In [149]:
# 1 Way
df_stem["form_simple"] = df_stem.iloc[:,4].apply(lambda x: clean_ex2(x))
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag,arabic_form,arabic_form_simple,form_simple
0,1,1,1,1,bi,P,بِ,ب,b
1,1,1,1,2,somi,N,سْمِ,سم,sm
2,1,1,2,1,{ll~ahi,PN,ٱللَّهِ,الله,Allh
3,1,1,3,1,{l,DET,ٱل,ال,Al
4,1,1,3,2,r~aHoma`ni,ADJ,رَّحْمَٰنِ,رحمن,rHmn
...,...,...,...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET,ٱلْ,ال,Al
128215,114,6,2,2,jin~api,N,جِنَّةِ,جنة,jnp
128216,114,6,3,1,wa,CONJ,وَ,و,w
128217,114,6,3,2,{l,DET,ٱل,ال,Al


In [150]:
## 2 Way
##df_stem["form_simple"] = df_stem.iloc[:,7].apply(lambda x: transString(x, 0))
#df_stem["form_simple"] = df_stem.iloc[:,7].apply(lambda x: transStringSimple(x, 0))
#df_stem["form_simple"] = df_stem.iloc[:,8].apply(lambda x: clean(x))
#df_stem

In [151]:
df_stem = df_stem[["num_1","num_2","num_3","num_4","arabic_form_simple","form_simple","arabic_form","form","tag"]]
df_stem.rename(columns={"form_simple":"buckwalter_simple", "form":"buckwalter"}, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,arabic_form_simple,buckwalter_simple,arabic_form,buckwalter,tag
0,1,1,1,1,ب,b,بِ,bi,P
1,1,1,1,2,سم,sm,سْمِ,somi,N
2,1,1,2,1,الله,Allh,ٱللَّهِ,{ll~ahi,PN
3,1,1,3,1,ال,Al,ٱل,{l,DET
4,1,1,3,2,رحمن,rHmn,رَّحْمَٰنِ,r~aHoma`ni,ADJ
...,...,...,...,...,...,...,...,...,...
128214,114,6,2,1,ال,Al,ٱلْ,{lo,DET
128215,114,6,2,2,جنة,jnp,جِنَّةِ,jin~api,N
128216,114,6,3,1,و,w,وَ,wa,CONJ
128217,114,6,3,2,ال,Al,ٱل,{l,DET


In [152]:
df_stem.to_excel("Arabic_And_Simple_Form.xlsx", sheet_name="Arabic_And_Simple_Form", index=False, encoding="utf-8")

In [153]:
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [154]:
df_word["arabic"] = df_word.iloc[:,3].apply(lambda x: transString(x, 1))
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat,arabic
0,1,1,1,bisomi,بِسْمِ
1,1,1,2,{ll~ahi,ٱللَّهِ
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ
4,1,2,1,{loHamodu,ٱلْحَمْدُ
...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ
77426,114,6,1,mina,مِنَ
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ


In [155]:
# 1 Way
df_word["arabic_simple"] = df_word.iloc[:,3].apply(lambda x : clean_ex2(x))
df_word["arabic_simple"] = df_word.iloc[:,5].apply(lambda x : transString(x, 1))
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat,arabic,arabic_simple
0,1,1,1,bisomi,بِسْمِ,بسم
1,1,1,2,{ll~ahi,ٱللَّهِ,الله
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ,الرحمن
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ,الرحيم
4,1,2,1,{loHamodu,ٱلْحَمْدُ,الحمد
...,...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ,صدور
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ,الناس
77426,114,6,1,mina,مِنَ,من
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ,الجنة


In [156]:
## 2 Way
#df_word["arabic_simple"] = df_word.iloc[:,3].apply(lambda x : transStringSimple(x, 1))
#df_word["arabic_simple"] = df_word.iloc[:,5].apply(lambda x : clean(x))
#df_word

In [157]:
df_word["form_concat_simple"] = df_word.iloc[:,3].apply(lambda x: clean_ex2(x))
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat,arabic,arabic_simple,form_concat_simple
0,1,1,1,bisomi,بِسْمِ,بسم,bsm
1,1,1,2,{ll~ahi,ٱللَّهِ,الله,Allh
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ,الرحمن,AlrHmn
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ,الرحيم,AlrHym
4,1,2,1,{loHamodu,ٱلْحَمْدُ,الحمد,AlHmd
...,...,...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ,صدور,Sdwr
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ,الناس,AlnAs
77426,114,6,1,mina,مِنَ,من,mn
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ,الجنة,Aljnp


In [158]:
## 2 Way
##df_word["form_concat_simple"] = df_word.iloc[:,5].apply(lambda x: transString(x, 0))
#df_word["form_concat_simple"] = df_word.iloc[:,5].apply(lambda x: transStringSimple(x, 0))
#df_word["form_concat_simple"] = df_word.iloc[:,6].apply(lambda x: clean(x))
#df_word

In [159]:
df_word = df_word[["num_1","num_2","num_3","arabic_simple","form_concat_simple","arabic","form_concat"]]
df_word.rename(columns={"form_concat":"buckwalter","form_concat_simple":"buckwalter_simple"}, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,arabic_simple,buckwalter_simple,arabic,buckwalter
0,1,1,1,بسم,bsm,بِسْمِ,bisomi
1,1,1,2,الله,Allh,ٱللَّهِ,{ll~ahi
2,1,1,3,الرحمن,AlrHmn,ٱلرَّحْمَٰنِ,{lr~aHoma`ni
3,1,1,4,الرحيم,AlrHym,ٱلرَّحِيمِ,{lr~aHiymi
4,1,2,1,الحمد,AlHmd,ٱلْحَمْدُ,{loHamodu
...,...,...,...,...,...,...,...
77424,114,5,4,صدور,Sdwr,صُدُورِ,Suduwri
77425,114,5,5,الناس,AlnAs,ٱلنَّاسِ,{ln~aAsi
77426,114,6,1,من,mn,مِنَ,mina
77427,114,6,2,الجنة,Aljnp,ٱلْجِنَّةِ,{lojin~api


In [160]:
df_word.iloc[27,:]

num_1                    1
num_2                    7
num_3                    8
arabic_simple          ولا
buckwalter_simple      wlA
arabic               وَلَا
buckwalter           walaA
Name: 27, dtype: object

In [161]:
df_word.to_excel("Arabic_And_Simple_Word.xlsx", sheet_name="Arabic_And_Simple", index=False, encoding="utf-8")

In [162]:
df_buckwalter = df_word.iloc[:,[0,1,2,6]]
df_buckwalter

Unnamed: 0,num_1,num_2,num_3,buckwalter
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [163]:
df_buckwalter_concat = df_buckwalter.groupby(["num_1","num_2"])["buckwalter"].apply(" ".join).reset_index()
df_buckwalter_concat

Unnamed: 0,num_1,num_2,buckwalter
0,1,1,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi
1,1,2,{loHamodu lil~ahi rab~i {loEa`lamiyna
2,1,3,{lr~aHoma`ni {lr~aHiymi
3,1,4,ma`liki yawomi {ld~iyni
4,1,5,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu
...,...,...,...
6231,114,2,maliki {ln~aAsi
6232,114,3,<ila`hi {ln~aAsi
6233,114,4,min $ar~i {lowasowaAsi {loxan~aAsi
6234,114,5,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi


In [164]:
df_buckwalter_concat["arabic"] = df_buckwalter_concat.iloc[:,2].apply(lambda x: transString(x, 1)) # Convert Arabic Extended
df_buckwalter_concat

Unnamed: 0,num_1,num_2,buckwalter,arabic
0,1,1,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
1,1,2,{loHamodu lil~ahi rab~i {loEa`lamiyna,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ
2,1,3,{lr~aHoma`ni {lr~aHiymi,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
3,1,4,ma`liki yawomi {ld~iyni,مَٰلِكِ يَوْمِ ٱلدِّينِ
4,1,5,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
...,...,...,...,...
6231,114,2,maliki {ln~aAsi,مَلِكِ ٱلنَّاسِ
6232,114,3,<ila`hi {ln~aAsi,إِلَٰهِ ٱلنَّاسِ
6233,114,4,min $ar~i {lowasowaAsi {loxan~aAsi,مِن شَرِّ ٱلْوَسْوَاسِ ٱلْخَنَّاسِ
6234,114,5,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi,ٱلَّذِى يُوَسْوِسُ فِى صُدُورِ ٱلنَّاسِ


In [165]:
df_buckwalter_concat["arabic_simple"] = df_buckwalter_concat.iloc[:,2].apply(lambda x : clean_ex2(x))
df_buckwalter_concat["arabic_simple"] = df_buckwalter_concat.iloc[:,4].apply(lambda x : transString(x, 1)) # Convert Arabic Simple
df_buckwalter_concat

Unnamed: 0,num_1,num_2,buckwalter,arabic,arabic_simple
0,1,1,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,بسم الله الرحمن الرحيم
1,1,2,{loHamodu lil~ahi rab~i {loEa`lamiyna,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,الحمد لله رب العلمين
2,1,3,{lr~aHoma`ni {lr~aHiymi,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,الرحمن الرحيم
3,1,4,ma`liki yawomi {ld~iyni,مَٰلِكِ يَوْمِ ٱلدِّينِ,ملك يوم الدين
4,1,5,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,إياك نعبد وإياك نستعين
...,...,...,...,...,...
6231,114,2,maliki {ln~aAsi,مَلِكِ ٱلنَّاسِ,ملك الناس
6232,114,3,<ila`hi {ln~aAsi,إِلَٰهِ ٱلنَّاسِ,إله الناس
6233,114,4,min $ar~i {lowasowaAsi {loxan~aAsi,مِن شَرِّ ٱلْوَسْوَاسِ ٱلْخَنَّاسِ,من شر الوسواس الخناس
6234,114,5,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi,ٱلَّذِى يُوَسْوِسُ فِى صُدُورِ ٱلنَّاسِ,الذى يوسوس فى صدور الناس


In [166]:
## 2 Way
#df_buckwalter_concat["arabic_simple"] = df_buckwalter_concat.iloc[:,2].apply(lambda x : transStringSimple(x, 1)) # Convert Arabic Simple
#df_buckwalter_concat["arabic_simple"] = df_buckwalter_concat.iloc[:,4].apply(lambda x : clean(x))
#df_buckwalter_concat

In [167]:
df_buckwalter_concat = df_buckwalter_concat[["num_1","num_2","arabic","buckwalter","arabic_simple"]]
df_buckwalter_concat

Unnamed: 0,num_1,num_2,arabic,buckwalter,arabic_simple
0,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi,بسم الله الرحمن الرحيم
1,1,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,{loHamodu lil~ahi rab~i {loEa`lamiyna,الحمد لله رب العلمين
2,1,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,{lr~aHoma`ni {lr~aHiymi,الرحمن الرحيم
3,1,4,مَٰلِكِ يَوْمِ ٱلدِّينِ,ma`liki yawomi {ld~iyni,ملك يوم الدين
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu,إياك نعبد وإياك نستعين
...,...,...,...,...,...
6231,114,2,مَلِكِ ٱلنَّاسِ,maliki {ln~aAsi,ملك الناس
6232,114,3,إِلَٰهِ ٱلنَّاسِ,<ila`hi {ln~aAsi,إله الناس
6233,114,4,مِن شَرِّ ٱلْوَسْوَاسِ ٱلْخَنَّاسِ,min $ar~i {lowasowaAsi {loxan~aAsi,من شر الوسواس الخناس
6234,114,5,ٱلَّذِى يُوَسْوِسُ فِى صُدُورِ ٱلنَّاسِ,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi,الذى يوسوس فى صدور الناس


In [168]:
# 1 Way
df_buckwalter_concat["buckwalter_simple"] = df_buckwalter_concat.iloc[:,3].apply(lambda x : clean_ex2(x)) # Convert Arabic Simple
df_buckwalter_concat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_buckwalter_concat["buckwalter_simple"] = df_buckwalter_concat.iloc[:,3].apply(lambda x : clean_ex2(x)) # Convert Arabic Simple


Unnamed: 0,num_1,num_2,arabic,buckwalter,arabic_simple,buckwalter_simple
0,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi,بسم الله الرحمن الرحيم,bsm Allh AlrHmn AlrHym
1,1,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,{loHamodu lil~ahi rab~i {loEa`lamiyna,الحمد لله رب العلمين,AlHmd llh rb AlElmyn
2,1,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,{lr~aHoma`ni {lr~aHiymi,الرحمن الرحيم,AlrHmn AlrHym
3,1,4,مَٰلِكِ يَوْمِ ٱلدِّينِ,ma`liki yawomi {ld~iyni,ملك يوم الدين,mlk ywm Aldyn
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu,إياك نعبد وإياك نستعين,<yAk nEbd w<yAk nstEyn
...,...,...,...,...,...,...
6231,114,2,مَلِكِ ٱلنَّاسِ,maliki {ln~aAsi,ملك الناس,mlk AlnAs
6232,114,3,إِلَٰهِ ٱلنَّاسِ,<ila`hi {ln~aAsi,إله الناس,<lh AlnAs
6233,114,4,مِن شَرِّ ٱلْوَسْوَاسِ ٱلْخَنَّاسِ,min $ar~i {lowasowaAsi {loxan~aAsi,من شر الوسواس الخناس,mn $r AlwswAs AlxnAs
6234,114,5,ٱلَّذِى يُوَسْوِسُ فِى صُدُورِ ٱلنَّاسِ,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi,الذى يوسوس فى صدور الناس,Al*Y ywsws fY Sdwr AlnAs


In [169]:
## 2 Way
##df_buckwalter_concat["buckwalter_simple"] = df_buckwalter_concat.iloc[:,4].apply(lambda x : transString(x, 0))
#df_buckwalter_concat["buckwalter_simple"] = df_buckwalter_concat.iloc[:,4].apply(lambda x : transStringSimple(x, 0)) # Convert Arabic Simple
#df_buckwalter_concat["buckwalter_simple"] = df_buckwalter_concat.iloc[:,5].apply(lambda x : clean(x))
#df_buckwalter_concat

In [181]:
df_buckwalter_concat.to_excel("Arabic_And_Simple_Sentence.xlsx", sheet_name="Arabic_Simple_Sentence", index=False)

##### Arabic Frequency

In [None]:
#df_stem = pd.read_excel("Arabic_And_Simple_Form.xlsx")

In [None]:
df_arabic_form_count = pd.DataFrame(df_stem.iloc[:,6].value_counts(ascending=False))
df_arabic_form_count.reset_index(inplace=True)
df_arabic_form_count.rename(columns={"index":"arabic_form","arabic_form":"frequency"}, inplace=True)
df_arabic_form_count

In [None]:
#df_arabic_form_count.to_excel("Arabic_Form_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Form_Frequency", index=False)

In [None]:
df_arabic_form_simple_count = pd.DataFrame(df_stem.iloc[:,4].value_counts(ascending=False))
df_arabic_form_simple_count.reset_index(inplace=True)
df_arabic_form_simple_count.rename(columns={"index":"arabic_form_simple","arabic_form_simple":"frequency"}, inplace=True)
df_arabic_form_simple_count

In [None]:
#df_arabic_form_simple_count.to_excel("Arabic_Form_Simple_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Form_Simple_Frequency", index=False)

In [None]:
#df_word = pd.read_excel("Arabic_And_Simple_Word.xlsx")

In [None]:
df_arabic_count = pd.DataFrame(df_word.iloc[:,5].value_counts(ascending=False))
df_arabic_count.reset_index(inplace=True)
df_arabic_count.rename(columns={"index":"arabic","arabic":"frequency"}, inplace=True)
df_arabic_count

In [None]:
#df_arabic_count.to_excel("Arabic_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Frequency", index=False)

In [None]:
df_arabic_simple_count = pd.DataFrame(df_word.iloc[:,3].value_counts(ascending=False))
df_arabic_simple_count.reset_index(inplace=True)
df_arabic_simple_count.rename(columns={"index":"arabic_simple","arabic_simple":"frequency"}, inplace=True)
df_arabic_simple_count

In [None]:
#df_arabic_simple_count.to_excel("Arabic_Simple_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Simple_Frequency", index=False)

##### Arabic Simple Frequency Nav

In [None]:
df_arabic_simple_select = df_arabic_simple_count.head(1000)
df_arabic_simple_select

In [None]:
df_arabic_simple_count_merge = pd.merge(df_arabic_simple_select, df_word, how="inner", on="arabic_simple")
df_arabic_simple_count_merge

In [None]:
df_arabic_simple_count_merge = df_arabic_simple_count_merge[["num_1","num_2","num_3","arabic_simple","buckwalter_simple","frequency","arabic","buckwalter"]]
df_arabic_simple_count_merge

In [None]:
df_arabic_simple_count_merge["nav"] = df_arabic_simple_count_merge["num_1"].astype("str")+":"+df_arabic_simple_count_merge["num_2"].astype("str")+":"+df_arabic_simple_count_merge["num_3"].astype("str")
df_arabic_simple_count_merge

In [None]:
#df_arabic_simple_count_merge.to_excel("Arabic_Simple_Navigation.xlsx", sheet_name="Arabic_Simple_Navigation", index=False)

In [None]:
df_arabic_simple_nav = df_arabic_simple_count_merge.groupby(["arabic_simple","buckwalter_simple","frequency"])["nav"].apply(", ".join).reset_index().sort_values(by="frequency", ascending=False)
df_arabic_simple_nav.reset_index(drop=True, inplace=True)
df_arabic_simple_nav

In [None]:
#df_arabic_simple_nav.to_excel("Arabic_Simple_Frequency_Navigation.xlsx", sheet_name="Arabic_Simple_Frequency", index=False)

In [171]:
df_a = df_buckwalter_concat.iloc[:,[5,4]]
df_a

Unnamed: 0,buckwalter_simple,arabic_simple
0,bsm Allh AlrHmn AlrHym,بسم الله الرحمن الرحيم
1,AlHmd llh rb AlElmyn,الحمد لله رب العلمين
2,AlrHmn AlrHym,الرحمن الرحيم
3,mlk ywm Aldyn,ملك يوم الدين
4,<yAk nEbd w<yAk nstEyn,إياك نعبد وإياك نستعين
...,...,...
6231,mlk AlnAs,ملك الناس
6232,<lh AlnAs,إله الناس
6233,mn $r AlwswAs AlxnAs,من شر الوسواس الخناس
6234,Al*Y ywsws fY Sdwr AlnAs,الذى يوسوس فى صدور الناس


In [None]:
df

In [172]:
set_a = set(df_a.iloc[:,0])
set_a

{'<* njynh w>hlh >jmEyn',
 'wSdq bAlHsnY',
 "wAl*yn sEwA fY 'AytnA mEjzyn >wl}k >SHb AljHym",
 "ql >r'ytm <n jEl Allh Elykm Alyl srmdA <lY ywm Alqymp mn <lh gyr Allh y>tykm bDyA' >flA tsmEwn",
 "<* tqwl llm&mnyn >ln ykfykm >n ymdkm rbkm bvlvp 'Alf mn Alml}kp mnzlyn",
 'w<n kAnwA mn qbl >n ynzl Elyhm mn qblh lmblsyn',
 'wydxlhm Aljnp ErfhA lhm',
 'wl}n s>lthm mn xlqhm lyqwln Allh f>nY y&fkwn',
 "wAl*yn hAjrwA fY Allh mn bEd mA ZlmwA lnbw}nhm fY AldnyA Hsnp wl>jr Al'Axrp >kbr lw kAnwA yElmwn",
 'lA >qsm bh*A Albld',
 'wAlyl wmA wsq',
 'wlA tmdn Eynyk <lY mA mtEnA bh >zwjA mnhm zhrp AlHywp AldnyA lnftnhm fyh wrzq rbk xyr w>bqY',
 'fqtl kyf qdr',
 'flA tTE Almk*byn',
 'wAlqmr <*A Atsq',
 'fkfY bAllh $hydA bynnA wbynkm <n knA En EbAdtkm lgflyn',
 'wlqd k*b Al*yn mn qblhm fkyf kAn nkyr',
 "wlmA wqE Elyhm Alrjz qAlwA ymwsY AdE lnA rbk bmA Ehd Endk l}n k$ft EnA Alrjz ln&mnn lk wlnrsln mEk bnY <sr'yl",
 "y>yhA Al*yn 'AmnwA <n mn >zwjkm w>wldkm EdwA lkm fAH*rwhm w<n tEfwA wtSfHwA wtgfrwA f<n All

In [173]:
df_b = pd.read_excel("/media/kurubal/SSD/Data Scientist/Data Science/Data-Scientist/Work/Modern Ways/Arabic/ReDe/Tanzil_Simple_Buckwalter_And_Arabic.xlsx")
df_b

Unnamed: 0,buckwalter_simple,arabic_simple
0,bsm Allh AlrHmn AlrHym,بسم الله الرحمن الرحيم
1,AlHmd llh rb AlEAlmyn,الحمد لله رب العالمين
2,AlrHmn AlrHym,الرحمن الرحيم
3,mAlk ywm Aldyn,مالك يوم الدين
4,<yAk nEbd w<yAk nstEyn,إياك نعبد وإياك نستعين
...,...,...
6231,mlk AlnAs,ملك الناس
6232,<lh AlnAs,إله الناس
6233,mn $r AlwswAs AlxnAs,من شر الوسواس الخناس
6234,Al*y ywsws fy Sdwr AlnAs,الذي يوسوس في صدور الناس


In [174]:
set_b = set(df_b.iloc[:,0])
set_b

{'AdxlwA >bwAb jhnm xAldyn fyhA fb}s mvwY Almtkbryn',
 'wSdq bAlHsnY',
 'bsm Allh AlrHmn AlrHym yA >yhA Al*yn |mnwA >wfwA bAlEqwd >Hlt lkm bhymp Al>nEAm <lA mA ytlY Elykm gyr mHly AlSyd w>ntm Hrm <n Allh yHkm mA yryd',
 'qAl lA txAfA <nny mEkmA >smE w>rY',
 'w<n kAnwA mn qbl >n ynzl Elyhm mn qblh lmblsyn',
 "w<* qAlt AlmlA}kp yA mrym <n Allh ASTfAk wThrk wASTfAk ElY nsA' AlEAlmyn",
 'fyh |yAt bynAt mqAm <brAhym wmn dxlh kAn |mnA wllh ElY AlnAs Hj Albyt mn AstTAE <lyh sbylA wmn kfr f<n Allh gny En AlEAlmyn',
 'wydxlhm Aljnp ErfhA lhm',
 'wl}n s>lthm mn xlqhm lyqwln Allh f>nY y&fkwn',
 "bsm Allh AlrHmn AlrHym tbArk Al*y bydh Almlk whw ElY kl $y' qdyr",
 'fqtl kyf qdr',
 '<n Al*yn yjAdlwn fy |yAt Allh bgyr slTAn >tAhm <n fy Sdwrhm <lA kbr mA hm bbAlgyh fAstE* bAllh <nh hw AlsmyE AlbSyr',
 'flA tTE Almk*byn',
 'wAlqmr <*A Atsq',
 'qAl yA <blys mA lk >lA tkwn mE AlsAjdyn',
 "lyswA swA' mn >hl AlktAb >mp qA}mp ytlwn |yAt Allh |nA' Allyl whm ysjdwn",
 'wlqd k*b Al*yn mn qblhm fkyf kAn nkyr',


In [175]:
df_set_a = pd.DataFrame(set_a.difference(set_b))
df_set_a.rename(columns={0:"buckwalter_simple"}, inplace=True)
df_set_a

Unnamed: 0,buckwalter_simple
0,wHrm ElY qryp >hlknhA >nhm lA yrjEwn
1,<nmA AlnjwY mn Al$yTn lyHzn Al*yn 'AmnwA wlys ...
2,fATr Alsmwt wAl>rD jEl lkm mn >nfskm >zwjA wmn...
3,<* njynh w>hlh >jmEyn
4,>n lA tEbdwA <lA Allh <nY >xAf Elykm E*Ab ywm ...
...,...
4573,lw mA t>tynA bAlml}kp <n knt mn AlSdqyn
4574,mA qlt lhm <lA mA >mrtnY bh >n AEbdwA Allh rbY...
4575,A*hb >nt w>xwk b_AytY wlA tnyA fY *krY
4576,ftwlY brknh wqAl sHr >w mjnwn


In [176]:
df_a_merge = pd.merge(df_set_a, df_a, how="right", on="buckwalter_simple")
df_a_merge.drop_duplicates(inplace=True)
df_a_merge 

Unnamed: 0,buckwalter_simple,arabic_simple
0,bsm Allh AlrHmn AlrHym,بسم الله الرحمن الرحيم
1,AlHmd llh rb AlElmyn,الحمد لله رب العلمين
2,AlrHmn AlrHym,الرحمن الرحيم
3,mlk ywm Aldyn,ملك يوم الدين
4,<yAk nEbd w<yAk nstEyn,إياك نعبد وإياك نستعين
...,...,...
6231,mlk AlnAs,ملك الناس
6232,<lh AlnAs,إله الناس
6233,mn $r AlwswAs AlxnAs,من شر الوسواس الخناس
6234,Al*Y ywsws fY Sdwr AlnAs,الذى يوسوس فى صدور الناس


In [177]:
df_a_merge.to_excel("Morphology_Difference_Tanzil.xlsx", index=False)

In [178]:
df_set_b = pd.DataFrame(set_b.difference(set_a))
df_set_b.rename(columns={0:"buckwalter_simple"}, inplace=True)
df_set_b

Unnamed: 0,buckwalter_simple
0,yA >yhA Al<nsAn mA grk brbk Alkrym
1,AdxlwA >bwAb jhnm xAldyn fyhA fb}s mvwY Almtkbryn
2,bsm Allh AlrHmn AlrHym yA >yhA Al*yn |mnwA >wf...
3,f>lqwA HbAlhm wESyhm wqAlwA bEzp frEwn <nA lnH...
4,snlqy fy qlwb Al*yn kfrwA AlrEb bmA >$rkwA bAl...
...,...
4572,qAlwA fmA jzA&h <n kntm kA*byn
4573,wmn nEmrh nnksh fy Alxlq >flA yEqlwn
4574,kgly AlHmym
4575,yA bny |dm lA yftnnkm Al$yTAn kmA >xrj >bwykm ...


In [179]:
df_b_merge = pd.merge(df_set_b, df_b, how="right", on="buckwalter_simple")
df_b_merge.drop_duplicates(inplace=True)
df_b_merge 

Unnamed: 0,buckwalter_simple,arabic_simple
0,bsm Allh AlrHmn AlrHym,بسم الله الرحمن الرحيم
1,AlHmd llh rb AlEAlmyn,الحمد لله رب العالمين
2,AlrHmn AlrHym,الرحمن الرحيم
3,mAlk ywm Aldyn,مالك يوم الدين
4,<yAk nEbd w<yAk nstEyn,إياك نعبد وإياك نستعين
...,...,...
6231,mlk AlnAs,ملك الناس
6232,<lh AlnAs,إله الناس
6233,mn $r AlwswAs AlxnAs,من شر الوسواس الخناس
6234,Al*y ywsws fy Sdwr AlnAs,الذي يوسوس في صدور الناس


In [180]:
df_b_merge.to_excel("Tanzil_Difference_Morphology.xlsx", index=False)