### Quaran Form Analysis

In [1]:
#!pip install lang-trans

In [1]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [2]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quranic Corpus Morphology.txt", "r", encoding="utf8") as file:
    text = file.read()

#### Form Tag

In [3]:
form_tag_list = re.findall("[\(\d*:\d*:\d*:\d*\)][\t](\D*)[\t][\D*]", text)

In [4]:
form_tag_num = re.findall("(\d*:\d*:\d*:\d*)", text)

In [5]:
df_word = pd.DataFrame(form_tag_num)
df_word.rename(columns={0:"loc_num"}, inplace=True)
df_word

Unnamed: 0,loc_num
0,1:1:1:1
1,1:1:1:2
2,1:1:2:1
3,1:1:3:1
4,1:1:3:2
...,...
128214,114:6:2:1
128215,114:6:2:2
128216,114:6:3:1
128217,114:6:3:2


In [6]:
df_word = df_word["loc_num"].str.split(":", n=-1, expand=True)
df_word.rename(columns={0:"num_1", 1:"num_2", 2:"num_3", 3:"num_4"}, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4
0,1,1,1,1
1,1,1,1,2
2,1,1,2,1
3,1,1,3,1
4,1,1,3,2
...,...,...,...,...
128214,114,6,2,1
128215,114,6,2,2
128216,114,6,3,1
128217,114,6,3,2


In [7]:
df_word["form_tag"] = pd.DataFrame(form_tag_list)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag
0,1,1,1,1,bi\tP
1,1,1,1,2,somi\tN
2,1,1,2,1,{ll~ahi\tPN
3,1,1,3,1,{l\tDET
4,1,1,3,2,r~aHoma`ni\tADJ
...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET
128215,114,6,2,2,jin~api\tN
128216,114,6,3,1,wa\tCONJ
128217,114,6,3,2,{l\tDET


In [8]:
df_word[["form","tag"]] = df_word["form_tag"].str.split("\t", n=-1, expand=True)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag,form,tag
0,1,1,1,1,bi\tP,bi,P
1,1,1,1,2,somi\tN,somi,N
2,1,1,2,1,{ll~ahi\tPN,{ll~ahi,PN
3,1,1,3,1,{l\tDET,{l,DET
4,1,1,3,2,r~aHoma`ni\tADJ,r~aHoma`ni,ADJ
...,...,...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET,{lo,DET
128215,114,6,2,2,jin~api\tN,jin~api,N
128216,114,6,3,1,wa\tCONJ,wa,CONJ
128217,114,6,3,2,{l\tDET,{l,DET


In [9]:
df_word.drop(["form_tag"], axis=1, inplace=True)

In [10]:
df_word.drop_duplicates(inplace=True)
df_word.reset_index(drop=True, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [11]:
#df_word.to_excel("Buckwalter_Form_Tag.xlsx", sheet_name="Form_Tag", index=False)

In [12]:
df_word_concat = pd.DataFrame(df_word.groupby(["num_1","num_2","num_3"])["form"].sum())
df_word_concat.reset_index(inplace=True)
df_word_concat

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,99,8,2,yaEomalo
77425,99,8,3,mivoqaAla
77426,99,8,4,*ar~apK
77427,99,8,5,$ar~FA


In [13]:
#df_word_concat.to_excel("Buckwalter_Form_Concat.xlsx", sheet_name="Concat_Form", index=False)

#### Buckwalter Text

In [14]:
#df_word_concat = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [15]:
df_word_concat

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,99,8,2,yaEomalo
77425,99,8,3,mivoqaAla
77426,99,8,4,*ar~apK
77427,99,8,5,$ar~FA


In [75]:
#concat_list = df_word_concat.iloc[:,3].to_list()

In [16]:
concat_list = df_word_concat.iloc[:,3].head(500).to_list()
#concat_list = df_word_concat.iloc[:,3].to_list()

In [44]:
#concat_list

In [17]:
buckwalter_text = " ".join(concat_list)
#buckwalter_text

#### Arabic Text

In [78]:
# 1 Way camel_tools.utils.charmap
bw2ar = CharMapper.builtin_mapper('bw2ar')

In [51]:
#bw2ar = CharMapper.builtin_mapper('safebw2ar')

In [41]:
#bw2ar = CharMapper.builtin_mapper('hsb2ar')

In [79]:
arabic_text = bw2ar(buckwalter_text)

In [80]:
#arabic_text

In [60]:
with open("Arabic.txt", "w", encoding="utf8") as file:
    file.write(arabic_text)

In [36]:
# 2 Way lang_trans.arabic
arabic_text2 = buckwalter.untransliterate(arabic_text)

In [37]:
arabic_text2

'بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ مَٰلِكِ يَوْمِ ٱلدِّينِ إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُوبِ عَلَيْهِمْ وَلَا ٱلضَّا^لِّينَ ال^ر تِلْكَ ءَايَٰتُ ٱلْكِتَٰبِ ٱلْحَكِيمِ دَعْوَىٰهُمْ أَنِ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ فِيهَا سُبْحَٰنَكَ ٱللَّهُمَّ وَتَحِيَّتُهُمْ فِيهَا سَلَٰمٌ وَءَاخِرُ دَعْوَىٰهُمْ وَمَا ٱلرِّجْسَ عَلَى ٱلَّذِينَ لَا يَعْقِلُونَ كَانَ لِنَفْسٍ أَن تُؤْمِنَ إِلَّا بِإِذْنِ ٱللَّهِ وَيَجْعَلُ قُلِ وَٱلنُّذُرُ عَن قَوْمٍ لَّا يُؤْمِنُونَ ٱنظُرُوا@ مَاذَا فِى ٱلسَّمَٰوَٰتِ وَٱلْأَرْضِ وَمَا تُغْنِى ٱلْءَايَٰتُ فَهَلْ قُلْ فَٱنتَظِرُو^ا@ إِنِّى مَعَكُم مِّنَ ٱلْمُنتَظِرِينَ يَنتَظِرُونَ إِلَّا مِثْلَ أَيَّامِ ٱلَّذِينَ خَلَوْا@ مِن قَبْلِهِمْ ثُمَّ ٱلْمُؤْمِنِينَ نُنَجِّى رُسُلَنَا وَٱلَّذِينَ ءَامَنُوا@ كَذَٰلِكَ حَقًّا عَلَيْنَا نُنجِ قُلْ فَلَا^ أَعْبُدُ ٱلَّذِينَ تَعْبُدُونَ مِن دُونِ ٱللَّهِ وَلَ

In [21]:
tex = "{lD~aA^l~iyna Al^r tiloka \'aAya`tu {lokita`bi {loHakiymi daEowaY`humo >ani"

In [22]:
bw2ar(tex)

'ٱلضَّا^لِّينَ ال^ر تِلْكَ ءَايَٰتُ ٱلْكِتَٰبِ ٱلْحَكِيمِ دَعْوَىٰهُمْ أَنِ'

In [23]:
# 1-Way Encoding
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [24]:
print(transString(u'مرحبا'))

mrHbA


In [25]:
print(transString('mrHbA', 1))

مرحبا


In [26]:
transString("{lD~aA^l~iyna Al^r tiloka \'aAya`tu {lokita`bi {loHakiymi daEowaY`humo >ani",1)

'ٱلضَّا^لِّينَ ال^ر تِلْكَ ءَايَٰتُ ٱلْكِتَٰبِ ٱلْحَكِيمِ دَعْوَىٰهُمْ أَنِ'

In [22]:
# 2-Way Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem
}

def transString2(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [23]:
test = transString2(buckwalter_text, 1)
test

'بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ مَٰلِكِ يَوْمِ ٱلدِّينِ إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُوبِ عَلَيْهِمْ وَلَا ٱلضَّآلِّينَ الٓر تِلْكَ ءَايَٰتُ ٱلْكِتَٰبِ ٱلْحَكِيمِ دَعْوَىٰهُمْ أَنِ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ فِيهَا سُبْحَٰنَكَ ٱللَّهُمَّ وَتَحِيَّتُهُمْ فِيهَا سَلَٰمٌ وَءَاخِرُ دَعْوَىٰهُمْ وَمَا ٱلرِّجْسَ عَلَى ٱلَّذِينَ لَا يَعْقِلُونَ كَانَ لِنَفْسٍ أَن تُؤْمِنَ إِلَّا بِإِذْنِ ٱللَّهِ وَيَجْعَلُ قُلِ وَٱلنُّذُرُ عَن قَوْمٍ لَّا يُؤْمِنُونَ ٱنظُرُوا۟ مَاذَا فِى ٱلسَّمَٰوَٰتِ وَٱلْأَرْضِ وَمَا تُغْنِى ٱلْءَايَٰتُ فَهَلْ قُلْ فَٱنتَظِرُوٓا۟ إِنِّى مَعَكُم مِّنَ ٱلْمُنتَظِرِينَ يَنتَظِرُونَ إِلَّا مِثْلَ أَيَّامِ ٱلَّذِينَ خَلَوْا۟ مِن قَبْلِهِمْ ثُمَّ ٱلْمُؤْمِنِينَ نُنَجِّى رُسُلَنَا وَٱلَّذِينَ ءَامَنُوا۟ كَذَٰلِكَ حَقًّا عَلَيْنَا نُنجِ قُلْ فَلَآ أَعْبُدُ ٱلَّذِينَ تَعْبُدُونَ مِن دُونِ ٱللَّهِ وَلَ

In [25]:
with open("Arabic2.txt", "w", encoding="utf8") as file:
    file.write(test)