### Quaran Form Analysis

In [26]:
#!pip install lang-trans

In [1]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [2]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quranic Corpus Morphology.txt", "r", encoding="utf8") as file:
    text = file.read()

#### Form Tag

In [3]:
form_tag_list = re.findall("[\(\d*:\d*:\d*:\d*\)][\t](\D*)[\t][\D*]", text)

In [4]:
form_tag_num = re.findall("(\d*:\d*:\d*:\d*)", text)

In [5]:
df_stem = pd.DataFrame(form_tag_num)
df_stem.rename(columns={0:"loc_num"}, inplace=True)
df_stem

Unnamed: 0,loc_num
0,1:1:1:1
1,1:1:1:2
2,1:1:2:1
3,1:1:3:1
4,1:1:3:2
...,...
128214,114:6:2:1
128215,114:6:2:2
128216,114:6:3:1
128217,114:6:3:2


In [6]:
df_stem = df_stem["loc_num"].str.split(":", n=-1, expand=True)
df_stem.rename(columns={0:"num_1", 1:"num_2", 2:"num_3", 3:"num_4"}, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4
0,1,1,1,1
1,1,1,1,2
2,1,1,2,1
3,1,1,3,1
4,1,1,3,2
...,...,...,...,...
128214,114,6,2,1
128215,114,6,2,2
128216,114,6,3,1
128217,114,6,3,2


In [7]:
df_stem['num_1'] = df_stem['num_1'].astype(int)
df_stem['num_2'] = df_stem['num_2'].astype(int)
df_stem['num_3'] = df_stem['num_3'].astype(int)
df_stem['num_4'] = df_stem['num_4'].astype(int)

In [8]:
df_stem["form_tag"] = pd.DataFrame(form_tag_list)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag
0,1,1,1,1,bi\tP
1,1,1,1,2,somi\tN
2,1,1,2,1,{ll~ahi\tPN
3,1,1,3,1,{l\tDET
4,1,1,3,2,r~aHoma`ni\tADJ
...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET
128215,114,6,2,2,jin~api\tN
128216,114,6,3,1,wa\tCONJ
128217,114,6,3,2,{l\tDET


In [9]:
df_stem[["form","tag"]] = df_stem["form_tag"].str.split("\t", n=-1, expand=True)
df_stem.drop(["form_tag"], axis=1, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [10]:
df_stem.drop_duplicates(inplace=True)
df_stem.reset_index(drop=True, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [11]:
#df_stem.to_excel("Buckwalter_Form_Tag.xlsx", sheet_name="Form_Tag", index=False)

In [12]:
df_word = pd.DataFrame(df_stem.groupby(["num_1","num_2","num_3"])["form"].sum())
df_word.rename(columns={"form":"form_concat"}, inplace=True)
df_word.reset_index(inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [13]:
#df_word.to_excel("Buckwalter_Form_Concat.xlsx", sheet_name="Concat_Form", index=False)

#### Buckwalter Text

In [14]:
#df_word = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [15]:
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [16]:
concat_list = df_word.iloc[:,3].to_list()
#concat_list = df_word_concat.iloc[:,3].head(500).to_list()

In [17]:
buckwalter_text = " ".join(concat_list)
#buckwalter_text

In [18]:
#with open("Buckwalter_Form_Concat.txt", "w", encoding="utf8") as file:
#    file.write(buckwalter_text)

#### Arabic Text

In [19]:
#with open("Buckwalter_Form_Concat.txt", "r", encoding="utf8") as file:
#    buckwalter_text = file.read()

In [20]:
# Arabic Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            "P": u"\u067E", # arabicletterpeh
            "J": u"\u0686", 
            "V": u"\u06A4", # arabicletterveh
            "G": u"\u06AF", # arabiclettergaf
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem           
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [21]:
# Simple Encoding Technique
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2unisimple = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "{": u"\u0671", # waSla         
}

def transStringSimple(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2unisimple.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [22]:
def clean(text):
    arabic_text2 = re.sub(r"\[", "", text)
    arabic_text3 = re.sub(r"\]", "", arabic_text2)
    arabic_text4 = re.sub(r'''([@#:;,.!-+%"FNKaui~o`}PJVG])''', "", arabic_text3)
    arabic_text5 = re.sub(r"\^", " ", arabic_text4)
    return arabic_text5

In [23]:
arabic_func_text = transString(buckwalter_text, 1)
#arabic_func_text

In [24]:
with open("Arabic_Converted_Extended_Characters.txt", "w", encoding="utf8") as file:
    file.write(arabic_func_text)

##### DF Word Concat Form Convert Arabic

In [25]:
#df_word = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [26]:
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [27]:
df_word["arabic"] = df_word.iloc[:,3].apply(lambda x: transString(x, 1))
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat,arabic
0,1,1,1,bisomi,بِسْمِ
1,1,1,2,{ll~ahi,ٱللَّهِ
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ
4,1,2,1,{loHamodu,ٱلْحَمْدُ
...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ
77426,114,6,1,mina,مِنَ
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ


In [28]:
df_word["arabic_simple"] = df_word.iloc[:,3].apply(lambda x : transStringSimple(x, 1))
df_word["arabic_simple"] = df_word.iloc[:,5].apply(lambda x : clean(x))
df_word

Unnamed: 0,num_1,num_2,num_3,form_concat,arabic,arabic_simple
0,1,1,1,bisomi,بِسْمِ,بسم
1,1,1,2,{ll~ahi,ٱللَّهِ,ٱلله
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ,ٱلرحمن
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ,ٱلرحيم
4,1,2,1,{loHamodu,ٱلْحَمْدُ,ٱلحمد
...,...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ,صدور
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ,ٱلناس
77426,114,6,1,mina,مِنَ,من
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ,ٱلجنة


In [29]:
df_word = df_word[["num_1","num_2","num_3","arabic_simple","arabic","form_concat"]]
df_word.rename(columns={"form":"buckwalter"}, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,arabic_simple,arabic,form_concat
0,1,1,1,بسم,بِسْمِ,bisomi
1,1,1,2,ٱلله,ٱللَّهِ,{ll~ahi
2,1,1,3,ٱلرحمن,ٱلرَّحْمَٰنِ,{lr~aHoma`ni
3,1,1,4,ٱلرحيم,ٱلرَّحِيمِ,{lr~aHiymi
4,1,2,1,ٱلحمد,ٱلْحَمْدُ,{loHamodu
...,...,...,...,...,...,...
77424,114,5,4,صدور,صُدُورِ,Suduwri
77425,114,5,5,ٱلناس,ٱلنَّاسِ,{ln~aAsi
77426,114,6,1,من,مِنَ,mina
77427,114,6,2,ٱلجنة,ٱلْجِنَّةِ,{lojin~api


In [30]:
df_word.iloc[27,:]

num_1                1
num_2                7
num_3                8
arabic_simple      ولا
arabic           وَلَا
form_concat      walaA
Name: 27, dtype: object

In [31]:
df_word.to_excel("Arabic_And_Simple_Word.xlsx", sheet_name="Arabic_And_Simple", index=False, encoding="utf-8")

##### Arabic Frequency

In [32]:
#df_word = pd.read_excel("Arabic_And_Simple_Word.xlsx")

In [33]:
df_arabic_count = pd.DataFrame(df_word.iloc[:,4].value_counts(ascending=False))
df_arabic_count.reset_index(inplace=True)
df_arabic_count.rename(columns={"index":"arabic","arabic":"frequency"}, inplace=True)
df_arabic_count

Unnamed: 0,arabic,frequency
0,فِى,1098
1,ٱللَّهِ,828
2,ٱلَّذِينَ,810
3,ٱللَّهُ,733
4,مِن,728
...,...,...
18988,يُنكِرُ,1
18989,أُشْرِكَ,1
18990,مَـَٔابِ,1
18991,وَذُرِّيَّةً,1


In [34]:
#df_arabic_count.to_excel("Arabic_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Frequency", index=False)

In [35]:
df_arabic_simple_count = pd.DataFrame(df_word.iloc[:,3].value_counts(ascending=False))
df_arabic_simple_count.reset_index(inplace=True)
df_arabic_simple_count.rename(columns={"index":"arabic_simple","arabic_simple":"frequency"}, inplace=True)
df_arabic_simple_count

Unnamed: 0,arabic_simple,frequency
0,من,2763
1,ٱلله,2153
2,فى,1099
3,إن,966
4,ما,873
...,...,...
15280,ٱلفتية,1
15281,وهي,1
15282,فضربنا,1
15283,حولك,1


In [None]:
#df_arabic_simple_count.to_excel("Arabic_Simple_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Simple_Frequency", index=False)