### Quaran Form Analysis

In [1]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper

In [2]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quranic Corpus Morphology.txt", "r", encoding="utf8") as file:
    text = file.read()

#### Form Tag

In [3]:
form_tag_list = re.findall("[\(\d*:\d*:\d*:\d*\)][\t](\D*)[\t][\D*]", text)

In [4]:
form_tag_num = re.findall("(\d*:\d*:\d*:\d*)", text)

In [5]:
df_word = pd.DataFrame(form_tag_num)
df_word.rename(columns={0:"loc_num"}, inplace=True)
df_word

Unnamed: 0,loc_num
0,1:1:1:1
1,1:1:1:2
2,1:1:2:1
3,1:1:3:1
4,1:1:3:2
...,...
128214,114:6:2:1
128215,114:6:2:2
128216,114:6:3:1
128217,114:6:3:2


In [6]:
df_word = df_word["loc_num"].str.split(":", n=-1, expand=True)
df_word.rename(columns={0:"num_1", 1:"num_2", 2:"num_3", 3:"num_4"}, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4
0,1,1,1,1
1,1,1,1,2
2,1,1,2,1
3,1,1,3,1
4,1,1,3,2
...,...,...,...,...
128214,114,6,2,1
128215,114,6,2,2
128216,114,6,3,1
128217,114,6,3,2


In [7]:
df_word["form_tag"] = pd.DataFrame(form_tag_list)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag
0,1,1,1,1,bi\tP
1,1,1,1,2,somi\tN
2,1,1,2,1,{ll~ahi\tPN
3,1,1,3,1,{l\tDET
4,1,1,3,2,r~aHoma`ni\tADJ
...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET
128215,114,6,2,2,jin~api\tN
128216,114,6,3,1,wa\tCONJ
128217,114,6,3,2,{l\tDET


In [8]:
df_word[["form","tag"]] = df_word["form_tag"].str.split("\t", n=-1, expand=True)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag,form,tag
0,1,1,1,1,bi\tP,bi,P
1,1,1,1,2,somi\tN,somi,N
2,1,1,2,1,{ll~ahi\tPN,{ll~ahi,PN
3,1,1,3,1,{l\tDET,{l,DET
4,1,1,3,2,r~aHoma`ni\tADJ,r~aHoma`ni,ADJ
...,...,...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET,{lo,DET
128215,114,6,2,2,jin~api\tN,jin~api,N
128216,114,6,3,1,wa\tCONJ,wa,CONJ
128217,114,6,3,2,{l\tDET,{l,DET


In [9]:
df_word.drop(["form_tag"], axis=1, inplace=True)

In [10]:
df_word.drop_duplicates(inplace=True)
df_word.reset_index(drop=True, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [11]:
df_word_concat = pd.DataFrame(df_word.groupby(["num_1","num_2","num_3"])["form"].sum())
df_word_concat.reset_index(inplace=True)
df_word_concat

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,99,8,2,yaEomalo
77425,99,8,3,mivoqaAla
77426,99,8,4,*ar~apK
77427,99,8,5,$ar~FA


In [None]:
#df_word_concat.to_excel("Buckwalter_Form_Concat.xlsx", sheet_name="Concat_Form", index=False)

#### Buckwalter Text

In [None]:
df_word_concat = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [20]:
df_word_concat

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,99,8,2,yaEomalo
77425,99,8,3,mivoqaAla
77426,99,8,4,*ar~apK
77427,99,8,5,$ar~FA


In [21]:
concat_list = df_word_concat.iloc[:,3].to_list()

In [23]:
#concat_list

In [26]:
buckwalter_text = " ".join(concat_list)
#buckwalter_text

#### Arabic Text

In [None]:
bw2ar = CharMapper.builtin_mapper('bw2ar')

In [None]:
arabic_text = bw2ar(buckwalter_text)