### Quaran Tanzil Analysis

In [74]:
import pandas as pd
import numpy as np
import re
import camel_tools.tokenizers.word
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [75]:
# buckwalter text clean for simple before transString
def clean_ex(text):
    buckwalter_out = re.sub(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio`~"%-])''', "", text) # difference from clean
    return buckwalter_out

In [76]:
# Arabic Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            "P": u"\u067E", # arabicletterpeh
            "J": u"\u0686", 
            "V": u"\u06A4", # arabicletterveh
            "G": u"\u06AF", # arabiclettergaf
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem           
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [100]:
type_file = "plain" # plain, minimal, clean

In [101]:
#type_file.lower().capitalize()

In [102]:
with open(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quran Simple {type_file.lower().capitalize()}.txt", "r", encoding="utf8") as file: # Quran Simple Clean.txt from Tanzil website
    arabic_text_simple =file.readlines()

In [103]:
df_tanzil_sent = pd.DataFrame(arabic_text_simple)
#df_tanzil_sent = df_tanzil_sent.iloc[0:6236,]
df_tanzil_sent

Unnamed: 0,0
0,1|1|بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ\n
1,1|2|الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ\n
2,1|3|الرَّحْمَٰنِ الرَّحِيمِ\n
3,1|4|مَالِكِ يَوْمِ الدِّينِ\n
4,1|5|إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ\n
...,...
6231,114|2|مَلِكِ النَّاسِ\n
6232,114|3|إِلَٰهِ النَّاسِ\n
6233,114|4|مِنْ شَرِّ الْوَسْوَاسِ الْخَنَّاسِ\n
6234,114|5|الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ\n


In [104]:
df_tanzil_sent = pd.DataFrame(df_tanzil_sent.iloc[:,0].apply(lambda x: x.strip("\n")))
df_tanzil_sent

Unnamed: 0,0
0,1|1|بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1,1|2|الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,1|3|الرَّحْمَٰنِ الرَّحِيمِ
3,1|4|مَالِكِ يَوْمِ الدِّينِ
4,1|5|إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
...,...
6231,114|2|مَلِكِ النَّاسِ
6232,114|3|إِلَٰهِ النَّاسِ
6233,114|4|مِنْ شَرِّ الْوَسْوَاسِ الْخَنَّاسِ
6234,114|5|الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ


In [105]:
df_tanzil_sent_nav = df_tanzil_sent.iloc[:,0].str.split("|", n=-1, expand=True)
df_tanzil_sent_nav.rename(columns={0:"num_1",1:"num_2",2:f"arabic_sent_tanzil_{type_file.lower()}"}, inplace=True)
df_tanzil_sent_nav

Unnamed: 0,num_1,num_2,arabic_sent_tanzil_plain
0,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,1,3,الرَّحْمَٰنِ الرَّحِيمِ
3,1,4,مَالِكِ يَوْمِ الدِّينِ
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
...,...,...,...
6231,114,2,مَلِكِ النَّاسِ
6232,114,3,إِلَٰهِ النَّاسِ
6233,114,4,مِنْ شَرِّ الْوَسْوَاسِ الْخَنَّاسِ
6234,114,5,الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ


In [106]:
df_tanzil_sent_nav[f"buckwalter_sent_tanzil_{type_file.lower()}"] = df_tanzil_sent_nav.iloc[:,2].apply(lambda x: transString(x, 0))
df_tanzil_sent_nav

Unnamed: 0,num_1,num_2,arabic_sent_tanzil_plain,buckwalter_sent_tanzil_plain
0,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ,bisomi All~ahi Alr~aHoma`ni Alr~aHiymi
1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,AloHamodu lil~ahi rab~i AloEaAlamiyna
2,1,3,الرَّحْمَٰنِ الرَّحِيمِ,Alr~aHoma`ni Alr~aHiymi
3,1,4,مَالِكِ يَوْمِ الدِّينِ,maAliki yawomi Ald~iyni
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu
...,...,...,...,...
6231,114,2,مَلِكِ النَّاسِ,maliki Aln~aAsi
6232,114,3,إِلَٰهِ النَّاسِ,<ila`hi Aln~aAsi
6233,114,4,مِنْ شَرِّ الْوَسْوَاسِ الْخَنَّاسِ,mino $ar~i AlowasowaAsi Aloxan~aAsi
6234,114,5,الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ,Al~a*iy yuwasowisu fiy Suduwri Aln~aAsi


In [107]:
#df_tanzil_sent_nav.to_excel(f"Arabic_Buckwalter_Sent_Tanzil_{type_file.lower().capitalize()}.xlsx", sheet_name=f"Arabic_Buckwalter_{type_file.lower().capitalize()}", index=False, encoding="utf-8")

In [108]:
# Word Navigation
nav_list = []
for i in range(len(df_tanzil_sent_nav)):
    num_1 = df_tanzil_sent_nav.iloc[i,0]
    num_2 = df_tanzil_sent_nav.iloc[i,1]
    token = camel_tools.tokenizers.word.simple_word_tokenize(df_tanzil_sent_nav.iloc[i,2])
    for j in range(len(token)):
        num_3 = j+1
        word = token[j]
        nav_list.append([num_1,num_2,num_3,word]) 

In [109]:
df_tanzil_word_nav = pd.DataFrame(nav_list)
df_tanzil_word_nav.rename(columns={0:"num_1",1:"num_2",2:"num_3",3:f"arabic_word_tanzil_{type_file.lower()}"}, inplace=True)
df_tanzil_word_nav

Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_plain
0,1,1,1,بِسْمِ
1,1,1,2,اللَّهِ
2,1,1,3,الرَّحْمَٰنِ
3,1,1,4,الرَّحِيمِ
4,1,2,1,الْحَمْدُ
...,...,...,...,...
78243,114,5,4,صُدُورِ
78244,114,5,5,النَّاسِ
78245,114,6,1,مِنَ
78246,114,6,2,الْجِنَّةِ


In [110]:
df_tanzil_word_nav[f"buckwalter_word_tanzil_{type_file.lower()}"] = df_tanzil_word_nav.iloc[:,3].apply(lambda x: transString(x, 0))
df_tanzil_word_nav

Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_plain,buckwalter_word_tanzil_plain
0,1,1,1,بِسْمِ,bisomi
1,1,1,2,اللَّهِ,All~ahi
2,1,1,3,الرَّحْمَٰنِ,Alr~aHoma`ni
3,1,1,4,الرَّحِيمِ,Alr~aHiymi
4,1,2,1,الْحَمْدُ,AloHamodu
...,...,...,...,...,...
78243,114,5,4,صُدُورِ,Suduwri
78244,114,5,5,النَّاسِ,Aln~aAsi
78245,114,6,1,مِنَ,mina
78246,114,6,2,الْجِنَّةِ,Alojin~api


In [111]:
df_tanzil_word_nav.to_excel(f"Arabic_Buckwalter_Word_Tanzil_{type_file.lower().capitalize()}.xlsx", sheet_name=f"Arabic_Buckwalter_{type_file.lower().capitalize()}", index=False, encoding="utf-8")

#### Frequency

In [89]:
df_tanzil_word_freq = pd.DataFrame(df_tanzil_word_nav.iloc[:,3].value_counts(ascending=False))
df_tanzil_word_freq = df_tanzil_word_freq.reset_index()
df_tanzil_word_freq.rename(columns={"index":f"arabic_word_tanzil_{type_file.lower()}",f"arabic_word_tanzil_{type_file.lower()}":"frequency"}, inplace=True)
df_tanzil_word_freq

Unnamed: 0,arabic_word_tanzil_clean,frequency
0,من,2763
1,الله,2265
2,في,1185
3,ما,1013
4,إن,966
...,...,...
14863,مرفقا,1
14864,ويهيئ,1
14865,ينشر,1
14866,فأووا,1


In [90]:
#df_tanzil_word_freq.frequency.sum()

In [91]:
df_tanzil_word_freq[f"buckwalter_word_tanzil_{type_file.lower()}"] = df_tanzil_word_freq.iloc[:,0].apply(lambda x: transString(x, 0))
df_tanzil_word_freq = df_tanzil_word_freq.iloc[:,[0,2,1]]
df_tanzil_word_freq

Unnamed: 0,arabic_word_tanzil_clean,buckwalter_word_tanzil_clean,frequency
0,من,mn,2763
1,الله,Allh,2265
2,في,fy,1185
3,ما,mA,1013
4,إن,<n,966
...,...,...,...
14863,مرفقا,mrfqA,1
14864,ويهيئ,wyhy},1
14865,ينشر,yn$r,1
14866,فأووا,f>wwA,1


In [92]:
select_num = 100

In [93]:
df_tanzil_word_select = df_tanzil_word_freq.head(select_num)
df_tanzil_word_select

Unnamed: 0,arabic_word_tanzil_clean,buckwalter_word_tanzil_clean,frequency
0,من,mn,2763
1,الله,Allh,2265
2,في,fy,1185
3,ما,mA,1013
4,إن,<n,966
...,...,...,...
95,عليم,Elym,106
96,ربنا,rbnA,106
97,ربكم,rbkm,102
98,النار,AlnAr,102


In [94]:
df_tanzil_word_select.frequency.sum()

29643

In [20]:
#df_tanzil_word_select.to_excel(f"Arabic_Buckwalter_{select_num}_Word_Frequency_Tanzil_{type_file.lower().capitalize()}.xlsx", sheet_name="Arabic_Buckwalter_Frequency", index=False)

In [95]:
df_tanzil_word_select_merge_nav = pd.merge(df_tanzil_word_select, df_tanzil_word_nav, how="inner", on=f"arabic_word_tanzil_{type_file.lower()}")
df_tanzil_word_select_merge_nav

Unnamed: 0,arabic_word_tanzil_clean,buckwalter_word_tanzil_clean_x,frequency,num_1,num_2,num_3,buckwalter_word_tanzil_clean_y
0,من,mn,2763,2,4,8,mn
1,من,mn,2763,2,5,4,mn
2,من,mn,2763,2,8,3,mn
3,من,mn,2763,2,19,3,mn
4,من,mn,2763,2,19,13,mn
...,...,...,...,...,...,...,...
29638,فلما,flmA,101,61,6,26,flmA
29639,فلما,flmA,101,66,3,8,flmA
29640,فلما,flmA,101,66,3,19,flmA
29641,فلما,flmA,101,67,27,1,flmA


In [96]:
df_tanzil_word_select_merge_nav = df_tanzil_word_select_merge_nav[["num_1","num_2","num_3",f"arabic_word_tanzil_{type_file.lower()}",f"buckwalter_word_tanzil_{type_file.lower()}_x","frequency"]]
df_tanzil_word_select_merge_nav.rename(columns={f"buckwalter_word_tanzil_{type_file.lower()}_x":f"buckwalter_word_tanzil_{type_file.lower()}"}, inplace=True)
df_tanzil_word_select_merge_nav

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_clean,buckwalter_word_tanzil_clean,frequency
0,2,4,8,من,mn,2763
1,2,5,4,من,mn,2763
2,2,8,3,من,mn,2763
3,2,19,3,من,mn,2763
4,2,19,13,من,mn,2763
...,...,...,...,...,...,...
29638,61,6,26,فلما,flmA,101
29639,66,3,8,فلما,flmA,101
29640,66,3,19,فلما,flmA,101
29641,67,27,1,فلما,flmA,101


In [97]:
df_tanzil_word_select_merge_nav["nav"] = df_tanzil_word_select_merge_nav["num_1"].astype("str")+":"+df_tanzil_word_select_merge_nav["num_2"].astype("str")+":"+df_tanzil_word_select_merge_nav["num_3"].astype("str")
df_tanzil_word_select_merge_nav

Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_clean,buckwalter_word_tanzil_clean,frequency,nav
0,2,4,8,من,mn,2763,2:4:8
1,2,5,4,من,mn,2763,2:5:4
2,2,8,3,من,mn,2763,2:8:3
3,2,19,3,من,mn,2763,2:19:3
4,2,19,13,من,mn,2763,2:19:13
...,...,...,...,...,...,...,...
29638,61,6,26,فلما,flmA,101,61:6:26
29639,66,3,8,فلما,flmA,101,66:3:8
29640,66,3,19,فلما,flmA,101,66:3:19
29641,67,27,1,فلما,flmA,101,67:27:1


In [24]:
#df_tanzil_word_select_merge_nav.to_excel(f"Arabic_Buckwalter_{type_file.lower().capitalize()}_Navigation_Tanzil.xlsx", sheet_name=f"Arabic_Buckwalter_{type_file.lower().capitalize()}_Navigation", index=False)

In [98]:
df_tanzil_word_simple_nav = df_tanzil_word_select_merge_nav.groupby([f"arabic_word_tanzil_{type_file.lower()}",f"buckwalter_word_tanzil_{type_file.lower()}","frequency"])["nav"].apply(", ".join).reset_index().sort_values(by="frequency", ascending=False)
df_tanzil_word_simple_nav.reset_index(drop=True, inplace=True)
df_tanzil_word_simple_nav

Unnamed: 0,arabic_word_tanzil_clean,buckwalter_word_tanzil_clean,frequency,nav
0,من,mn,2763,"2:4:8, 2:5:4, 2:8:3, 2:19:3, 2:19:13, 2:21:9, ..."
1,الله,Allh,2265,"1:1:2, 2:1:2, 2:7:2, 2:9:2, 2:10:5, 2:15:1, 2:..."
2,في,fy,1185,"2:10:1, 2:11:6, 2:15:5, 2:17:14, 2:19:11, 2:23..."
3,ما,mA,1013,"2:17:8, 2:26:8, 2:27:9, 2:29:5, 2:30:26, 2:32:..."
4,إن,<n,966,"2:6:1, 2:20:20, 2:23:18, 2:26:1, 2:31:13, 2:62..."
...,...,...,...,...
95,ربنا,rbnA,106,"2:127:8, 2:128:1, 2:129:1, 2:139:6, 2:200:15, ..."
96,عليم,Elym,106,"2:29:19, 2:95:8, 2:115:12, 2:158:24, 2:181:14,..."
97,النار,AlnAr,102,"2:24:7, 2:39:7, 2:80:4, 2:81:10, 2:126:28, 2:1..."
98,ربكم,rbkm,102,"2:21:5, 2:49:17, 2:76:21, 2:105:16, 2:178:30, ..."


In [99]:
df_tanzil_word_simple_nav.to_excel(f"Arabic_Buckwalter_{type_file.lower().capitalize()}_Freq_And_Nav_Tanzil.xlsx", sheet_name=f"Arabic_Buckwalter_{type_file.lower().capitalize()}_Freq", index=False)