### Quran Tanzil Analysis

In [37]:
import pandas as pd
import numpy as np
import re
import camel_tools.tokenizers.word
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [38]:
# Source text: "Quran Simple Clean.txt" from the Tanzil website.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
tanzil_simple_clean_path = "/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quran Simple Clean.txt"
with open(tanzil_simple_clean_path, mode="r", encoding="utf8") as tanzil_file:
    # Keep every raw line, trailing newline included; later cells strip and split them.
    arabic_text_simple = list(tanzil_file)

In [39]:
# One row per raw line of the file; the single column (label 0) holds the
# unparsed "num|num|text" string.
# Only the first 6236 rows are kept -- presumably the verse lines, with any
# trailing non-verse lines of the file dropped (TODO confirm against the file).
df_tanzil_sent = pd.DataFrame(arabic_text_simple).iloc[:6236]
df_tanzil_sent

Unnamed: 0,0
0,1|1|بسم الله الرحمن الرحيم\n
1,1|2|الحمد لله رب العالمين\n
2,1|3|الرحمن الرحيم\n
3,1|4|مالك يوم الدين\n
4,1|5|إياك نعبد وإياك نستعين\n
...,...
6231,114|2|ملك الناس\n
6232,114|3|إله الناس\n
6233,114|4|من شر الوسواس الخناس\n
6234,114|5|الذي يوسوس في صدور الناس\n


In [40]:
# Remove the newline characters that surround each raw line; the result is
# still a one-column frame whose column label is 0.
df_tanzil_sent = df_tanzil_sent.iloc[:, 0].str.strip("\n").to_frame()
df_tanzil_sent

Unnamed: 0,0
0,1|1|بسم الله الرحمن الرحيم
1,1|2|الحمد لله رب العالمين
2,1|3|الرحمن الرحيم
3,1|4|مالك يوم الدين
4,1|5|إياك نعبد وإياك نستعين
...,...
6231,114|2|ملك الناس
6232,114|3|إله الناس
6233,114|4|من شر الوسواس الخناس
6234,114|5|الذي يوسوس في صدور الناس


In [41]:
# Split every "num|num|text" line on the pipe character into separate columns
# and give the first three columns descriptive names.
df_tanzil_sent_nav = (
    df_tanzil_sent.iloc[:, 0]
    .str.split("|", n=-1, expand=True)
    .rename(columns={0: "num_1", 1: "num_2", 2: "arabic_sent_tanzil_simple"})
)
df_tanzil_sent_nav

Unnamed: 0,num_1,num_2,arabic_sent_tanzil_simple
0,1,1,بسم الله الرحمن الرحيم
1,1,2,الحمد لله رب العالمين
2,1,3,الرحمن الرحيم
3,1,4,مالك يوم الدين
4,1,5,إياك نعبد وإياك نستعين
...,...,...,...
6231,114,2,ملك الناس
6232,114,3,إله الناس
6233,114,4,من شر الوسواس الخناس
6234,114,5,الذي يوسوس في صدور الناس


In [42]:
# Transliterate each Arabic sentence to Buckwalter (reverse=0 means Arabic -> Buckwalter).
# NOTE(review): transString is defined in a later cell of this notebook, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
df_tanzil_sent_nav["buckwalter_sent_tanzil_simple"] = (
    df_tanzil_sent_nav["arabic_sent_tanzil_simple"]
    .map(lambda sentence: transString(sentence, reverse=0))
)
df_tanzil_sent_nav

Unnamed: 0,num_1,num_2,arabic_sent_tanzil_simple,buckwalter_sent_tanzil_simple
0,1,1,بسم الله الرحمن الرحيم,bsm Allh AlrHmn AlrHym
1,1,2,الحمد لله رب العالمين,AlHmd llh rb AlEAlmyn
2,1,3,الرحمن الرحيم,AlrHmn AlrHym
3,1,4,مالك يوم الدين,mAlk ywm Aldyn
4,1,5,إياك نعبد وإياك نستعين,<yAk nEbd w<yAk nstEyn
...,...,...,...,...
6231,114,2,ملك الناس,mlk AlnAs
6232,114,3,إله الناس,<lh AlnAs
6233,114,4,من شر الوسواس الخناس,mn $r AlwswAs AlxnAs
6234,114,5,الذي يوسوس في صدور الناس,Al*y ywsws fy Sdwr AlnAs


In [43]:
# Export the sentence-level Arabic/Buckwalter table.
# The `encoding` keyword is intentionally not passed: pandas documents it as
# unused by to_excel, deprecated it in 1.5 and removed it in 2.0 (TypeError).
df_tanzil_sent_nav.to_excel(
    "Arabic_And_Buckwalter_Sent_Tanzil_Simple.xlsx",
    sheet_name="Arabic_And_Buckwalter_Simple",
    index=False,
)

In [44]:
# Explode every sentence into its word tokens. Each record is
# [num_1, num_2, num_3, word], where num_1/num_2 are copied from the sentence
# row and num_3 is the 1-based position of the token inside that sentence.
nav_list = [
    [first_num, second_num, position, token_text]
    for first_num, second_num, sentence in zip(
        df_tanzil_sent_nav.iloc[:, 0],
        df_tanzil_sent_nav.iloc[:, 1],
        df_tanzil_sent_nav.iloc[:, 2],
    )
    for position, token_text in enumerate(
        camel_tools.tokenizers.word.simple_word_tokenize(sentence), start=1
    )
]

In [45]:
# Word-level table: one row per token, labelled with its three position numbers.
df_tanzil_word_nav = pd.DataFrame(nav_list).rename(
    columns={0: "num_1", 1: "num_2", 2: "num_3", 3: "arabic_word_tanzil_simple"}
)
df_tanzil_word_nav

Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_simple
0,1,1,1,بسم
1,1,1,2,الله
2,1,1,3,الرحمن
3,1,1,4,الرحيم
4,1,2,1,الحمد
...,...,...,...,...
78243,114,5,4,صدور
78244,114,5,5,الناس
78245,114,6,1,من
78246,114,6,2,الجنة


In [46]:
def clean_ex(text):
    """Delete a fixed set of Buckwalter symbols from ``text``.

    Per the original note, this is the Buckwalter text clean-up for the
    "simple" variant, meant to run before transString. Every character that
    belongs to the class in the pattern below is removed; all other characters
    are returned unchanged and in their original order.
    """
    # Original note: "difference from clean" -- presumably this character class
    # differs from a sibling `clean` helper that is not visible in this notebook.
    removable_symbols = re.compile(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio`~"%-])''')
    return removable_symbols.sub("", text)

In [47]:
# Arabic Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

# Buckwalter symbol -> Arabic Unicode character. Every key and every value is a
# single character, which is what allows the translation tables below.
buck2uni = {"'": "\u0621", # hamza-on-the-line
            "|": "\u0622", # madda
            ">": "\u0623", # hamza-on-'alif
            "&": "\u0624", # hamza-on-waaw
            "<": "\u0625", # hamza-under-'alif
            "}": "\u0626", # hamza-on-yaa'
            "A": "\u0627", # bare 'alif
            "b": "\u0628", # baa'
            "p": "\u0629", # taa' marbuuTa
            "t": "\u062A", # taa'
            "v": "\u062B", # thaa'
            "j": "\u062C", # jiim
            "H": "\u062D", # Haa'
            "x": "\u062E", # khaa'
            "d": "\u062F", # daal
            "*": "\u0630", # dhaal
            "r": "\u0631", # raa'
            "z": "\u0632", # zaay
            "s": "\u0633", # siin
            "$": "\u0634", # shiin
            "S": "\u0635", # Saad
            "D": "\u0636", # Daad
            "T": "\u0637", # Taa'
            "Z": "\u0638", # Zaa' (DHaa')
            "E": "\u0639", # cayn
            "g": "\u063A", # ghayn
            "_": "\u0640", # taTwiil
            "f": "\u0641", # faa'
            "q": "\u0642", # qaaf
            "k": "\u0643", # kaaf
            "l": "\u0644", # laam
            "m": "\u0645", # miim
            "n": "\u0646", # nuun
            "h": "\u0647", # haa'
            "w": "\u0648", # waaw
            "Y": "\u0649", # 'alif maqSuura
            "y": "\u064A", # yaa'
            "F": "\u064B", # fatHatayn
            "N": "\u064C", # Dammatayn
            "K": "\u064D", # kasratayn
            "a": "\u064E", # fatHa
            "u": "\u064F", # Damma
            "i": "\u0650", # kasra
            "~": "\u0651", # shaddah
            "o": "\u0652", # sukuun
            "^": "\u0653", # maddah
            "#": "\u0654", # hamzaabove
            "`": "\u0670", # dagger 'alif
            "{": "\u0671", # waSla
            "P": "\u067E", # arabicletterpeh
            "J": "\u0686", # arabiclettertcheh
            "V": "\u06A4", # arabicletterveh
            "G": "\u06AF", # arabiclettergaf
            ":": "\u06DC", # smallhighseen
            "@": "\u06DF", # smallhighroundedzero
            "\"": "\u06E0", # smallhighuprightrectangularzero
            "[": "\u06E2", # smallhighmeemisolatedform
            ";": "\u06E3", # smalllowseen
            ",": "\u06E5", # smallwaw
            ".": "\u06E6", # smallya
            "!": "\u06E8", # smallhighnoon
            "-": "\u06EA", # emptycentrelowstop
            "+": "\u06EB", # emptycentrehighstop
            "%": "\u06EC", # roundedhighstopwithfilledcentre
            "]": "\u06ED", # smalllowmeem
}

# Translation tables built once from buck2uni, so each call is a single pass
# over the string instead of one str.replace pass per dictionary entry.
# They are snapshots: re-run this cell if buck2uni is edited.
_BUCK_TO_UNI_TABLE = str.maketrans(buck2uni)
_UNI_TO_BUCK_TABLE = str.maketrans({uni: buck for buck, uni in buck2uni.items()})


def transString(string, reverse=0):
    '''Transliterate between Arabic Unicode text and Buckwalter.

    Parameters:
        string: the text to convert.
        reverse: falsy (default 0) converts Arabic Unicode -> Buckwalter;
            truthy (e.g. 1) converts Buckwalter -> Arabic Unicode.

    Returns:
        A new string in which every character found in the buck2uni mapping is
        replaced by its counterpart; characters that are not in the mapping
        (spaces, digits, ...) are passed through unchanged.
    '''
    table = _BUCK_TO_UNI_TABLE if reverse else _UNI_TO_BUCK_TABLE
    return string.translate(table)

In [48]:
# Add the Buckwalter transliteration of every word (reverse=0 means Arabic -> Buckwalter).
df_tanzil_word_nav["buckwalter_word_tanzil_simple"] = (
    df_tanzil_word_nav["arabic_word_tanzil_simple"]
    .map(lambda word: transString(word, reverse=0))
)
df_tanzil_word_nav

Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_simple,buckwalter_word_tanzil_simple
0,1,1,1,بسم,bsm
1,1,1,2,الله,Allh
2,1,1,3,الرحمن,AlrHmn
3,1,1,4,الرحيم,AlrHym
4,1,2,1,الحمد,AlHmd
...,...,...,...,...,...
78243,114,5,4,صدور,Sdwr
78244,114,5,5,الناس,AlnAs
78245,114,6,1,من,mn
78246,114,6,2,الجنة,Aljnp


In [49]:
# Export the word-level Arabic/Buckwalter table.
# The `encoding` keyword is intentionally not passed: pandas documents it as
# unused by to_excel, deprecated it in 1.5 and removed it in 2.0 (TypeError).
df_tanzil_word_nav.to_excel(
    "Arabic_And_Buckwalter_Word_Tanzil_Simple.xlsx",
    sheet_name="Arabic_And_Buckwalter_Simple",
    index=False,
)

#### Frequency

In [50]:
# Count how often each distinct word occurs, most frequent first.
# The column names are set explicitly with rename_axis / reset_index(name=...)
# rather than by renaming "index": the labels value_counts() produces differ
# between pandas 1.x and 2.x, and the old rename mislabels the columns on 2.x.
df_tanzil_word_freq = (
    df_tanzil_word_nav["arabic_word_tanzil_simple"]
    .value_counts(ascending=False)
    .rename_axis("arabic_word_tanzil_simple")
    .reset_index(name="frequency")
)
df_tanzil_word_freq

Unnamed: 0,arabic_word_tanzil_simple,frequency
0,من,2763
1,الله,2265
2,في,1185
3,ما,1013
4,إن,966
...,...,...
14863,مرفقا,1
14864,ويهيئ,1
14865,ينشر,1
14866,فأووا,1


In [51]:
# Add the Buckwalter form of each distinct word and put it between the Arabic
# word and its frequency. Columns are selected by name (not by position) so
# that re-running this cell keeps the same column order instead of swapping
# the last two columns on every run.
df_tanzil_word_freq = df_tanzil_word_freq.assign(
    buckwalter_word_tanzil_simple=df_tanzil_word_freq["arabic_word_tanzil_simple"].map(
        lambda word: transString(word, 0)
    )
)[["arabic_word_tanzil_simple", "buckwalter_word_tanzil_simple", "frequency"]]
df_tanzil_word_freq

Unnamed: 0,arabic_word_tanzil_simple,buckwalter_word_tanzil_simple,frequency
0,من,mn,2763
1,الله,Allh,2265
2,في,fy,1185
3,ما,mA,1013
4,إن,<n,966
...,...,...,...
14863,مرفقا,mrfqA,1
14864,ويهيئ,wyhy},1
14865,ينشر,yn$r,1
14866,فأووا,f>wwA,1


In [52]:
# Keep the first 1000 rows of the frequency table (it was built most-frequent first).
df_tanzil_word_select = df_tanzil_word_freq.iloc[:1000]
df_tanzil_word_select

Unnamed: 0,arabic_word_tanzil_simple,buckwalter_word_tanzil_simple,frequency
0,من,mn,2763
1,الله,Allh,2265
2,في,fy,1185
3,ما,mA,1013
4,إن,<n,966
...,...,...,...
995,متى,mtY,9
996,قريبا,qrybA,9
997,مسلمين,mslmyn,9
998,بربهم,brbhm,9


In [53]:
# Attach every occurrence (num_1, num_2, num_3) to each selected word.
# Both inputs carry a "buckwalter_word_tanzil_simple" column, so the result has
# "_x" (from the selection) and "_y" (from the word table) variants of it.
df_tanzil_word_select_merge_nav = df_tanzil_word_select.merge(
    df_tanzil_word_nav,
    how="inner",
    on="arabic_word_tanzil_simple",
)
df_tanzil_word_select_merge_nav

Unnamed: 0,arabic_word_tanzil_simple,buckwalter_word_tanzil_simple_x,frequency,num_1,num_2,num_3,buckwalter_word_tanzil_simple_y
0,من,mn,2763,2,4,8,mn
1,من,mn,2763,2,5,4,mn
2,من,mn,2763,2,8,3,mn
3,من,mn,2763,2,19,3,mn
4,من,mn,2763,2,19,13,mn
...,...,...,...,...,...,...,...
52108,لأبيه,l>byh,9,21,52,3,l>byh
52109,لأبيه,l>byh,9,26,70,3,l>byh
52110,لأبيه,l>byh,9,37,85,3,l>byh
52111,لأبيه,l>byh,9,43,26,4,l>byh


In [54]:
# Keep the position columns, the word, one Buckwalter column and the frequency,
# and drop the "_x" suffix that the merge added.
# The rename is chained (not inplace) so it returns a fresh frame instead of
# mutating a column slice, which is what raised SettingWithCopyWarning.
df_tanzil_word_select_merge_nav = (
    df_tanzil_word_select_merge_nav[
        [
            "num_1",
            "num_2",
            "num_3",
            "arabic_word_tanzil_simple",
            "buckwalter_word_tanzil_simple_x",
            "frequency",
        ]
    ]
    .rename(columns={"buckwalter_word_tanzil_simple_x": "buckwalter_word_tanzil_simple"})
)
df_tanzil_word_select_merge_nav

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_simple,buckwalter_word_tanzil_simple,frequency
0,2,4,8,من,mn,2763
1,2,5,4,من,mn,2763
2,2,8,3,من,mn,2763
3,2,19,3,من,mn,2763
4,2,19,13,من,mn,2763
...,...,...,...,...,...,...
52108,21,52,3,لأبيه,l>byh,9
52109,26,70,3,لأبيه,l>byh,9
52110,37,85,3,لأبيه,l>byh,9
52111,43,26,4,لأبيه,l>byh,9


In [55]:
# Build a "num_1:num_2:num_3" locator string for every word occurrence.
df_tanzil_word_select_merge_nav["nav"] = (
    df_tanzil_word_select_merge_nav["num_1"]
    .astype("str")
    .str.cat(
        [
            df_tanzil_word_select_merge_nav["num_2"].astype("str"),
            df_tanzil_word_select_merge_nav["num_3"].astype("str"),
        ],
        sep=":",
    )
)
df_tanzil_word_select_merge_nav

Unnamed: 0,num_1,num_2,num_3,arabic_word_tanzil_simple,buckwalter_word_tanzil_simple,frequency,nav
0,2,4,8,من,mn,2763,2:4:8
1,2,5,4,من,mn,2763,2:5:4
2,2,8,3,من,mn,2763,2:8:3
3,2,19,3,من,mn,2763,2:19:3
4,2,19,13,من,mn,2763,2:19:13
...,...,...,...,...,...,...,...
52108,21,52,3,لأبيه,l>byh,9,21:52:3
52109,26,70,3,لأبيه,l>byh,9,26:70:3
52110,37,85,3,لأبيه,l>byh,9,37:85:3
52111,43,26,4,لأبيه,l>byh,9,43:26:4


In [58]:
# Export the per-occurrence navigation table (one row per word occurrence).
df_tanzil_word_select_merge_nav.to_excel(
    "Arabic_Simple_Navigation_Tanzil.xlsx",
    index=False,
    sheet_name="Arabic_Simple_Navigation",
)

In [56]:
# Collapse the occurrences of each word into a single row whose "nav" value is
# the comma-separated list of its locators, ordered by descending frequency and
# re-indexed from 0.
df_tanzil_word_simple_nav = (
    df_tanzil_word_select_merge_nav
    .groupby(["arabic_word_tanzil_simple", "buckwalter_word_tanzil_simple", "frequency"])["nav"]
    .apply(", ".join)
    .reset_index()
    .sort_values(by="frequency", ascending=False)
    .reset_index(drop=True)
)
df_tanzil_word_simple_nav

Unnamed: 0,arabic_word_tanzil_simple,buckwalter_word_tanzil_simple,frequency,nav
0,من,mn,2763,"2:4:8, 2:5:4, 2:8:3, 2:19:3, 2:19:13, 2:21:9, ..."
1,الله,Allh,2265,"1:1:2, 2:1:2, 2:7:2, 2:9:2, 2:10:5, 2:15:1, 2:..."
2,في,fy,1185,"2:10:1, 2:11:6, 2:15:5, 2:17:14, 2:19:11, 2:23..."
3,ما,mA,1013,"2:17:8, 2:26:8, 2:27:9, 2:29:5, 2:30:26, 2:32:..."
4,إن,<n,966,"2:6:1, 2:20:20, 2:23:18, 2:26:1, 2:31:13, 2:62..."
...,...,...,...,...
995,لأبيه,l>byh,9,"6:74:4, 9:114:5, 12:4:4, 19:42:3, 21:52:3, 26:..."
996,تخرج,txrj,9,"5:110:45, 18:5:10, 20:22:5, 23:20:2, 27:12:5, ..."
997,متى,mtY,9,"2:214:23, 10:48:2, 17:51:19, 21:38:2, 27:71:2,..."
998,بالباطل,bAlbATl,9,"2:42:4, 2:188:5, 3:71:7, 4:29:9, 4:161:9, 9:34..."


In [57]:
# Export the per-word table (frequency plus the joined list of locators).
df_tanzil_word_simple_nav.to_excel(
    "Arabic_Simple_Frequency_Navigation_Tanzil.xlsx",
    index=False,
    sheet_name="Arabic_Simple_Frequency",
)