### Quran Form Analysis

In [4]:
#!pip install lang-trans
#!pip install camel-tools

In [5]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter
import nltk
from nltk import word_tokenize
from nltk import ngrams

In [6]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quran/Data/Quranic Corpus Morphology.txt", "r", encoding="utf8") as file:
    text = file.read()

#### Form Tag

In [288]:
form_tag_list = re.findall("[\(\d*:\d*:\d*:\d*\)][\t](\D*)[\t][\D*]", text)

In [289]:
form_tag_num = re.findall("(\d*:\d*:\d*:\d*)", text)

In [290]:
df_stem = pd.DataFrame(form_tag_num)
df_stem.rename(columns={0:"loc_num"}, inplace=True)
df_stem

Unnamed: 0,loc_num
0,1:1:1:1
1,1:1:1:2
2,1:1:2:1
3,1:1:3:1
4,1:1:3:2
...,...
128214,114:6:2:1
128215,114:6:2:2
128216,114:6:3:1
128217,114:6:3:2


In [291]:
df_stem = df_stem["loc_num"].str.split(":", n=-1, expand=True)
df_stem.rename(columns={0:"num_1", 1:"num_2", 2:"num_3", 3:"num_4"}, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4
0,1,1,1,1
1,1,1,1,2
2,1,1,2,1
3,1,1,3,1
4,1,1,3,2
...,...,...,...,...
128214,114,6,2,1
128215,114,6,2,2
128216,114,6,3,1
128217,114,6,3,2


In [292]:
df_stem['num_1'] = df_stem['num_1'].astype(int)
df_stem['num_2'] = df_stem['num_2'].astype(int)
df_stem['num_3'] = df_stem['num_3'].astype(int)
df_stem['num_4'] = df_stem['num_4'].astype(int)

In [293]:
df_stem["form_tag"] = pd.DataFrame(form_tag_list)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form_tag
0,1,1,1,1,bi\tP
1,1,1,1,2,somi\tN
2,1,1,2,1,{ll~ahi\tPN
3,1,1,3,1,{l\tDET
4,1,1,3,2,r~aHoma`ni\tADJ
...,...,...,...,...,...
128214,114,6,2,1,{lo\tDET
128215,114,6,2,2,jin~api\tN
128216,114,6,3,1,wa\tCONJ
128217,114,6,3,2,{l\tDET


In [294]:
df_stem[["form","tag"]] = df_stem["form_tag"].str.split("\t", n=-1, expand=True)
df_stem.drop(["form_tag"], axis=1, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [295]:
df_stem.drop_duplicates(inplace=True)
df_stem.reset_index(drop=True, inplace=True)
df_stem

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag
0,1,1,1,1,bi,P
1,1,1,1,2,somi,N
2,1,1,2,1,{ll~ahi,PN
3,1,1,3,1,{l,DET
4,1,1,3,2,r~aHoma`ni,ADJ
...,...,...,...,...,...,...
128214,114,6,2,1,{lo,DET
128215,114,6,2,2,jin~api,N
128216,114,6,3,1,wa,CONJ
128217,114,6,3,2,{l,DET


In [296]:
#df_stem.to_excel("Buckwalter_Form_Tag.xlsx", sheet_name="Form_Tag", index=False)

In [297]:
df_word = pd.DataFrame(df_stem.groupby(["num_1","num_2","num_3"])["form"].sum())
df_word.reset_index(inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [298]:
#df_word.to_excel("Buckwalter_Form_Concat.xlsx", sheet_name="Concat_Form", index=False)

#### Buckwalter Text

In [299]:
#df_word_concat = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [300]:
df_word

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [301]:
concat_list = df_word.iloc[:,3].to_list()
#concat_list = df_word_concat.iloc[:,3].head(500).to_list()

In [302]:
buckwalter_text = " ".join(concat_list)
#buckwalter_text

#### Arabic Text

##### Camel Tools

In [7]:
bw2ar = CharMapper.builtin_mapper('bw2ar')

In [304]:
arabic_text = bw2ar(buckwalter_text)
#arabic_text

In [305]:
#with open("Arabic_With_Extended_Characters.txt", "w", encoding="utf8") as file:
#    file.write(arabic_text)

In [306]:
arabic_text2 = re.sub(r"\[", "", arabic_text)

In [307]:
arabic_text3 = re.sub(r"\]", "", arabic_text2)
#arabic_text3

In [308]:
arabic_text4 = re.sub(r'''([@#:;,.!-+%"])''', "", arabic_text3)
#arabic_text4

In [309]:
arabic_text5 = re.sub(r"\^", " ", arabic_text4)
#arabic_text5

In [310]:
#with open("Arabic_Without_Extended_Characters.txt", "w", encoding="utf8") as file:
#    file.write(arabic_text5)

##### Def Func

In [10]:
# 2-Way Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            "P": u"\u067E", # arabicletterpeh
            "J": u"\u0686", 
            "V": u"\u06A4", # arabicletterveh
            "G": u"\u06AF", # arabiclettergaf
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem           
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [312]:
# Simple Technique
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2unisimple = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "{": u"\u0671", # waSla         
}

def transStringSimple(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2unisimple.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [313]:
def clean(text):
    arabic_text2 = re.sub(r"\[", "", text)
    arabic_text3 = re.sub(r"\]", "", arabic_text2)
    arabic_text4 = re.sub(r'''([@#:;,.!-+%"FNKaui~o`}PJVG])''', "", arabic_text3)
    arabic_text5 = re.sub(r"\^", " ", arabic_text4)
    return arabic_text5

In [314]:
## Simple Technique2
## -*- coding: utf-8 -*-
#
## Arabic Transliteration based on Buckwalter
## dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 
#
#buck2unisimple = {"'": u"\u0621", # hamza-on-the-line
#            "|": u"\u0627", # madda \u0622
#            ">": u"\u0627", # hamza-on-'alif \u0623
#            "&": u"\u0648", # hamza-on-waaw u"\u0624
#            "<": u"\u0627", # hamza-under-'alif \u0625
#            "A": u"\u0627", # bare 'alif
#            "b": u"\u0628", # baa'
#            "p": u"\u0629", # taa' marbuuTa
#            "t": u"\u062A", # taa'
#            "v": u"\u062B", # thaa'
#            "j": u"\u062C", # jiim
#            "H": u"\u062D", # Haa'
#            "x": u"\u062E", # khaa'
#            "d": u"\u062F", # daal
#            "*": u"\u0630", # dhaal
#            "r": u"\u0631", # raa'
#            "z": u"\u0632", # zaay
#            "s": u"\u0633", # siin
#            "$": u"\u0634", # shiin
#            "S": u"\u0635", # Saad
#            "D": u"\u0636", # Daad
#            "T": u"\u0637", # Taa'
#            "Z": u"\u0638", # Zaa' (DHaa')
#            "E": u"\u0639", # cayn
#            "g": u"\u063A", # ghayn
#            "_": u"\u0640", # taTwiil
#            "f": u"\u0641", # faa'
#            "q": u"\u0642", # qaaf
#            "k": u"\u0643", # kaaf
#            "l": u"\u0644", # laam
#            "m": u"\u0645", # miim
#            "n": u"\u0646", # nuun
#            "h": u"\u0647", # haa'
#            "w": u"\u0648", # waaw
#            "Y": u"\u0649", # 'alif maqSuura
#            "y": u"\u064A", # yaa'
#            "{": u"\u0627", # waSla \u0671         
#}
#
#def transStringSimple2(string, reverse=0):
#    '''Given a Unicode string, transliterate into Buckwalter. To go from
#    Buckwalter back to Unicode, set reverse=1'''
#
#    for k, v in buck2unisimple.items():
#      if not reverse:
#            string = string.replace(v, k)
#      else:
#            string = string.replace(k, v)
#
#    return string

In [315]:
#def clean2(text):
#    arabic_text2 = re.sub(r"\[", "", text)
#    arabic_text3 = re.sub(r"\]", "", arabic_text2)
#    arabic_text4 = re.sub(r'''([@#:;,.!-+%"FNKaui~o`}PJVG])''', "", arabic_text3)
#    arabic_text5 = re.sub(r"\^", " ", arabic_text4)
#    return arabic_text5

In [316]:
arabic_func_text = transString(buckwalter_text, 1)
#arabic_func_text

In [317]:
#with open("Arabic_Converted_Extended_Characters.txt", "w", encoding="utf8") as file:
#    file.write(arabic_func_text)

##### DF Word Concat Form Convert Arabic

In [318]:
#df_word_concat = pd.read_excel("Buckwalter_Form_Concat.xlsx")

In [319]:
df_word

Unnamed: 0,num_1,num_2,num_3,form
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [320]:
df_word["arabic"] = df_word.iloc[:,3].apply(lambda x: transString(x, 1))
df_word

Unnamed: 0,num_1,num_2,num_3,form,arabic
0,1,1,1,bisomi,بِسْمِ
1,1,1,2,{ll~ahi,ٱللَّهِ
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ
4,1,2,1,{loHamodu,ٱلْحَمْدُ
...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ
77426,114,6,1,mina,مِنَ
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ


In [321]:
#df_word["arabic"] = df_word.iloc[:,3].apply(lambda x: bw2ar(x))
#df_word

In [322]:
df_word["arabic_simple"] = df_word.iloc[:,3].apply(lambda x : transStringSimple(x, 1))
df_word["arabic_simple"] = df_word.iloc[:,5].apply(lambda x : clean(x))
df_word

Unnamed: 0,num_1,num_2,num_3,form,arabic,arabic_simple
0,1,1,1,bisomi,بِسْمِ,بسم
1,1,1,2,{ll~ahi,ٱللَّهِ,ٱلله
2,1,1,3,{lr~aHoma`ni,ٱلرَّحْمَٰنِ,ٱلرحمن
3,1,1,4,{lr~aHiymi,ٱلرَّحِيمِ,ٱلرحيم
4,1,2,1,{loHamodu,ٱلْحَمْدُ,ٱلحمد
...,...,...,...,...,...,...
77424,114,5,4,Suduwri,صُدُورِ,صدور
77425,114,5,5,{ln~aAsi,ٱلنَّاسِ,ٱلناس
77426,114,6,1,mina,مِنَ,من
77427,114,6,2,{lojin~api,ٱلْجِنَّةِ,ٱلجنة


In [323]:
df_word = df_word[["num_1","num_2","num_3","arabic_simple","arabic","form"]]
df_word.rename(columns={"form":"buckwalter"}, inplace=True)
df_word

Unnamed: 0,num_1,num_2,num_3,arabic_simple,arabic,buckwalter
0,1,1,1,بسم,بِسْمِ,bisomi
1,1,1,2,ٱلله,ٱللَّهِ,{ll~ahi
2,1,1,3,ٱلرحمن,ٱلرَّحْمَٰنِ,{lr~aHoma`ni
3,1,1,4,ٱلرحيم,ٱلرَّحِيمِ,{lr~aHiymi
4,1,2,1,ٱلحمد,ٱلْحَمْدُ,{loHamodu
...,...,...,...,...,...,...
77424,114,5,4,صدور,صُدُورِ,Suduwri
77425,114,5,5,ٱلناس,ٱلنَّاسِ,{ln~aAsi
77426,114,6,1,من,مِنَ,mina
77427,114,6,2,ٱلجنة,ٱلْجِنَّةِ,{lojin~api


In [333]:
df_word.iloc[27,:]

num_1                1
num_2                7
num_3                8
arabic_simple      ولا
arabic           وَلَا
buckwalter       walaA
Name: 27, dtype: object

In [334]:
#df_word.to_excel("Arabic_Simple_Word.xlsx", sheet_name="Form_Arabic_Simple", index=False, encoding="utf-8")

In [335]:
#df_word.to_csv("Arabic_Simple_Word.csv", index=False, encoding="utf-8")

In [233]:
df_word_count = pd.DataFrame(df_word.iloc[:,4].value_counts(ascending=False))
df_word_count.reset_index(inplace=True)
df_word_count.rename(columns={"index":"arabic","arabic":"frequency"}, inplace=True)
df_word_count

Unnamed: 0,arabic,frequency
0,فِى,1098
1,ٱللَّهِ,828
2,ٱلَّذِينَ,810
3,ٱللَّهُ,733
4,مِن,728
...,...,...
18988,يُنكِرُ,1
18989,أُشْرِكَ,1
18990,مَـَٔابِ,1
18991,وَذُرِّيَّةً,1


In [234]:
#df_word_count.to_excel("Arabic_Word_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Frequency", index=False)

In [235]:
df_simple_word_count = pd.DataFrame(df_word.iloc[:,3].value_counts(ascending=False))
df_simple_word_count.reset_index(inplace=True)
df_simple_word_count.rename(columns={"index":"arabic_simple","arabic_simple":"frequency"}, inplace=True)
df_simple_word_count

Unnamed: 0,arabic_simple,frequency
0,من,2763
1,ٱلله,2153
2,فى,1099
3,إن,966
4,ما,873
...,...,...
15280,ٱلفتية,1
15281,وهي,1
15282,فضربنا,1
15283,حولك,1


In [236]:
#df_simple_word_count.to_excel("Arabic_Simple_Word_Frequency_From_Morphology.xlsx", sheet_name="Arabic_Simple_Frequency", index=False)

##### Camal Tools Word Tokenize 

In [237]:
import camel_tools.tokenizers.word

In [238]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quran/Data/Quran Simple Clean.txt", "r", encoding="utf8") as file:
    arabic_text_simple =file.read()

In [239]:
#arabic_text_simple

In [240]:
#re.findall("(\d*\|\d*\|\D*)", arabic_text_simple)
arabic_simple_list = re.findall("[\d*\d*][\|](\D*)[\n]", arabic_text_simple)
arabic_simple_list

['بسم الله الرحمن الرحيم',
 'الحمد لله رب العالمين',
 'الرحمن الرحيم',
 'مالك يوم الدين',
 'إياك نعبد وإياك نستعين',
 'اهدنا الصراط المستقيم',
 'صراط الذين أنعمت عليهم غير المغضوب عليهم ولا الضالين',
 'بسم الله الرحمن الرحيم الم',
 'ذلك الكتاب لا ريب فيه هدى للمتقين',
 'الذين يؤمنون بالغيب ويقيمون الصلاة ومما رزقناهم ينفقون',
 'والذين يؤمنون بما أنزل إليك وما أنزل من قبلك وبالآخرة هم يوقنون',
 'أولئك على هدى من ربهم وأولئك هم المفلحون',
 'إن الذين كفروا سواء عليهم أأنذرتهم أم لم تنذرهم لا يؤمنون',
 'ختم الله على قلوبهم وعلى سمعهم وعلى أبصارهم غشاوة ولهم عذاب عظيم',
 'ومن الناس من يقول آمنا بالله وباليوم الآخر وما هم بمؤمنين',
 'يخادعون الله والذين آمنوا وما يخدعون إلا أنفسهم وما يشعرون',
 'في قلوبهم مرض فزادهم الله مرضا ولهم عذاب أليم بما كانوا يكذبون',
 'وإذا قيل لهم لا تفسدوا في الأرض قالوا إنما نحن مصلحون',
 'ألا إنهم هم المفسدون ولكن لا يشعرون',
 'وإذا قيل لهم آمنوا كما آمن الناس قالوا أنؤمن كما آمن السفهاء ألا إنهم هم السفهاء ولكن لا يعلمون',
 'وإذا لقوا الذين آمنوا قالوا آمنا وإذ

In [241]:
ar2bw = CharMapper.builtin_mapper('ar2bw')

In [242]:
buckwalter_simple_list = []
for i in arabic_simple_list:
    buckwalter_word = ar2bw(i)
    buckwalter_simple_list.append(buckwalter_word)

In [243]:
buckwalter_simple_list

['bsm Allh AlrHmn AlrHym',
 'AlHmd llh rb AlEAlmyn',
 'AlrHmn AlrHym',
 'mAlk ywm Aldyn',
 '<yAk nEbd w<yAk nstEyn',
 'AhdnA AlSrAT Almstqym',
 'SrAT Al*yn >nEmt Elyhm gyr AlmgDwb Elyhm wlA AlDAlyn',
 'bsm Allh AlrHmn AlrHym Alm',
 '*lk AlktAb lA ryb fyh hdY llmtqyn',
 'Al*yn y&mnwn bAlgyb wyqymwn AlSlAp wmmA rzqnAhm ynfqwn',
 'wAl*yn y&mnwn bmA >nzl <lyk wmA >nzl mn qblk wbAl|xrp hm ywqnwn',
 '>wl}k ElY hdY mn rbhm w>wl}k hm AlmflHwn',
 "<n Al*yn kfrwA swA' Elyhm >>n*rthm >m lm tn*rhm lA y&mnwn",
 'xtm Allh ElY qlwbhm wElY smEhm wElY >bSArhm g$Awp wlhm E*Ab EZym',
 'wmn AlnAs mn yqwl |mnA bAllh wbAlywm Al|xr wmA hm bm&mnyn',
 'yxAdEwn Allh wAl*yn |mnwA wmA yxdEwn <lA >nfshm wmA y$Erwn',
 'fy qlwbhm mrD fzAdhm Allh mrDA wlhm E*Ab >lym bmA kAnwA yk*bwn',
 'w<*A qyl lhm lA tfsdwA fy Al>rD qAlwA <nmA nHn mSlHwn',
 '>lA <nhm hm Almfsdwn wlkn lA y$Erwn',
 "w<*A qyl lhm |mnwA kmA |mn AlnAs qAlwA >n&mn kmA |mn AlsfhA' >lA <nhm hm AlsfhA' wlkn lA yElmwn",
 'w<*A lqwA Al*yn |mnwA qAlwA |mnA w<*

In [244]:
buckwalter_simple_text = " ".join(buckwalter_simple_list)

In [245]:
#with open("Buckwalter_Simple_Text.txt", "w", encoding="utf8") as file:
#    file.write(buckwalter_simple_text)

In [246]:
bw2ar = CharMapper.builtin_mapper('bw2ar')

In [247]:
arabic_simple_text = bw2ar(buckwalter_simple_text)
#arabic_simple_text

In [None]:
#arabic_simple_text2=transString(buckwalter_simple_text, 1)

In [None]:
#with open("Arabic_Simple_Text.txt", "w", encoding="utf8") as file:
#    file.write(arabic_simple_text)

In [248]:
simple_words = camel_tools.tokenizers.word.simple_word_tokenize(arabic_simple_text)

In [249]:
df_simple = pd.DataFrame(simple_words, columns=["arabic_simple"])
df_simple

Unnamed: 0,arabic_simple
0,بسم
1,الله
2,الرحمن
3,الرحيم
4,الحمد
...,...
78323,=
78324,=
78325,=
78326,=


In [250]:
df_simple_count = pd.DataFrame(df_simple.iloc[:,0].value_counts(ascending=False))
df_simple_count.reset_index(inplace=True)
df_simple_count.rename(columns={"index":"arabic_simple","arabic_simple":"frequency"}, inplace=True)
df_simple_count

Unnamed: 0,arabic_simple,frequency
0,من,2763
1,الله,2265
2,في,1185
3,ما,1013
4,إن,966
...,...,...
14874,غربت,1
14875,وشاورهم,1
14876,تزاور,1
14877,طلعت,1


In [None]:
#df_simple_count.to_excel("Arabic_Word_Frequency_From_Simple.xlsx", sheet_name="Arabic_Simple_Frequency", index=False)

In [251]:
extended_words = camel_tools.tokenizers.word.simple_word_tokenize(arabic_func_text)

In [252]:
df_extended = pd.DataFrame(extended_words, columns=["arabic_extended"])
df_extended

Unnamed: 0,arabic_extended
0,بِسْمِ
1,ٱللَّهِ
2,ٱلرَّحْمَٰنِ
3,ٱلرَّحِيمِ
4,ٱلْحَمْدُ
...,...
77425,صُدُورِ
77426,ٱلنَّاسِ
77427,مِنَ
77428,ٱلْجِنَّةِ


In [253]:
df_extended_count = pd.DataFrame(df_extended.iloc[:,0].value_counts(ascending=False))
df_extended_count.reset_index(inplace=True)
df_extended_count.rename(columns={"index":"arabic_extended","arabic_extended":"frequency"}, inplace=True)
df_extended_count

Unnamed: 0,arabic_extended,frequency
0,فِى,1098
1,ٱللَّهِ,828
2,ٱلَّذِينَ,810
3,ٱللَّهُ,733
4,مِن,728
...,...,...
18989,أَسْمَآءً,1
18990,أَمَّآ,1
18991,أَحَدُكُمَا,1
18992,فَيَسْقِى,1


In [254]:
#df_extended_count.to_excel("Arabic_Word_Frequency_From_Simple.xlsx", sheet_name="Arabic_Extended_Frequency", index=False)

In [255]:
set_extend = set(df_extended_count.iloc[:,0])

In [256]:
set_word = set(df_word_count.iloc[:,0])

In [257]:
set_extend.difference(set_word)

{'إِلْ', 'يَاسِينَ'}

In [258]:
set_word.difference(set_extend)

{'إِلْ يَاسِينَ'}

##### NLTK Word Tokenize

In [259]:
nltk_extended_words = word_tokenize(arabic_func_text)

In [260]:
df_nltk_extended = pd.DataFrame(nltk_extended_words, columns=["arabic_extended"])
df_nltk_extended

Unnamed: 0,arabic_extended
0,بِسْمِ
1,ٱللَّهِ
2,ٱلرَّحْمَٰنِ
3,ٱلرَّحِيمِ
4,ٱلْحَمْدُ
...,...
77425,صُدُورِ
77426,ٱلنَّاسِ
77427,مِنَ
77428,ٱلْجِنَّةِ


In [261]:
df_nltk_extended_count = pd.DataFrame(df_nltk_extended.iloc[:,0].value_counts(ascending=False))
df_nltk_extended_count.reset_index(inplace=True)
df_nltk_extended_count.rename(columns={"index":"arabic_extended","arabic_extended":"frequency"}, inplace=True)
df_nltk_extended_count

Unnamed: 0,arabic_extended,frequency
0,فِى,1098
1,ٱللَّهِ,828
2,ٱلَّذِينَ,810
3,ٱللَّهُ,733
4,مِن,728
...,...,...
18989,أَسْمَآءً,1
18990,أَمَّآ,1
18991,أَحَدُكُمَا,1
18992,فَيَسْقِى,1


##### NLTK N Gram From Text

In [262]:
df_word

Unnamed: 0,num_1,num_2,num_3,arabic_simple,arabic,buckwalter
0,1,1,1,بسم,بِسْمِ,bisomi
1,1,1,2,ٱلله,ٱللَّهِ,{ll~ahi
2,1,1,3,ٱلرحمن,ٱلرَّحْمَٰنِ,{lr~aHoma`ni
3,1,1,4,ٱلرحيم,ٱلرَّحِيمِ,{lr~aHiymi
4,1,2,1,ٱلحمد,ٱلْحَمْدُ,{loHamodu
...,...,...,...,...,...,...
77424,114,5,4,صدور,صُدُورِ,Suduwri
77425,114,5,5,ٱلناس,ٱلنَّاسِ,{ln~aAsi
77426,114,6,1,من,مِنَ,mina
77427,114,6,2,ٱلجنة,ٱلْجِنَّةِ,{lojin~api


In [263]:
arabic_word_list =  df_word.iloc[:,4].to_list() # arabic extended

In [264]:
arabic_word_concat = " ".join(arabic_word_list)
arabic_word_concat

'بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ مَٰلِكِ يَوْمِ ٱلدِّينِ إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ ٱلْمَغْضُوبِ عَلَيْهِمْ وَلَا ٱلضَّآلِّينَ الٓمٓ ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ فِيهِ هُدًى لِّلْمُتَّقِينَ ٱلَّذِينَ يُؤْمِنُونَ بِٱلْغَيْبِ وَيُقِيمُونَ ٱلصَّلَوٰةَ وَمِمَّا رَزَقْنَٰهُمْ يُنفِقُونَ وَٱلَّذِينَ يُؤْمِنُونَ بِمَآ أُنزِلَ إِلَيْكَ وَمَآ أُنزِلَ مِن قَبْلِكَ وَبِٱلْءَاخِرَةِ هُمْ يُوقِنُونَ أُو۟لَٰٓئِكَ عَلَىٰ هُدًى مِّن رَّبِّهِمْ وَأُو۟لَٰٓئِكَ هُمُ ٱلْمُفْلِحُونَ إِنَّ ٱلَّذِينَ كَفَرُوا۟ سَوَآءٌ عَلَيْهِمْ ءَأَنذَرْتَهُمْ أَمْ لَمْ تُنذِرْهُمْ لَا يُؤْمِنُونَ خَتَمَ ٱللَّهُ عَلَىٰ قُلُوبِهِمْ وَعَلَىٰ سَمْعِهِمْ وَعَلَىٰٓ أَبْصَٰرِهِمْ غِشَٰوَةٌ وَلَهُمْ عَذَابٌ عَظِيمٌ وَمِنَ ٱلنَّاسِ مَن يَقُولُ ءَامَنَّا بِٱللَّهِ وَبِٱلْيَوْمِ ٱلْءَاخِرِ وَمَا هُم بِمُؤْمِنِينَ يُخَٰدِعُونَ ٱللَّهَ وَٱلَّذِينَ ءَامَنُوا۟ وَمَا يَخْ

In [265]:
n = 2
n_gram = ngrams(arabic_word_concat.split(), n)
n_gram_list = []
for i in n_gram:
    n_gram_group = " ".join(i)
    n_gram_list.append(n_gram_group)

In [266]:
n_gram_list

['بِسْمِ ٱللَّهِ',
 'ٱللَّهِ ٱلرَّحْمَٰنِ',
 'ٱلرَّحْمَٰنِ ٱلرَّحِيمِ',
 'ٱلرَّحِيمِ ٱلْحَمْدُ',
 'ٱلْحَمْدُ لِلَّهِ',
 'لِلَّهِ رَبِّ',
 'رَبِّ ٱلْعَٰلَمِينَ',
 'ٱلْعَٰلَمِينَ ٱلرَّحْمَٰنِ',
 'ٱلرَّحْمَٰنِ ٱلرَّحِيمِ',
 'ٱلرَّحِيمِ مَٰلِكِ',
 'مَٰلِكِ يَوْمِ',
 'يَوْمِ ٱلدِّينِ',
 'ٱلدِّينِ إِيَّاكَ',
 'إِيَّاكَ نَعْبُدُ',
 'نَعْبُدُ وَإِيَّاكَ',
 'وَإِيَّاكَ نَسْتَعِينُ',
 'نَسْتَعِينُ ٱهْدِنَا',
 'ٱهْدِنَا ٱلصِّرَٰطَ',
 'ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ',
 'ٱلْمُسْتَقِيمَ صِرَٰطَ',
 'صِرَٰطَ ٱلَّذِينَ',
 'ٱلَّذِينَ أَنْعَمْتَ',
 'أَنْعَمْتَ عَلَيْهِمْ',
 'عَلَيْهِمْ غَيْرِ',
 'غَيْرِ ٱلْمَغْضُوبِ',
 'ٱلْمَغْضُوبِ عَلَيْهِمْ',
 'عَلَيْهِمْ وَلَا',
 'وَلَا ٱلضَّآلِّينَ',
 'ٱلضَّآلِّينَ الٓمٓ',
 'الٓمٓ ذَٰلِكَ',
 'ذَٰلِكَ ٱلْكِتَٰبُ',
 'ٱلْكِتَٰبُ لَا',
 'لَا رَيْبَ',
 'رَيْبَ فِيهِ',
 'فِيهِ هُدًى',
 'هُدًى لِّلْمُتَّقِينَ',
 'لِّلْمُتَّقِينَ ٱلَّذِينَ',
 'ٱلَّذِينَ يُؤْمِنُونَ',
 'يُؤْمِنُونَ بِٱلْغَيْبِ',
 'بِٱلْغَيْبِ وَيُقِيمُونَ',
 'وَيُقِيمُونَ ٱلصَّلَوٰةَ',
 'ٱلصَّلَوٰةَ وَمِمَّا',


In [273]:
df_extend_ngram = pd.DataFrame(n_gram_list, columns=["twogram"])
df_extend_ngram

Unnamed: 0,twogram
0,بِسْمِ ٱللَّهِ
1,ٱللَّهِ ٱلرَّحْمَٰنِ
2,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
3,ٱلرَّحِيمِ ٱلْحَمْدُ
4,ٱلْحَمْدُ لِلَّهِ
...,...
77424,فِى صُدُورِ
77425,صُدُورِ ٱلنَّاسِ
77426,ٱلنَّاسِ مِنَ
77427,مِنَ ٱلْجِنَّةِ


In [274]:
df_extend_ngram = pd.DataFrame(df_extend_ngram.iloc[:,0].value_counts(ascending=False))
df_extend_ngram.reset_index(inplace=True)
df_extend_ngram.rename(columns={"index":"twogram", "twogram":"frequency"}, inplace=True)
df_extend_ngram

Unnamed: 0,twogram,frequency
0,إِنَّ ٱللَّهَ,205
1,فِى ٱلْأَرْضِ,176
2,ٱلَّذِينَ ءَامَنُوا۟,148
3,ٱلَّذِينَ كَفَرُوا۟,115
4,ٱلسَّمَٰوَٰتِ وَٱلْأَرْضِ,95
...,...,...
52610,ٱلرَّأْىِ وَمَا,1
52611,نَرَىٰ لَكُمْ,1
52612,مِن فَضْلٍۭ,1
52613,فَضْلٍۭ بَلْ,1


In [None]:
#df_extend_ngram.to_excel("Arabic_Word_Twogram_Frequency.xlsx", sheet_name="Arabic_Twogram", index=False)

In [279]:
arabic_simple_word_list =  df_word.iloc[:,3].to_list() # arabic_simple

In [280]:
arabic_simple_word_concat = " ".join(arabic_simple_word_list)
arabic_simple_word_concat

'بسم ٱلله ٱلرحمن ٱلرحيم ٱلحمد لله رب ٱلعلمين ٱلرحمن ٱلرحيم ملك يوم ٱلدين إياك نعبد وإياك نستعين ٱهدنا ٱلصرط ٱلمستقيم صرط ٱلذين أنعمت عليهم غير ٱلمغضوب عليهم ولا ٱلضا لين ال م  ذلك ٱلكتب لا ريب فيه هدى للمتقين ٱلذين يؤمنون بٱلغيب ويقيمون ٱلصلوة ومما رزقنهم ينفقون وٱلذين يؤمنون بما  أنزل إليك وما  أنزل من قبلك وبٱلءاخرة هم يوقنون أول ك على هدى من ربهم وأول ك هم ٱلمفلحون إن ٱلذين كفروا سوا ء عليهم ءأنذرتهم أم لم تنذرهم لا يؤمنون ختم ٱلله على قلوبهم وعلى سمعهم وعلى  أبصرهم غشوة ولهم عذاب عظيم ومن ٱلناس من يقول ءامنا بٱلله وبٱليوم ٱلءاخر وما هم بمؤمنين يخدعون ٱلله وٱلذين ءامنوا وما يخدعون إلا  أنفسهم وما يشعرون فى قلوبهم مرض فزادهم ٱلله مرضا ولهم عذاب أليم بما كانوا يكذبون وإذا قيل لهم لا تفسدوا فى ٱلأرض قالو ا إنما نحن مصلحون ألا  إنهم هم ٱلمفسدون ولكن لا يشعرون وإذا قيل لهم ءامنوا كما  ءامن ٱلناس قالو ا أنؤمن كما  ءامن ٱلسفها ء ألا  إنهم هم ٱلسفها ء ولكن لا يعلمون وإذا لقوا ٱلذين ءامنوا قالو ا ءامنا وإذا خلوا إلى شيطينهم قالو ا إنا معكم إنما نحن مستهزءون ٱلله يستهز بهم ويمدهم فى طغينهم يع

In [281]:
n = 2
n_gram = ngrams(arabic_simple_word_concat.split(), n)
n_gram_list = []
for i in n_gram:
    n_gram_group = " ".join(i)
    n_gram_list.append(n_gram_group)

In [282]:
n_gram_list

['بسم ٱلله',
 'ٱلله ٱلرحمن',
 'ٱلرحمن ٱلرحيم',
 'ٱلرحيم ٱلحمد',
 'ٱلحمد لله',
 'لله رب',
 'رب ٱلعلمين',
 'ٱلعلمين ٱلرحمن',
 'ٱلرحمن ٱلرحيم',
 'ٱلرحيم ملك',
 'ملك يوم',
 'يوم ٱلدين',
 'ٱلدين إياك',
 'إياك نعبد',
 'نعبد وإياك',
 'وإياك نستعين',
 'نستعين ٱهدنا',
 'ٱهدنا ٱلصرط',
 'ٱلصرط ٱلمستقيم',
 'ٱلمستقيم صرط',
 'صرط ٱلذين',
 'ٱلذين أنعمت',
 'أنعمت عليهم',
 'عليهم غير',
 'غير ٱلمغضوب',
 'ٱلمغضوب عليهم',
 'عليهم ولا',
 'ولا ٱلضا',
 'ٱلضا لين',
 'لين ال',
 'ال م',
 'م ذلك',
 'ذلك ٱلكتب',
 'ٱلكتب لا',
 'لا ريب',
 'ريب فيه',
 'فيه هدى',
 'هدى للمتقين',
 'للمتقين ٱلذين',
 'ٱلذين يؤمنون',
 'يؤمنون بٱلغيب',
 'بٱلغيب ويقيمون',
 'ويقيمون ٱلصلوة',
 'ٱلصلوة ومما',
 'ومما رزقنهم',
 'رزقنهم ينفقون',
 'ينفقون وٱلذين',
 'وٱلذين يؤمنون',
 'يؤمنون بما',
 'بما أنزل',
 'أنزل إليك',
 'إليك وما',
 'وما أنزل',
 'أنزل من',
 'من قبلك',
 'قبلك وبٱلءاخرة',
 'وبٱلءاخرة هم',
 'هم يوقنون',
 'يوقنون أول',
 'أول ك',
 'ك على',
 'على هدى',
 'هدى من',
 'من ربهم',
 'ربهم وأول',
 'وأول ك',
 'ك هم',
 'هم ٱلمفلحون',
 'ٱلمفل

In [283]:
df_simple_ngram = pd.DataFrame(n_gram_list, columns=["twogram"])
df_simple_ngram

Unnamed: 0,twogram
0,بسم ٱلله
1,ٱلله ٱلرحمن
2,ٱلرحمن ٱلرحيم
3,ٱلرحيم ٱلحمد
4,ٱلحمد لله
...,...
80259,فى صدور
80260,صدور ٱلناس
80261,ٱلناس من
80262,من ٱلجنة


In [284]:
df_simple_ngram = pd.DataFrame(df_simple_ngram.iloc[:,0].value_counts(ascending=False))
df_simple_ngram.reset_index(inplace=True)
df_simple_ngram.rename(columns={"index":"twogram", "twogram":"frequency"}, inplace=True)
df_simple_ngram

Unnamed: 0,twogram,frequency
0,إن ٱلله,205
1,فى ٱلأرض,176
2,ٱلذين ءامنوا,148
3,ي أيها,142
4,ٱلسموت وٱلأرض,133
...,...,...
49820,ٱلأمر وٱستوت,1
49821,وٱستوت على,1
49822,على ٱلجودى,1
49823,ٱلجودى وقيل,1


In [None]:
#df_simple_ngram.to_excel("Arabic_Simple_Word_Twogram_Frequency.xlsx", sheet_name="Arabic_Twogram", index=False)

##### NLTK N Gram From DataFrame

In [340]:
df_buckwalter = df_word.iloc[:,[0,1,2,5]]
df_buckwalter

Unnamed: 0,num_1,num_2,num_3,buckwalter
0,1,1,1,bisomi
1,1,1,2,{ll~ahi
2,1,1,3,{lr~aHoma`ni
3,1,1,4,{lr~aHiymi
4,1,2,1,{loHamodu
...,...,...,...,...
77424,114,5,4,Suduwri
77425,114,5,5,{ln~aAsi
77426,114,6,1,mina
77427,114,6,2,{lojin~api


In [344]:
df_buckwalter.groupby(["num_1","num_2"])["buckwalter"].apply(" ".join).reset_index()

Unnamed: 0,num_1,num_2,buckwalter
0,1,1,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi
1,1,2,{loHamodu lil~ahi rab~i {loEa`lamiyna
2,1,3,{lr~aHoma`ni {lr~aHiymi
3,1,4,ma`liki yawomi {ld~iyni
4,1,5,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu
...,...,...,...
6231,114,2,maliki {ln~aAsi
6232,114,3,<ila`hi {ln~aAsi
6233,114,4,min $ar~i {lowasowaAsi {loxan~aAsi
6234,114,5,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi


In [345]:
#pd.DataFrame(df_buckwalter.groupby(["num_1","num_2"])["buckwalter"].apply(" ".join)).reset_index()

In [346]:
df_buckwalter_concat = df_buckwalter.groupby(["num_1","num_2"])["buckwalter"].apply(" ".join).reset_index()
df_buckwalter_concat

Unnamed: 0,num_1,num_2,buckwalter
0,1,1,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi
1,1,2,{loHamodu lil~ahi rab~i {loEa`lamiyna
2,1,3,{lr~aHoma`ni {lr~aHiymi
3,1,4,ma`liki yawomi {ld~iyni
4,1,5,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu
...,...,...,...
6231,114,2,maliki {ln~aAsi
6232,114,3,<ila`hi {ln~aAsi
6233,114,4,min $ar~i {lowasowaAsi {loxan~aAsi
6234,114,5,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi


In [347]:
df_buckwalter_concat["arabic"] = df_buckwalter_concat.iloc[:,2].apply(lambda x: transString(x, 1)) # Convert Arabic Extended

In [349]:
df_buckwalter_concat["arabic_simple"] = df_buckwalter_concat.iloc[:,2].apply(lambda x : transStringSimple(x, 1)) # Convert Arabic Simple
df_buckwalter_concat["arabic_simple"] = df_buckwalter_concat.iloc[:,4].apply(lambda x : clean(x))
df_buckwalter_concat

Unnamed: 0,num_1,num_2,buckwalter,arabic,arabic_simple
0,1,1,bisomi {ll~ahi {lr~aHoma`ni {lr~aHiymi,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,بسم ٱلله ٱلرحمن ٱلرحيم
1,1,2,{loHamodu lil~ahi rab~i {loEa`lamiyna,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,ٱلحمد لله رب ٱلعلمين
2,1,3,{lr~aHoma`ni {lr~aHiymi,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,ٱلرحمن ٱلرحيم
3,1,4,ma`liki yawomi {ld~iyni,مَٰلِكِ يَوْمِ ٱلدِّينِ,ملك يوم ٱلدين
4,1,5,<iy~aAka naEobudu wa<iy~aAka nasotaEiynu,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,إياك نعبد وإياك نستعين
...,...,...,...,...,...
6231,114,2,maliki {ln~aAsi,مَلِكِ ٱلنَّاسِ,ملك ٱلناس
6232,114,3,<ila`hi {ln~aAsi,إِلَٰهِ ٱلنَّاسِ,إله ٱلناس
6233,114,4,min $ar~i {lowasowaAsi {loxan~aAsi,مِن شَرِّ ٱلْوَسْوَاسِ ٱلْخَنَّاسِ,من شر ٱلوسواس ٱلخناس
6234,114,5,{l~a*iY yuwasowisu fiY Suduwri {ln~aAsi,ٱلَّذِى يُوَسْوِسُ فِى صُدُورِ ٱلنَّاسِ,ٱلذى يوسوس فى صدور ٱلناس


In [350]:
# Arabic_Extended
n = 2
n_gram_not_list = []
n_gram_list = [] 
for i in df_buckwalter_concat.iloc[:,3]:
    try:
        n_gram = ngrams(i.split(), n)
        for j in n_gram:
            j = " ".join(j)
            n_gram_list.append(j)
    except:
        n_gram_not_list.append(i)

In [362]:
pd.DataFrame(n_gram_not_list,columns=["not_proper"])

Unnamed: 0,not_proper


In [357]:
df_arabic_ngram = pd.DataFrame(n_gram_list,columns=["twogram"])
df_arabic_ngram

Unnamed: 0,twogram
0,بِسْمِ ٱللَّهِ
1,ٱللَّهِ ٱلرَّحْمَٰنِ
2,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
3,ٱلْحَمْدُ لِلَّهِ
4,لِلَّهِ رَبِّ
...,...
71189,يُوَسْوِسُ فِى
71190,فِى صُدُورِ
71191,صُدُورِ ٱلنَّاسِ
71192,مِنَ ٱلْجِنَّةِ


In [356]:
df_arabic_ngram = df_arabic_ngram.iloc[:,0].value_counts(ascending=False).reset_index()
df_arabic_ngram.rename(columns={"index":"twogram", "twogram":"frequency"}, inplace=True)
df_arabic_ngram

Unnamed: 0,twogram,frequency
0,إِنَّ ٱللَّهَ,205
1,فِى ٱلْأَرْضِ,176
2,ٱلَّذِينَ ءَامَنُوا۟,148
3,ٱلَّذِينَ كَفَرُوا۟,115
4,ٱلسَّمَٰوَٰتِ وَٱلْأَرْضِ,95
...,...,...
46846,تَأْوِيلُهُۥ كَذَٰلِكَ,1
46847,قَبْلِهِمْ فَٱنظُرْ,1
46848,مَّن يُؤْمِنُ,1
46849,مَّن لَّا,1


In [None]:
#df_arabic_ngram.to_excel("Arabic_DataFrame_Twogram_Frequency.xlsx", sheet_name="Arabic_Twogram", index=False)

In [358]:
# Arabic_Simple
n = 2
n_gram_not_list = []
n_gram_list = [] 
for i in df_buckwalter_concat.iloc[:,4]:
    try:
        n_gram = ngrams(i.split(), n)
        for j in n_gram:
            j = " ".join(j)
            n_gram_list.append(j)
    except:
        n_gram_not_list.append(i)

In [361]:
pd.DataFrame(n_gram_not_list,columns=["not_proper"])

Unnamed: 0,not_proper


In [359]:
df_arabic_simple_ngram = pd.DataFrame(n_gram_list,columns=["twogram"])
df_arabic_simple_ngram

Unnamed: 0,twogram
0,بسم ٱلله
1,ٱلله ٱلرحمن
2,ٱلرحمن ٱلرحيم
3,ٱلحمد لله
4,لله رب
...,...
74024,يوسوس فى
74025,فى صدور
74026,صدور ٱلناس
74027,من ٱلجنة


In [360]:
df_arabic_simple_ngram = df_arabic_simple_ngram.iloc[:,0].value_counts(ascending=False).reset_index()
df_arabic_simple_ngram.rename(columns={"index":"twogram", "twogram":"frequency"}, inplace=True)
df_arabic_simple_ngram

Unnamed: 0,twogram,frequency
0,إن ٱلله,205
1,فى ٱلأرض,176
2,ٱلذين ءامنوا,148
3,ي أيها,142
4,ٱلسموت وٱلأرض,133
...,...,...
44183,ربكم وشفا,1
44184,ءتكم موعظة,1
44185,ويميت وإليه,1
44186,وٱلأرض ألا,1


In [None]:
#df_arabic_simple_ngram.to_excel("Arabic_Simple_DataFrame_Twogram_Frequency.xlsx", sheet_name="Arabic_Simple_Twogram", index=False)