### Flash Cards Data Analysis

In [10]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [None]:
# buckwalter text clean for simple before transString
def clean_ex(text):
    buckwalter_out = re.sub(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio`~"%-])''', "", text) # difference from clean
    return buckwalter_out

In [None]:
# Arabic Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            "P": u"\u067E", # arabicletterpeh
            "J": u"\u0686", 
            "V": u"\u06A4", # arabicletterveh
            "G": u"\u06AF", # arabiclettergaf
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem           
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [None]:
# Simple Technique
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2unisimple = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "{": u"\u0671", # waSla         
}

def transStringSimple(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2unisimple.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [None]:
# character clean for simple after transStringSimple 
def clean(text):
    arabic_out = re.sub(r'''([PJVG\.:;,!\+\]\[@#FNKauio`~"%-])''', "", text) # .$^*+
    arabic_out2 = re.sub(r"\^", " ", arabic_out)
    return arabic_out2

#### Arabic Word Select

In [11]:
df_arabic_translate = pd.read_excel("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Flash Cards/Data/All_Surah_Translate_File_Concat.xlsx")
df_arabic_translate

Unnamed: 0,surah,surah latin,arabic,transliterate,translate_english,translate_urdu,translate_hindi,translate_indonesian,translate_bangla,translate_turkish,translate_russian,index
0,1,Al-Fatiha,بِسۡمِ,bis'mi,In (the) name,ساتھ نام,साथ नाम,dengan nama,নামে,adıyla,С именем,0
1,1,Al-Fatiha,ٱللَّهِ,al-lahi,(of) Allah,اللہ کے,अल्लाह के,Allah,আল্লাহ (র),Allah'ın,"Аллаха,",1
2,1,Al-Fatiha,ٱلرَّحۡمَٰنِ,al-rahmani,the Most Gracious,جو بے حد مہربان ہے,जो बहुत मेहरबान,Maha Pengasih,পরম করুণাময়,Rahman,"Милостивого,",2
3,1,Al-Fatiha,ٱلرَّحِيمِ,al-rahimi,the Most Merciful,بار بار رحم فرمانے والا ہے,निहायत रहम करने वाला है,Maha Penyayang,অসীম দয়ালু,Rahim,Милосердного!,3
4,1,Al-Fatiha,ٱلۡحَمۡدُ,al-hamdu,All praises and thanks,سب تعریف,सब तारीफ़,pujian,সকল প্রশংসা,hamdolsun,Хвала,4
...,...,...,...,...,...,...,...,...,...,...,...,...
77424,114,An-Nas,صُدُورِ,suduri,(the) breasts,سینوں,सीनों में,dada,অন্তরসমূহের,*,грудях,78244
77425,114,An-Nas,ٱلنَّاسِ,al-nasi,(of) mankind,انسانوں کے,लोगों के,manusia,মানুষের,insanların,"людей,",78245
77426,114,An-Nas,مِنَ,mina,From,سے,*,dari,মধ্য হতে,cinlerden,(будучи) из (числа),78246
77427,114,An-Nas,ٱلۡجِنَّةِ,al-jinati,the jinn,جنوں میں,जिन्नों में से,jin,জিনের,*,джиннов,78247


In [12]:
df_arabic_word_count = df_arabic_translate["arabic"].value_counts(ascending=False).reset_index()
df_arabic_word_count.rename(columns={"index":"arabic","arabic":"arabic_count"}, inplace=True)
df_arabic_word_count = df_arabic_word_count.head(300)
df_arabic_word_count

Unnamed: 0,arabic,arabic_count
0,فِي,1096
1,ٱلَّذِينَ,810
2,مِن,728
3,مَا,709
4,ٱللَّهِ,667
...,...,...
295,بِهِۦٓ,31
296,عَنۡهُ,31
297,أُوتُواْ,31
298,بَنِيٓ,31


In [13]:
df_word_translate_select = df_arabic_translate.iloc[:,[2,3,4,5,6,7,8,9,10]]
df_word_translate_select.drop_duplicates(inplace=True)
df_word_translate_select

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,arabic,transliterate,translate_english,translate_urdu,translate_hindi,translate_indonesian,translate_bangla,translate_turkish,translate_russian
0,بِسۡمِ,bis'mi,In (the) name,ساتھ نام,साथ नाम,dengan nama,নামে,adıyla,С именем
1,ٱللَّهِ,al-lahi,(of) Allah,اللہ کے,अल्लाह के,Allah,আল্লাহ (র),Allah'ın,"Аллаха,"
2,ٱلرَّحۡمَٰنِ,al-rahmani,the Most Gracious,جو بے حد مہربان ہے,जो बहुत मेहरबान,Maha Pengasih,পরম করুণাময়,Rahman,"Милостивого,"
3,ٱلرَّحِيمِ,al-rahimi,the Most Merciful,بار بار رحم فرمانے والا ہے,निहायत रहम करने वाला है,Maha Penyayang,অসীম দয়ালু,Rahim,Милосердного!
4,ٱلۡحَمۡدُ,al-hamdu,All praises and thanks,سب تعریف,सब तारीफ़,pujian,সকল প্রশংসা,hamdolsun,Хвала
...,...,...,...,...,...,...,...,...,...
77424,صُدُورِ,suduri,(the) breasts,سینوں,सीनों में,dada,অন্তরসমূহের,*,грудях
77425,ٱلنَّاسِ,al-nasi,(of) mankind,انسانوں کے,लोगों के,manusia,মানুষের,insanların,"людей,"
77426,مِنَ,mina,From,سے,*,dari,মধ্য হতে,cinlerden,(будучи) из (числа)
77427,ٱلۡجِنَّةِ,al-jinati,the jinn,جنوں میں,जिन्नों में से,jin,জিনের,*,джиннов


In [14]:
df_arabic_count_translate_merge = pd.merge(df_arabic_word_count, df_word_translate_select, how="left", on="arabic")
df_arabic_count_translate_merge.drop_duplicates(inplace=True)
df_arabic_count_translate_merge 

Unnamed: 0,arabic,arabic_count,transliterate,translate_english,translate_urdu,translate_hindi,translate_indonesian,translate_bangla,translate_turkish,translate_russian
0,فِي,1096,fi,In,میں,*,di dalam,মধ্যে আছে,onların kablerinde,В
1,فِي,1096,fi,in,میں,*,di,মধ্যে,yeryüzünde,на
2,فِي,1096,fi,in,ان کی,*,dalam,মধ্যে,içinde,в
3,فِي,1096,fi,in,میں,*,dalam,মধ্যে,içinde,во
4,فِي,1096,fi,in,میں,*,dalam,মধ্যে,içinde,в
...,...,...,...,...,...,...,...,...,...,...
25561,تُكَذِّبَانِ,31,tukadhibani,will you both deny,تم دونوں جھٹلاؤ گے,तुम दोनों झुठलाओगे,kamu berdua mendustakan,উভয়ে অস্বীকার করবে,yalanlıyorsunuz,вы (двое) сочтёте ложным?
25562,تُكَذِّبَانِ,31,tukadhibani,will you both deny,تم دونوں تم دونوں جھٹلاؤ گے,तुम दोनों झुठलाओगे,kamu berdua dustakan,উভয়ে অস্বীকার করবে,yalanlıyorsunuz,вы (двое) сочтёте ложным?
25563,تُكَذِّبَانِ,31,tukadhibani,will you both deny,تم دونوں جھٹلاؤ گے,तुम दोनों झुठलाओगे,kamu berdua dustakan,উভয়ে অস্বীকার করবে,yalanlıyorsunuz,вы (двое) сочтёте ложным?
25564,تُكَذِّبَانِ,31,tukadhibani,will you both deny,تم دونوں جھٹلاؤ گے,तुम दोनों झुठलाओगे,kamu berdua dustakan,উভয়ে অস্বীকার করবে,yalanlıyorsunuz,вы (двое) сочтёте ложным?


In [16]:
df_arabic_count_translate_merge.to_excel("Quran_Arabic_Word_Translate_Selected.xlsx", index=False) 

In [None]:
df_arabic_count_translate_merge["arabic_simple"] = df_arabic_count_translate_merge.loc[:,"arabic"].apply(lambda x : clean_ex(x)) # Convert Arabic Simple
df_arabic_count_translate_merge["arabic_simple"] = df_arabic_count_translate_merge.loc[:,"arabic_simple"].apply(lambda x : transString(x,1))
df_arabic_count_translate_merge

#### Arabic Root With Related Arabic Word

In [None]:
df_master = pd.read_excel("")