### Morphology Root Analysis

In [None]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [None]:
# Simplify Buckwalter-encoded text before it is handed to transString.
def clean_ex(text):
    """Return `text` with every character matched by the pattern removed.

    The removed characters are the Buckwalter symbols that `buck2uni` maps to
    short vowels, tanween, sukuun, shaddah, dagger 'alif, the Quranic
    annotation marks, and the extended letters P/J/V/G. All other characters
    are kept as they are.

    Parameters
    ----------
    text : str
        Buckwalter-transliterated text.

    Returns
    -------
    str
        The input with the matched characters deleted.
    """
    # Per the original note, this character set differs from the one used by
    # a `clean` helper (which is not defined in this file).
    strip_pattern = re.compile(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio`~"%-])''')
    return strip_pattern.sub("", text)

In [None]:
# Arabic Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

# Buckwalter symbol -> Arabic Unicode code point (one character on each side).
buck2uni = {"'": "\u0621", # hamza-on-the-line
            "|": "\u0622", # madda
            ">": "\u0623", # hamza-on-'alif
            "&": "\u0624", # hamza-on-waaw
            "<": "\u0625", # hamza-under-'alif
            "}": "\u0626", # hamza-on-yaa'
            "A": "\u0627", # bare 'alif
            "b": "\u0628", # baa'
            "p": "\u0629", # taa' marbuuTa
            "t": "\u062A", # taa'
            "v": "\u062B", # thaa'
            "j": "\u062C", # jiim
            "H": "\u062D", # Haa'
            "x": "\u062E", # khaa'
            "d": "\u062F", # daal
            "*": "\u0630", # dhaal
            "r": "\u0631", # raa'
            "z": "\u0632", # zaay
            "s": "\u0633", # siin
            "$": "\u0634", # shiin
            "S": "\u0635", # Saad
            "D": "\u0636", # Daad
            "T": "\u0637", # Taa'
            "Z": "\u0638", # Zaa' (DHaa')
            "E": "\u0639", # cayn
            "g": "\u063A", # ghayn
            "_": "\u0640", # taTwiil
            "f": "\u0641", # faa'
            "q": "\u0642", # qaaf
            "k": "\u0643", # kaaf
            "l": "\u0644", # laam
            "m": "\u0645", # miim
            "n": "\u0646", # nuun
            "h": "\u0647", # haa'
            "w": "\u0648", # waaw
            "Y": "\u0649", # 'alif maqSuura
            "y": "\u064A", # yaa'
            "F": "\u064B", # fatHatayn
            "N": "\u064C", # Dammatayn
            "K": "\u064D", # kasratayn
            "a": "\u064E", # fatHa
            "u": "\u064F", # Damma
            "i": "\u0650", # kasra
            "~": "\u0651", # shaddah
            "o": "\u0652", # sukuun
            "^": "\u0653", # maddah
            "#": "\u0654", # hamzaabove
            "`": "\u0670", # dagger 'alif
            "{": "\u0671", # waSla
            "P": "\u067E", # arabicletterpeh
            "J": "\u0686", # arabiclettertcheh
            "V": "\u06A4", # arabicletterveh
            "G": "\u06AF", # arabiclettergaf
            ":": "\u06DC", # smallhighseen
            "@": "\u06DF", # smallhighroundedzero
            "\"": "\u06E0", # smallhighuprightrectangularzero
            "[": "\u06E2", # smallhighmeemisolatedform
            ";": "\u06E3", # smalllowseen
            ",": "\u06E5", # smallwaw
            ".": "\u06E6", # smallya
            "!": "\u06E8", # smallhighnoon
            "-": "\u06EA", # emptycentrelowstop
            "+": "\u06EB", # emptycentrehighstop
            "%": "\u06EC", # roundedhighstopwithfilledcentre
            "]": "\u06ED", # smalllowmeem
}

# Translation tables derived once from buck2uni, so that each transString call
# is a single pass over the text instead of one str.replace pass per entry.
# They are rebuilt whenever this cell is re-run after editing buck2uni.
_BUCK2UNI_TABLE = str.maketrans(buck2uni)
_UNI2BUCK_TABLE = str.maketrans({uni: buck for buck, uni in buck2uni.items()})


def transString(string, reverse=0):
    '''Transliterate between Arabic Unicode text and Buckwalter.

    Parameters
    ----------
    string : str
        Text to convert.
    reverse : int or bool, default 0
        Falsy: Arabic Unicode -> Buckwalter.
        Truthy: Buckwalter -> Arabic Unicode.

    Returns
    -------
    str
        The converted text. Characters that have no entry in `buck2uni`
        (for the chosen direction) are passed through unchanged.
    '''
    # Both sides of buck2uni are single, non-overlapping characters, so a
    # character-level translate gives the same result as replacing entry by entry.
    table = _BUCK2UNI_TABLE if reverse else _UNI2BUCK_TABLE
    return string.translate(table)

In [None]:
# Location of the Quranic Corpus morphology text file (absolute path on the author's machine).
morphology_path = "/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quaran/Data/Quranic Corpus Morphology.txt"

# Read the file as UTF-8 into a list with one entry per line; line endings are kept.
with open(morphology_path, mode="r", encoding="utf8") as morphology_file:
    text_line = list(morphology_file)

In [None]:
# Preview only the first few lines; displaying the whole list floods the cell output.
text_line[:10]

In [None]:
# Wrap the raw lines in a one-column frame (column label 0, one row per line).
df_morp_all = pd.DataFrame(data=text_line)
df_morp_all

In [None]:
# Split every raw line on tabs, one output column per field.
# The split is taken from `text_line` rather than from `df_morp_all` itself, so
# re-running this cell gives the same frame instead of re-splitting the first
# column of an already split frame.
df_morp_all = pd.Series(text_line).str.split("\t", expand=True)
df_morp_all

In [None]:
# Remove the trailing newline left over from reading the file line by line.
# The vectorised .str accessor leaves missing values (rows with fewer than four
# tab-separated fields) as missing instead of raising AttributeError.
df_morp_all[3] = df_morp_all[3].str.strip("\n")
df_morp_all

In [None]:
# Use the first parsed row as the column labels, then discard that row.
# NOTE: this cell is not idempotent -- running it twice promotes and drops a data row.
header_row = df_morp_all.iloc[0]
df_morp_all.columns = header_row
df_morp_all = df_morp_all.drop(index=df_morp_all.index[0])

# Strip the surrounding parentheses from the values in the first column.
df_morp_all.iloc[:, 0] = df_morp_all.iloc[:, 0].str.strip("()")
df_morp_all

In [None]:
# Keep only the rows whose FEATURES text mentions a ROOT.
# - regex=False: "ROOT" is a plain substring, no pattern matching needed.
# - na=False: rows with a missing FEATURES value are excluded instead of breaking the mask.
# - .copy(): later cells write new columns into df_morp_root; an explicit copy
#   avoids SettingWithCopyWarning and keeps df_morp_all untouched.
has_root = df_morp_all["FEATURES"].str.contains("ROOT", regex=False, na=False)
df_morp_root = df_morp_all[has_root].copy()
df_morp_root

In [None]:
# Terminate the fourth column's text with "|" so that a ROOT entry at the very end
# of the string is still followed by a delimiter (the ROOT regex in the next step
# requires a closing "|").
# The "|" is appended only when it is not already there, so re-running this cell
# does not keep adding delimiters.
features_text = df_morp_root.iloc[:, 3]
df_morp_root.iloc[:, 3] = features_text.where(features_text.str.endswith("|"), features_text + "|")
df_morp_root

In [None]:
# Collect, for every row, the list of values found between "|ROOT:" and the next "|".
root_pattern = re.compile(r"\|ROOT:(.*?)\|")
df_morp_root["ROOT"] = df_morp_root["FEATURES"].map(root_pattern.findall)
df_morp_root

In [None]:
# Collapse each list of matches in ROOT into a single string (concatenated without a separator).
root_matches = df_morp_root.loc[:, "ROOT"]
df_morp_root.loc[:, "ROOT"] = root_matches.map("".join)
df_morp_root

In [None]:
# Lower-case every column label of df_morp_root.
df_morp_root.columns = [label.lower() for label in df_morp_root.columns]

In [None]:
# Convert the Buckwalter text in the fifth column (position 4) to Arabic Unicode.
buckwalter_roots = df_morp_root.iloc[:, 4]
df_morp_root["root_arabic"] = buckwalter_roots.map(lambda root: transString(root, reverse=1))
df_morp_root

In [None]:
# Break the first column into its ":"-separated parts and label the first four parts.
df_morp_root_nav = (
    df_morp_root.iloc[:, 0]
    .str.split(":", expand=True)
    .rename(columns={0:"num_1",1:"num_2",2:"num_3",3:"num_4"})
)
df_morp_root_nav

In [None]:
# Copy four columns of df_morp_root (positions 1, 2, 4 and 5) into the navigation
# frame under the given names; rows are matched on the shared index.
source_positions = [1, 2, 4, 5]
target_names = ["form","tag","root","root_arabic"]
for target_name, source_position in zip(target_names, source_positions):
    df_morp_root_nav[target_name] = df_morp_root.iloc[:, source_position]
df_morp_root_nav

In [None]:
#df_morp_root_nav.to_excel("Arabic_Root_Morphology.xlsx", sheet_name="Arabic_Root", index=False) 

In [None]:
# Build the "nav" key by joining the first three location parts with ":".
nav_parts = [df_morp_root_nav[part].astype("str") for part in ("num_1", "num_2", "num_3")]
df_morp_root_nav["nav"] = nav_parts[0].str.cat(nav_parts[1:], sep=":")
df_morp_root_nav

In [None]:
# Keep only the word-level columns plus the "nav" key, dropping the num_* parts.
# Selecting by name (rather than by positions 4-8) keeps this cell re-runnable:
# after the first run the frame has only these five columns, so the positional
# selection would raise IndexError on a second run.
df_morp_root_nav = df_morp_root_nav[["form", "tag", "root", "root_arabic", "nav"]]
df_morp_root_nav

In [None]:
#df_morp_root_nav.to_excel("Arabic_Root_Navigation_Morphology.xlsx", sheet_name="Arabic_Root", index=False)

In [None]:
# Count how often each root occurs, most frequent first, as a two-column frame
# ("root", "frequency").
# The column names are set explicitly instead of renaming "index"/"root" after
# reset_index(): pandas >= 2.0 names the value_counts() result "count" and its
# index "root", so the old rename map produced the wrong column names there.
df_morp_root_count = (
    df_morp_root_nav["root"]
    .value_counts(ascending=False)
    .rename_axis("root")
    .reset_index(name="frequency")
)
df_morp_root_count

In [None]:
# Add the Arabic-script form of every root and order the columns as
# root, root_arabic, frequency.
# Ordering by name (instead of by positions [0, 2, 1]) keeps the cell re-runnable:
# the positional version swaps the last two columns back on every second run.
df_morp_root_count = df_morp_root_count.assign(
    root_arabic=df_morp_root_count["root"].map(lambda root: transString(root, reverse=1))
)[["root", "root_arabic", "frequency"]]
df_morp_root_count

In [None]:
#df_morp_root_count.to_excel("Arabic_Root_Frequency_Morphology.xlsx", sheet_name="Arabic_Root_Frequency", index=False)

In [None]:
# Attach the per-root frequency to every word row (inner join on "root").
# Both frames carry a "root_arabic" column, so the result holds it twice with
# pandas' default suffixes: root_arabic_x (left) and root_arabic_y (right).
df_morp_root_nav_freq = df_morp_root_nav.merge(
    right=df_morp_root_count,
    how="inner",
    on="root",
)
df_morp_root_nav_freq

In [None]:
# One row per root: join all of its "nav" keys into a single ", "-separated string.
nav_per_root = (
    df_morp_root_nav_freq
    .groupby(["root","root_arabic_x","frequency"])["nav"]
    .apply(", ".join)
    .reset_index()
)

# Most frequent roots first, with the merge suffix removed from the Arabic column
# and a fresh 0..n-1 index.
df_morp_root_freq_nav_all = (
    nav_per_root
    .sort_values(by="frequency", ascending=False)
    .rename(columns={"root_arabic_x":"root_arabic"})
    .reset_index(drop=True)
)
df_morp_root_freq_nav_all

In [None]:
#df_morp_root_freq_nav_all.to_excel("Arabic_Root_Frequency_And_Nav_Morphology.xlsx", sheet_name="Arabic_Root_Frequency_Nav", index=False)