### Morphology Root Analysis

This notebook was used to derive word roots from the Quran's morphology data.

In [1]:
import pandas as pd
import numpy as np
import re
from camel_tools.utils.charmap import CharMapper
from lang_trans.arabic import buckwalter

In [2]:
# buckwalter text clean for simple before transString
def clean_ex(text):
    buckwalter_out = re.sub(r'''([PJVG\.:;,!\+\^\]\[@#FNKauio`~"%-])''', "", text) # difference from clean
    return buckwalter_out

In [3]:
# Arabic Encoding Extended
# -*- coding: utf-8 -*-

# Arabic Transliteration based on Buckwalter
# dictionary source is buckwalter2unicode.py http://www.redhat.com/archives/fedora-extras-commits/2007-June/msg03617.html 

buck2uni = {"'": u"\u0621", # hamza-on-the-line
            "|": u"\u0622", # madda
            ">": u"\u0623", # hamza-on-'alif
            "&": u"\u0624", # hamza-on-waaw
            "<": u"\u0625", # hamza-under-'alif
            "}": u"\u0626", # hamza-on-yaa'
            "A": u"\u0627", # bare 'alif
            "b": u"\u0628", # baa'
            "p": u"\u0629", # taa' marbuuTa
            "t": u"\u062A", # taa'
            "v": u"\u062B", # thaa'
            "j": u"\u062C", # jiim
            "H": u"\u062D", # Haa'
            "x": u"\u062E", # khaa'
            "d": u"\u062F", # daal
            "*": u"\u0630", # dhaal
            "r": u"\u0631", # raa'
            "z": u"\u0632", # zaay
            "s": u"\u0633", # siin
            "$": u"\u0634", # shiin
            "S": u"\u0635", # Saad
            "D": u"\u0636", # Daad
            "T": u"\u0637", # Taa'
            "Z": u"\u0638", # Zaa' (DHaa')
            "E": u"\u0639", # cayn
            "g": u"\u063A", # ghayn
            "_": u"\u0640", # taTwiil
            "f": u"\u0641", # faa'
            "q": u"\u0642", # qaaf
            "k": u"\u0643", # kaaf
            "l": u"\u0644", # laam
            "m": u"\u0645", # miim
            "n": u"\u0646", # nuun
            "h": u"\u0647", # haa'
            "w": u"\u0648", # waaw
            "Y": u"\u0649", # 'alif maqSuura
            "y": u"\u064A", # yaa'
            "F": u"\u064B", # fatHatayn
            "N": u"\u064C", # Dammatayn
            "K": u"\u064D", # kasratayn
            "a": u"\u064E", # fatHa
            "u": u"\u064F", # Damma
            "i": u"\u0650", # kasra
            "~": u"\u0651", # shaddah
            "o": u"\u0652", # sukuun
            "^": u"\u0653", # maddah
            "#": u"\u0654", # hamzaabove            
            "`": u"\u0670", # dagger 'alif
            "{": u"\u0671", # waSla
            "P": u"\u067E", # arabicletterpeh
            "J": u"\u0686", 
            "V": u"\u06A4", # arabicletterveh
            "G": u"\u06AF", # arabiclettergaf
            ":": u"\u06DC", # smallhighseen
            "@": u"\u06DF", # smallhighroundedzero
            "\"": u"\u06E0", # smallhighuprightrectangularzero
            "[": u"\u06E2", # smallhighmeemisolatedform
            ";": u"\u06E3", # smalllowseen
            ",": u"\u06E5", # smallwaw
            ".": u"\u06E6", # smallya
            "!": u"\u06E8", # smallhighnoon
            "-": u"\u06EA", # emptycentrelowstop
            "+": u"\u06EB", # emptycentrehighstop
            "%": u"\u06EC", # roundedhighstopwithfilledcentre
            "]": u"\u06ED", # smalllowmeem           
}

def transString(string, reverse=0):
    '''Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1'''

    for k, v in buck2uni.items():
      if not reverse:
            string = string.replace(v, k)
      else:
            string = string.replace(k, v)

    return string

In [4]:
with open("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Arabic/Quran/Text/Data/Quranic Corpus Morphology.txt", "r", encoding="utf8") as file:
    text_line = file.readlines()

In [5]:
text_line

['LOCATION\tFORM\tTAG\tFEATURES\n',
 '(1:1:1:1)\tbi\tP\tPREFIX|bi+\n',
 '(1:1:1:2)\tsomi\tN\tSTEM|POS:N|LEM:{som|ROOT:smw|M|GEN\n',
 '(1:1:2:1)\t{ll~ahi\tPN\tSTEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN\n',
 '(1:1:3:1)\t{l\tDET\tPREFIX|Al+\n',
 '(1:1:3:2)\tr~aHoma`ni\tADJ\tSTEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN\n',
 '(1:1:4:1)\t{l\tDET\tPREFIX|Al+\n',
 '(1:1:4:2)\tr~aHiymi\tADJ\tSTEM|POS:ADJ|LEM:r~aHiym|ROOT:rHm|MS|GEN\n',
 '(1:2:1:1)\t{lo\tDET\tPREFIX|Al+\n',
 '(1:2:1:2)\tHamodu\tN\tSTEM|POS:N|LEM:Hamod|ROOT:Hmd|M|NOM\n',
 '(1:2:2:1)\tli\tP\tPREFIX|l:P+\n',
 '(1:2:2:2)\tl~ahi\tPN\tSTEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN\n',
 '(1:2:3:1)\trab~i\tN\tSTEM|POS:N|LEM:rab~|ROOT:rbb|M|GEN\n',
 '(1:2:4:1)\t{lo\tDET\tPREFIX|Al+\n',
 '(1:2:4:2)\tEa`lamiyna\tN\tSTEM|POS:N|LEM:Ea`lamiyn|ROOT:Elm|MP|GEN\n',
 '(1:3:1:1)\t{l\tDET\tPREFIX|Al+\n',
 '(1:3:1:2)\tr~aHoma`ni\tADJ\tSTEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN\n',
 '(1:3:2:1)\t{l\tDET\tPREFIX|Al+\n',
 '(1:3:2:2)\tr~aHiymi\tADJ\tSTEM|POS:ADJ|LEM:r~aHiym

In [6]:
df_morp_all = pd.DataFrame(text_line)
df_morp_all

Unnamed: 0,0
0,LOCATION\tFORM\tTAG\tFEATURES\n
1,(1:1:1:1)\tbi\tP\tPREFIX|bi+\n
2,(1:1:1:2)\tsomi\tN\tSTEM|POS:N|LEM:{som|ROOT:s...
3,(1:1:2:1)\t{ll~ahi\tPN\tSTEM|POS:PN|LEM:{ll~ah...
4,(1:1:3:1)\t{l\tDET\tPREFIX|Al+\n
...,...
128215,(114:6:2:1)\t{lo\tDET\tPREFIX|Al+\n
128216,(114:6:2:2)\tjin~api\tN\tSTEM|POS:N|LEM:jin~ap...
128217,(114:6:3:1)\twa\tCONJ\tPREFIX|w:CONJ+\n
128218,(114:6:3:2)\t{l\tDET\tPREFIX|Al+\n


In [7]:
df_morp_all = df_morp_all[0].str.split("\t", n=-1, expand=True)
df_morp_all

Unnamed: 0,0,1,2,3
0,LOCATION,FORM,TAG,FEATURES\n
1,(1:1:1:1),bi,P,PREFIX|bi+\n
2,(1:1:1:2),somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN\n
3,(1:1:2:1),{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN\n
4,(1:1:3:1),{l,DET,PREFIX|Al+\n
...,...,...,...,...
128215,(114:6:2:1),{lo,DET,PREFIX|Al+\n
128216,(114:6:2:2),jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN\n
128217,(114:6:3:1),wa,CONJ,PREFIX|w:CONJ+\n
128218,(114:6:3:2),{l,DET,PREFIX|Al+\n


In [8]:
df_morp_all[3] = df_morp_all[3].apply(lambda x: x.strip("\n"))
df_morp_all

Unnamed: 0,0,1,2,3
0,LOCATION,FORM,TAG,FEATURES
1,(1:1:1:1),bi,P,PREFIX|bi+
2,(1:1:1:2),somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN
3,(1:1:2:1),{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN
4,(1:1:3:1),{l,DET,PREFIX|Al+
...,...,...,...,...
128215,(114:6:2:1),{lo,DET,PREFIX|Al+
128216,(114:6:2:2),jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN
128217,(114:6:3:1),wa,CONJ,PREFIX|w:CONJ+
128218,(114:6:3:2),{l,DET,PREFIX|Al+


In [9]:
df_morp_all.columns = df_morp_all.iloc[0]
df_morp_all.drop(df_morp_all.index[0], inplace=True)
df_morp_all.iloc[:,0] = df_morp_all.iloc[:,0].apply(lambda x: x.strip("()"))
df_morp_all

Unnamed: 0,LOCATION,FORM,TAG,FEATURES
1,1:1:1:1,bi,P,PREFIX|bi+
2,1:1:1:2,somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN
3,1:1:2:1,{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN
4,1:1:3:1,{l,DET,PREFIX|Al+
5,1:1:3:2,r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN
...,...,...,...,...
128215,114:6:2:1,{lo,DET,PREFIX|Al+
128216,114:6:2:2,jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN
128217,114:6:3:1,wa,CONJ,PREFIX|w:CONJ+
128218,114:6:3:2,{l,DET,PREFIX|Al+


In [10]:
df_morp_root = df_morp_all[df_morp_all.FEATURES.str.contains("ROOT")]
df_morp_root

Unnamed: 0,LOCATION,FORM,TAG,FEATURES
2,1:1:1:2,somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN
3,1:1:2:1,{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN
5,1:1:3:2,r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN
7,1:1:4:2,r~aHiymi,ADJ,STEM|POS:ADJ|LEM:r~aHiym|ROOT:rHm|MS|GEN
9,1:2:1:2,Hamodu,N,STEM|POS:N|LEM:Hamod|ROOT:Hmd|M|NOM
...,...,...,...,...
128209,114:5:2:1,yuwasowisu,V,STEM|POS:V|IMPF|LEM:wasowasa|ROOT:wsws|3MS
128211,114:5:4:1,Suduwri,N,STEM|POS:N|LEM:Sador|ROOT:Sdr|MP|GEN
128213,114:5:5:2,n~aAsi,N,STEM|POS:N|LEM:n~aAs|ROOT:nws|MP|GEN
128216,114:6:2:2,jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN


In [11]:
df_morp_root.iloc[:,3] = df_morp_root.iloc[:,3].apply(lambda x: x+"|")
df_morp_root

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


Unnamed: 0,LOCATION,FORM,TAG,FEATURES
2,1:1:1:2,somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN|
3,1:1:2:1,{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN|
5,1:1:3:2,r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN|
7,1:1:4:2,r~aHiymi,ADJ,STEM|POS:ADJ|LEM:r~aHiym|ROOT:rHm|MS|GEN|
9,1:2:1:2,Hamodu,N,STEM|POS:N|LEM:Hamod|ROOT:Hmd|M|NOM|
...,...,...,...,...
128209,114:5:2:1,yuwasowisu,V,STEM|POS:V|IMPF|LEM:wasowasa|ROOT:wsws|3MS|
128211,114:5:4:1,Suduwri,N,STEM|POS:N|LEM:Sador|ROOT:Sdr|MP|GEN|
128213,114:5:5:2,n~aAsi,N,STEM|POS:N|LEM:n~aAs|ROOT:nws|MP|GEN|
128216,114:6:2:2,jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN|


In [12]:
df_morp_root["ROOT"] = df_morp_root.FEATURES.apply(lambda x: re.findall(r"\|ROOT:(.*?)\|", x))
df_morp_root

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_morp_root["ROOT"] = df_morp_root.FEATURES.apply(lambda x: re.findall(r"\|ROOT:(.*?)\|", x))


Unnamed: 0,LOCATION,FORM,TAG,FEATURES,ROOT
2,1:1:1:2,somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN|,[smw]
3,1:1:2:1,{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN|,[Alh]
5,1:1:3:2,r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN|,[rHm]
7,1:1:4:2,r~aHiymi,ADJ,STEM|POS:ADJ|LEM:r~aHiym|ROOT:rHm|MS|GEN|,[rHm]
9,1:2:1:2,Hamodu,N,STEM|POS:N|LEM:Hamod|ROOT:Hmd|M|NOM|,[Hmd]
...,...,...,...,...,...
128209,114:5:2:1,yuwasowisu,V,STEM|POS:V|IMPF|LEM:wasowasa|ROOT:wsws|3MS|,[wsws]
128211,114:5:4:1,Suduwri,N,STEM|POS:N|LEM:Sador|ROOT:Sdr|MP|GEN|,[Sdr]
128213,114:5:5:2,n~aAsi,N,STEM|POS:N|LEM:n~aAs|ROOT:nws|MP|GEN|,[nws]
128216,114:6:2:2,jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN|,[jnn]


In [13]:
df_morp_root.loc[:,"ROOT"] = df_morp_root.loc[:,"ROOT"].apply(lambda x: "".join(x))
df_morp_root

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,LOCATION,FORM,TAG,FEATURES,ROOT
2,1:1:1:2,somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN|,smw
3,1:1:2:1,{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN|,Alh
5,1:1:3:2,r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN|,rHm
7,1:1:4:2,r~aHiymi,ADJ,STEM|POS:ADJ|LEM:r~aHiym|ROOT:rHm|MS|GEN|,rHm
9,1:2:1:2,Hamodu,N,STEM|POS:N|LEM:Hamod|ROOT:Hmd|M|NOM|,Hmd
...,...,...,...,...,...
128209,114:5:2:1,yuwasowisu,V,STEM|POS:V|IMPF|LEM:wasowasa|ROOT:wsws|3MS|,wsws
128211,114:5:4:1,Suduwri,N,STEM|POS:N|LEM:Sador|ROOT:Sdr|MP|GEN|,Sdr
128213,114:5:5:2,n~aAsi,N,STEM|POS:N|LEM:n~aAs|ROOT:nws|MP|GEN|,nws
128216,114:6:2:2,jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN|,jnn


In [14]:
df_morp_root.columns = map(str.lower,df_morp_root.columns) # OR df_morp_root.columns = [x.lower() for x in df_morp_root.columns]

In [15]:
df_morp_root["root_arabic"] = df_morp_root.iloc[:,4].apply(lambda x: transString(x, 1))
df_morp_root

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_morp_root["root_arabic"] = df_morp_root.iloc[:,4].apply(lambda x: transString(x, 1))


Unnamed: 0,location,form,tag,features,root,root_arabic
2,1:1:1:2,somi,N,STEM|POS:N|LEM:{som|ROOT:smw|M|GEN|,smw,سمو
3,1:1:2:1,{ll~ahi,PN,STEM|POS:PN|LEM:{ll~ah|ROOT:Alh|GEN|,Alh,اله
5,1:1:3:2,r~aHoma`ni,ADJ,STEM|POS:ADJ|LEM:r~aHoma`n|ROOT:rHm|MS|GEN|,rHm,رحم
7,1:1:4:2,r~aHiymi,ADJ,STEM|POS:ADJ|LEM:r~aHiym|ROOT:rHm|MS|GEN|,rHm,رحم
9,1:2:1:2,Hamodu,N,STEM|POS:N|LEM:Hamod|ROOT:Hmd|M|NOM|,Hmd,حمد
...,...,...,...,...,...,...
128209,114:5:2:1,yuwasowisu,V,STEM|POS:V|IMPF|LEM:wasowasa|ROOT:wsws|3MS|,wsws,وسوس
128211,114:5:4:1,Suduwri,N,STEM|POS:N|LEM:Sador|ROOT:Sdr|MP|GEN|,Sdr,صدر
128213,114:5:5:2,n~aAsi,N,STEM|POS:N|LEM:n~aAs|ROOT:nws|MP|GEN|,nws,نوس
128216,114:6:2:2,jin~api,N,STEM|POS:N|LEM:jin~ap|ROOT:jnn|F|GEN|,jnn,جنن


In [16]:
df_morp_root_nav = df_morp_root.iloc[:,0].str.split(":", n=-1, expand=True)
df_morp_root_nav.rename(columns={0:"num_1",1:"num_2",2:"num_3",3:"num_4"}, inplace=True)
df_morp_root_nav

Unnamed: 0,num_1,num_2,num_3,num_4
2,1,1,1,2
3,1,1,2,1
5,1,1,3,2
7,1,1,4,2
9,1,2,1,2
...,...,...,...,...
128209,114,5,2,1
128211,114,5,4,1
128213,114,5,5,2
128216,114,6,2,2


In [17]:
df_morp_root_nav[["form","tag","root","root_arabic"]] = df_morp_root.iloc[:,[1,2,4,5]]
df_morp_root_nav

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag,root,root_arabic
2,1,1,1,2,somi,N,smw,سمو
3,1,1,2,1,{ll~ahi,PN,Alh,اله
5,1,1,3,2,r~aHoma`ni,ADJ,rHm,رحم
7,1,1,4,2,r~aHiymi,ADJ,rHm,رحم
9,1,2,1,2,Hamodu,N,Hmd,حمد
...,...,...,...,...,...,...,...,...
128209,114,5,2,1,yuwasowisu,V,wsws,وسوس
128211,114,5,4,1,Suduwri,N,Sdr,صدر
128213,114,5,5,2,n~aAsi,N,nws,نوس
128216,114,6,2,2,jin~api,N,jnn,جنن


In [18]:
df_morp_root_nav.to_excel("Arabic_Buckwalter_Form_Root_Morphology.xlsx", sheet_name="Arabic_Buckwalter_Form_Root", index=False) 

In [19]:
df_morp_root_nav["nav"] = df_morp_root_nav["num_1"].astype("str")+":"+df_morp_root_nav["num_2"].astype("str")+":"+df_morp_root_nav["num_3"].astype("str")
df_morp_root_nav

Unnamed: 0,num_1,num_2,num_3,num_4,form,tag,root,root_arabic,nav
2,1,1,1,2,somi,N,smw,سمو,1:1:1
3,1,1,2,1,{ll~ahi,PN,Alh,اله,1:1:2
5,1,1,3,2,r~aHoma`ni,ADJ,rHm,رحم,1:1:3
7,1,1,4,2,r~aHiymi,ADJ,rHm,رحم,1:1:4
9,1,2,1,2,Hamodu,N,Hmd,حمد,1:2:1
...,...,...,...,...,...,...,...,...,...
128209,114,5,2,1,yuwasowisu,V,wsws,وسوس,114:5:2
128211,114,5,4,1,Suduwri,N,Sdr,صدر,114:5:4
128213,114,5,5,2,n~aAsi,N,nws,نوس,114:5:5
128216,114,6,2,2,jin~api,N,jnn,جنن,114:6:2


In [20]:
df_morp_root_nav = df_morp_root_nav.iloc[:,[4,5,6,7,8]]
df_morp_root_nav

Unnamed: 0,form,tag,root,root_arabic,nav
2,somi,N,smw,سمو,1:1:1
3,{ll~ahi,PN,Alh,اله,1:1:2
5,r~aHoma`ni,ADJ,rHm,رحم,1:1:3
7,r~aHiymi,ADJ,rHm,رحم,1:1:4
9,Hamodu,N,Hmd,حمد,1:2:1
...,...,...,...,...,...
128209,yuwasowisu,V,wsws,وسوس,114:5:2
128211,Suduwri,N,Sdr,صدر,114:5:4
128213,n~aAsi,N,nws,نوس,114:5:5
128216,jin~api,N,jnn,جنن,114:6:2


In [21]:
df_morp_root_nav.to_excel("Arabic_Buckwalter_Form_Root_Navigation_Morphology.xlsx", sheet_name="Arabic_Buckwalter_Form_Root_Nav", index=False)

In [22]:
df_morp_root_count = pd.DataFrame(df_morp_root_nav.root.value_counts(ascending=False))
df_morp_root_count.reset_index(inplace=True)
df_morp_root_count.rename(columns={"index":"root","root":"frequency"}, inplace=True)
df_morp_root_count

Unnamed: 0,root,frequency
0,Alh,2851
1,qwl,1722
2,kwn,1390
3,rbb,980
4,Amn,879
...,...,...
1637,h$$,1
1638,nEl,1
1639,xlE,1
1640,vry,1


In [23]:
df_morp_root_count["root_arabic"] = df_morp_root_count["root"].apply(lambda x: transString(x, 1))
df_morp_root_count = df_morp_root_count.iloc[:,[0,2,1]]
df_morp_root_count

Unnamed: 0,root,root_arabic,frequency
0,Alh,اله,2851
1,qwl,قول,1722
2,kwn,كون,1390
3,rbb,ربب,980
4,Amn,امن,879
...,...,...,...
1637,h$$,هشش,1
1638,nEl,نعل,1
1639,xlE,خلع,1
1640,vry,ثري,1


In [24]:
df_morp_root_count.to_excel("Arabic_Buckwalter_Root_Freq_Morphology.xlsx", sheet_name="Arabic_Buckwalter_Root_Freq", index=False)

In [25]:
df_morp_root_count_200 = df_morp_root_count.head(200)
df_morp_root_count_200

Unnamed: 0,root,root_arabic,frequency
0,Alh,اله,2851
1,qwl,قول,1722
2,kwn,كون,1390
3,rbb,ربب,980
4,Amn,امن,879
...,...,...,...
195,ndw,ندو,53
196,xbr,خبر,52
197,DEf,ضعف,52
198,Hll,حلل,51


In [26]:
df_morp_root_count_200.to_excel("Arabic_Buckwalter_200_Root_Freq_Morphology.xlsx", sheet_name="Arabic_Buckwalter_Root_Freq", index=False)

In [27]:
df_morp_root_nav_freq = pd.merge(df_morp_root_nav,df_morp_root_count,how="inner",on="root")
df_morp_root_nav_freq

Unnamed: 0,form,tag,root,root_arabic_x,nav,root_arabic_y,frequency
0,somi,N,smw,سمو,1:1:1,سمو,381
1,s~amaA^'i,N,smw,سمو,2:19:4,سمو,381
2,s~amaA^'a,N,smw,سمو,2:22:6,سمو,381
3,s~amaA^'i,N,smw,سمو,2:22:10,سمو,381
4,s~amaA^'i,N,smw,سمو,2:29:12,سمو,381
...,...,...,...,...,...,...,...
49963,m~asadK],N,msd,مسد,111:5:5,مسد,1
49964,S~amadu,N,Smd,صمد,112:2:2,صمد,1
49965,kufuwFA,N,kfA,كفا,112:4:4,كفا,1
49966,waqaba,V,wqb,وقب,113:3:5,وقب,1


In [28]:
df_morp_root_freq_nav_all = df_morp_root_nav_freq.groupby(["root","root_arabic_x","frequency"])["nav"].apply(", ".join).reset_index().sort_values(by="frequency", ascending=False)
df_morp_root_freq_nav_all.rename(columns={"root_arabic_x":"root_arabic"}, inplace=True)
df_morp_root_freq_nav_all.reset_index(drop=True, inplace=True)
df_morp_root_freq_nav_all

Unnamed: 0,root,root_arabic,frequency,nav
0,Alh,اله,2851,"1:1:2, 1:2:2, 2:7:2, 2:8:6, 2:9:2, 2:10:5, 2:1..."
1,qwl,قول,1722,"2:8:4, 2:11:2, 2:11:8, 2:13:2, 2:13:8, 2:14:5,..."
2,kwn,كون,1390,"2:10:11, 2:16:10, 2:23:2, 2:23:19, 2:28:4, 2:3..."
3,rbb,ربب,980,"1:2:3, 2:5:5, 2:21:4, 2:26:19, 2:30:3, 2:37:4,..."
4,Amn,امن,879,"2:3:2, 2:4:2, 2:6:11, 2:8:5, 2:8:11, 2:9:4, 2:..."
...,...,...,...,...
1637,HSHS,حصحص,1,12:51:21
1638,kft,كفت,1,77:25:4
1639,rmd,رمد,1,14:18:6
1640,rmH,رمح,1,5:94:11


In [29]:
df_morp_root_freq_nav_all.to_excel("Arabic_Buckwalter_Root_Freq_And_Nav_Morphology.xlsx", sheet_name="Arabic_Buckwalter_Root_Freq_Nav", index=False)