In [1]:
import pandas as pd

In [2]:
# Load the gazetteer export of the Onomasticon Arabicum project; the cells
# below read its CATEGORY, TEXT and COUNT columns.
oa_csv_path = '../../OnomasticonArabicum2020/oa2020/ono_gazetteer.csv'
oa = pd.read_csv(oa_csv_path)

Some elements appear in multiple categories; keep each element only once, under the category in which it occurs most frequently.

In [3]:
# Order rows by descending COUNT so that, for a TEXT listed under several
# categories, the highest-count row comes first and is the one kept by
# drop_duplicates (which retains the first occurrence).
oa = oa.sort_values(by='COUNT', ascending=False).drop_duplicates(subset='TEXT')

In [4]:
# Strings glued in front of gazetteer entries to generate attached variants.
prefixes = [
    'ف',
    'و',
]
# Strings glued to the end of entries to generate attached variants.
suffixes = [
    'ه',
    'ت',
    'ها',
]

In [5]:
# EXP entries from the source gazetteer that occur more than twice.
is_frequent_exp = (oa['CATEGORY'] == 'EXP') & (oa['COUNT'] > 2)
exp = oa.loc[is_frequent_exp, 'TEXT'].to_list()

# Hand-picked additions that are appended after the gazetteer entries.
exp += [
    'اسمه', 'ثم', 'يعني', 'من أهل ', 'أهل ال', 'والد', 'الأصل', 'أصله من', 'كان من',
    'الماضي', 'الآتي', 'نسبة', 'يعرف كسلفه ب', 'يعرف ب', 'تعرف ب', 'نزيل', 'كان يسكن ب', 'زوج',
]

# Every prefix combined with every entry first, then the bare entries.
exp_prefixed = []
for prefix in prefixes:
    exp_prefixed += [prefix + elem for elem in exp]
exp_prefixed += exp

# Entries ending in 'به' lose their final character; all others pass through.
og_exp = []
for elem in exp_prefixed:
    if elem.endswith('به'):
        og_exp.append(elem[:-1])
    else:
        og_exp.append(elem)

In [6]:
def _frequent_texts(category):
    """Return the TEXT values of `oa` rows labelled `category` whose COUNT exceeds 2."""
    keep = (oa['CATEGORY'] == category) & (oa['COUNT'] > 2)
    return oa.loc[keep, 'TEXT'].to_list()


# ISM entries plus three hand-picked additions appended at the end.
og_ism = _frequent_texts('ISM') + ['عبد شمس', 'عبد الرحمن', 'ألف']
og_kun = _frequent_texts('KUN')
og_nsb = _frequent_texts('NSB')
og_swm = _frequent_texts('SWM')

In [7]:
# LAQ entries from the source gazetteer that occur more than twice.
laq = oa.loc[(oa['CATEGORY'] == 'LAQ') & (oa['COUNT'] > 2), 'TEXT'].to_list()
# For every entry containing ' الدين', also add its first word on its own with
# 'ال' put in front.
laq.extend(['ال' + entry.split()[0] for entry in laq if ' الدين' in entry])
# De-duplicate while keeping first-seen order. list(set(...)) would order the
# strings by their hash, which Python randomizes per process, so the LAQ rows
# of the exported CSV would be shuffled on every kernel restart.
og_laq = list(dict.fromkeys(laq))

In [8]:
# Words exported under the 'END' label (presumably the words after which the
# name section of an entry stops -- TODO confirm with the gazetteer consumers).
end = ['ولد', 'روى', 'عن', 'يروي', 'تروي', 'سمع', 'حدث', 'قرأ', 'أخذ', 'حكى', 'أخبرنا', 'تفق', 'ذكر', 'وفد',
      'قد', 'توفي', 'مات', 'مولد', 'أجاز', 'أنبأنا', 'قدم']
# Variants in this order: prefix + word, the bare word, word + suffix.
end_prefixed = [prefix + elem for prefix in prefixes for elem in end]
end_prefixed.extend(end)
end_prefixed.extend([elem + suffix for suffix in suffixes for elem in end])
# end_prefixed already contains the bare words, so `end` is not concatenated a
# second time; previously every bare word appeared twice in og_end.
og_end = end_prefixed + ['NASAB']

In [9]:
# (label, entries) pairs in the order in which they are written to the table.
labelled_lists = [
    ('END', og_end),
    ('EXP', og_exp),
    ('ISM', og_ism),
    ('KUN', og_kun),
    ('LAQ', og_laq),
    ('NSB', og_nsb),
    # Female forms of NSB
    ('NSB', [elem + 'ة' for elem in og_nsb if elem.endswith('ي')]),
    ('SWM', og_swm),
]
# Flatten into one list of (CATEGORY, VALUE) tuples.
data = [(label, elem) for label, elems in labelled_lists for elem in elems]

In [10]:
def get_len(value):
    """Return the number of whitespace-separated tokens in the string `value`."""
    tokens = value.split()
    return len(tokens)

In [11]:
# Build the gazetteer table: drop exact (CATEGORY, VALUE) repeats, then record
# how many whitespace-separated tokens each VALUE has.
og_df = (
    pd.DataFrame(data, columns=['CATEGORY', 'VALUE'])
    .drop_duplicates()
    .assign(NGRAM=lambda frame: frame['VALUE'].apply(get_len))
)
# The same table is written to both locations, without the index column.
for target_path in (
    '../eis1600/gazetteers/data/onomastic_gazetteer.csv',
    '../../OnomasticonArabicum2020/oa2020/onomastic_gazetteer.csv',
):
    og_df.to_csv(target_path, index=False)