# Gazetteer Maker

## Introduction

    The main goal of this code is to transform tables into tables of gazetteers for each ontology aligned with PropaPhen. Those gazetteers are going to be used in the "Detection" module to find the equivalent Named Entities.

In [69]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Libraries

### Standard

In [70]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Globals

In [71]:
path_to_atoms = "data/tables/MRAUI.processed.csv"
path_to_concepts = "data/tables/MRCONSO.processed.csv"
path_to_semantics = "data/tables/SRDEF.processed.csv"
path_to_world = "data/tables/worldkg_nodes.csv"

## Tables to Gazetteers

In [72]:
df = pd.DataFrame(columns=['ID', 'Name'])

### UMLS

In [73]:
concept_df = pd.read_csv(path_to_concepts)

In [74]:
df = pd.concat([df, 
           concept_df[["CUI:ID","name"]].rename(columns={"CUI:ID": "ID", 
                                                         "name": "Name"})], axis=0,ignore_index=True)

In [75]:
atom_df = pd.read_csv(path_to_atoms)

In [76]:
df = pd.concat([df, 
           atom_df[["AUI:ID","name"]].rename(columns={"AUI:ID": "ID", 
                                                         "name": "Name"})], axis=0,ignore_index=True)

In [77]:
stype_df = pd.read_csv(path_to_semantics)

In [78]:
df = pd.concat([df, 
           stype_df[["STY:ID","name"]].rename(columns={"STY:ID": "ID", 
                                                         "name": "Name"})], axis=0,ignore_index=True)

In [84]:
df.to_csv('data/gazetteers/kbgazetteer.csv')

### WorldKG

In [79]:
world_df = pd.read_csv(path_to_world)

  world_df = pd.read_csv(path_to_world)


In [80]:
world_kg_id = "id:ID"
list_of_names = []
for k in world_df.columns:
    if "Name" in k or "name" in k:
        list_of_names.append(k)

In [81]:
world_df_filtered = world_df[[world_kg_id]+list_of_names]

In [82]:
world_df_filtered.shape

(1294509, 50)

In [83]:
world_df_final = pd.DataFrame(columns=['ID', 'Name'])
for _, row in tqdm(world_df_filtered.iterrows()):
    for nametype in list_of_names:
        if not pd.isna(row[nametype]):
            rowdataframe = pd.DataFrame({'ID': [row["id:ID"]],
                   'Name': [row[nametype]]})
            world_df_final = pd.concat([world_df_final, 
               rowdataframe], axis=0,ignore_index=True)

1294509it [20:13:23, 17.78it/s]


In [86]:
world_df_final.to_csv('data/gazetteers/world_gazetteer.csv')