In [1]:
import json
import pandas as pd
import numpy as np
import ast
import spacy
import random
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

# Mount Google Drive at /content/drive; the CSV/JSON paths used by later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Load the "en_core_web_sm" spaCy pipeline once; later cells reuse `nlp` to tokenize
# entity text and to split passages into sentences.
nlp = spacy.load("en_core_web_sm")

# Fix the random seeds so that stochastic steps repeat across runs
def set_seed(seed: int = 42):
    """Seed both Python's `random` module and NumPy's global RNG with `seed`."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


set_seed(42)

In [3]:
# Read the MedMentions export from Drive and display it
medmentions_csv = '/content/drive/MyDrive/analytics/fewshot_medical/medmentions.csv'
df = pd.read_csv(medmentions_csv)
df

Unnamed: 0,pmid,passages,entities
0,25763772,"[{'type': 'title', 'text': ['DCTN4 as a modifi...","[{'text': ['DCTN4'], 'offsets': [[0, 5]], 'con..."
1,26316050,"[{'type': 'title', 'text': ['Prevascularized s...","[{'text': ['Prevascularized'], 'offsets': [[0,..."
2,26406200,"[{'type': 'title', 'text': ['Seated maximum fl...","[{'text': ['Seated'], 'offsets': [[0, 6]], 'co..."
3,26424709,"[{'type': 'title', 'text': ['The Relationship ...","[{'text': ['Relationship'], 'offsets': [[4, 16..."
4,26476440,"[{'type': 'title', 'text': ['Promoting lifesty...","[{'text': ['Promoting'], 'offsets': [[0, 9]], ..."
...,...,...,...
4387,28545217,"[{'type': 'title', 'text': ['A miniature bird ...","[{'text': ['bird'], 'offsets': [[12, 16]], 'co..."
4388,28545437,"[{'type': 'title', 'text': ['Post endodontic p...","[{'text': ['Post'], 'offsets': [[0, 4]], 'conc..."
4389,28546554,"[{'type': 'title', 'text': ['High throughput r...","[{'text': ['resistance'], 'offsets': [[16, 26]..."
4390,28549399,"[{'type': 'title', 'text': ['A systematic revi...","[{'text': ['systematic review'], 'offsets': [[..."


In [4]:
# Build the lookup table: semantic type id -> type name (surrounding whitespace trimmed)
type_id_map_df = pd.read_csv('/content/drive/MyDrive/analytics/fewshot_medical/semantic_type_ids.csv')
type_id_map = {
    type_id: type_name.strip()
    for type_id, type_name in zip(type_id_map_df['TypeID'], type_id_map_df['TypeName'])
}

# Category integration rules
# Rule 1: If a category's level is deeper than 3, merge it into its parent category
# Rule 2: If a category has fewer than 100 samples, merge it into its parent category regardless of its level
# Rule 3: Repeat rules 1 and 2 until no sub-category has fewer than 100 samples, then delete any topmost category that has fewer than 50 samples

# Convert type id into name
def convert_type_system(type_id_list):
    """Map semantic type ids to their distinct type names, dropping deleted categories.

    Args:
        type_id_list: Iterable of semantic type ids; each one must be a key of the
            module-level `type_id_map`.

    Returns:
        A list of the distinct type names for the given ids, sorted alphabetically.
        Ids whose mapped name is 'for_delete' are left out.

    Raises:
        KeyError: If an id is not present in `type_id_map`.
    """
    names = {type_id_map[type_id] for type_id in type_id_list}
    names.discard('for_delete')
    # Sort instead of returning list(set(...)): the iteration order of a set of
    # strings depends on the per-process hash seed, which would make list
    # comparisons and random picks made on this result irreproducible across runs.
    return sorted(names)

In [None]:
# Hierarchical tagging for named entities that carry multiple semantic types.
# The hierarchy is the PageRank centrality of each type in a graph whose edge
# weights count in how many distinct type combinations two types co-occur.
multiple_semantic_type_id_set = []
seen_combinations = set()  # hashable mirror of the list above for O(1) membership tests

for entities_literal in df['entities']:
    for named_entity in ast.literal_eval(entities_literal):
        # Sort so that the same combination is recognised regardless of element order
        combination = tuple(sorted(convert_type_system(named_entity['semantic_type_id'])))
        if len(combination) > 1 and combination not in seen_combinations:
            seen_combinations.add(combination)
            multiple_semantic_type_id_set.append(list(combination))

node_set = sorted({
    semantic_type
    for combination in multiple_semantic_type_id_set
    for semantic_type in combination
})

# Symmetric co-occurrence counts: every ordered pair of different types in a combination gets +1
matrix = pd.DataFrame(0, index=node_set, columns=node_set)
for combination in multiple_semantic_type_id_set:
    for source_type in combination:
        for target_type in combination:
            if source_type != target_type:
                matrix.loc[source_type, target_type] += 1

# One weighted edge per non-zero cell of the co-occurrence matrix
graph = nx.DiGraph()
for row in matrix.index:
    for col in matrix.columns:
        weight = matrix.loc[row, col]
        if weight > 0:
            graph.add_edge(row, col, weight=weight)

pagerank = nx.pagerank(graph)
# `hierarchy` maps type name -> PageRank score, ordered from highest to lowest score
hierarchy = dict(sorted(pagerank.items(), key=lambda item: item[1], reverse=True))

# Build the table in one go; concatenating one-row frames in a loop is quadratic
# and leaves the numeric column with object dtype.
hierarchy_table = pd.DataFrame(
    {'semantic_type_id': list(hierarchy.keys()), 'pagerank': list(hierarchy.values())},
    columns=['semantic_type_id', 'pagerank'],
)
hierarchy_table['rank'] = hierarchy_table['pagerank'].rank(ascending=False, method='min')

hierarchy_table

In [None]:
# Make the answer sheet (lower-cased, tokenized entity text -> one semantic type),
# resolving multi-type entities with the PageRank hierarchy.
named_entity_to_type_id = {}

for entities_literal in tqdm(df['entities'], total=len(df)):
    for named_entity in ast.literal_eval(entities_literal):
        semantic_type_id_list = convert_type_system(named_entity['semantic_type_id'])
        if len(semantic_type_id_list) == 0:
            continue
        # Only tokenization is needed here, so run the tokenizer alone rather than
        # the full spaCy pipeline.
        named_entity_tokens = nlp.make_doc(named_entity['text'][0])
        named_ent = ' '.join(token.text for token in named_entity_tokens).lower()
        if not named_ent.strip():
            continue
        if len(semantic_type_id_list) > 1:
            # Keep only the type(s) with the highest hierarchy score. The filter builds
            # a new list: removing items from a list while iterating over it skips
            # elements and can let a lower-ranked type survive.
            highest_hierarchy = max(hierarchy[semantic_type] for semantic_type in semantic_type_id_list)
            # Sorted so that a tie is broken reproducibly under the fixed random seed
            semantic_type_id_list = sorted(
                semantic_type
                for semantic_type in semantic_type_id_list
                if hierarchy[semantic_type] == highest_hierarchy
            )

        # If several types still tie for the highest score, pick one at random.
        # A later occurrence of the same entity text overwrites an earlier one.
        if len(semantic_type_id_list) > 1:
            named_entity_to_type_id[named_ent] = random.choice(semantic_type_id_list)
        else:
            named_entity_to_type_id[named_ent] = semantic_type_id_list[0]

# Number of distinct entity strings assigned to each semantic type
named_entity_frequency = {}
for semantic_type in named_entity_to_type_id.values():
    named_entity_frequency[semantic_type] = named_entity_frequency.get(semantic_type, 0) + 1

In [None]:
# Count how many distinct entity strings were assigned to each semantic type
named_entity_type_frequency = {}
for semantic_type in named_entity_to_type_id.values():
    named_entity_type_frequency[semantic_type] = named_entity_type_frequency.get(semantic_type, 0) + 1

# Build the table in one go; concatenating one-row frames in a loop is quadratic
# and leaves the count column with object dtype.
named_entity_frequency_df = pd.DataFrame(
    {
        'semantic_type': list(named_entity_type_frequency.keys()),
        'frequency': list(named_entity_type_frequency.values()),
    },
    columns=['semantic_type', 'frequency'],
)

# Sort by frequency (descending), except that the 'UnknownType' row, if present, goes first.
# A stable sort keeps equally frequent types in a deterministic order.
named_entity_frequency_df = named_entity_frequency_df.sort_values(by=['frequency'], ascending=False, kind='stable')
target_row = named_entity_frequency_df[named_entity_frequency_df['semantic_type'] == 'UnknownType']
remaining_rows = named_entity_frequency_df[named_entity_frequency_df['semantic_type'] != 'UnknownType']
named_entity_frequency_df = pd.concat([target_row, remaining_rows])
named_entity_frequency_df = named_entity_frequency_df.reset_index(drop=True)

named_entity_frequency_df.to_csv('/content/drive/MyDrive/analytics/fewshot_medical/named_entity_type_frequency.csv', index=False)
named_entity_frequency_df

In [None]:
# List of type names: 'UnknownType' first, then the types in the order of the frequency table.
# De-duplicate with dict.fromkeys, which preserves insertion order; list(set(...)) would
# return the names in a hash-seed-dependent order that differs between runs, so the
# order written to type_ids.json would not be reproducible.
type_ids = ['UnknownType']
type_ids += named_entity_frequency_df['semantic_type'].tolist()
type_ids = list(dict.fromkeys(type_ids))
type_ids

In [None]:
# Decide the maximum token span from the distribution of entity lengths (in tokens)
named_entity_len = [len(entity_text.split(' ')) for entity_text in named_entity_to_type_id]

shortest_span, longest_span = min(named_entity_len), max(named_entity_len)
plt.hist(named_entity_len, bins=range(shortest_span, longest_span + 2), edgecolor='black')
plt.title('Histogram of Span of Named Entity')
plt.xlabel('Span of Named Entity')
plt.ylabel('Frequency')
plt.show()

# 10 would be appropriate

In [None]:
# Generate every candidate text span: each run of 1..max_span consecutive tokens that
# stays inside one sentence, taken from the first two passages of every document.
max_span = 10


def _sentence_token_spans(passage_text, max_span):
    """Yield each space-joined run of 1..max_span consecutive tokens within a sentence of `passage_text`."""
    for sentence in nlp(passage_text).sents:
        tokens = [token.text for token in sentence]
        for span_len in range(1, max_span + 1):
            for start in range(len(tokens) + 1 - span_len):
                yield ' '.join(tokens[start:start + span_len])


# Collect straight into a set so repeated spans are never held in memory twice
unique_text_spans = set()
for passages_literal in tqdm(df['passages'], total=len(df)):
    passages = ast.literal_eval(passages_literal)  # parse the stored literal once per document
    text = passages[0]['text'][0]      # first passage (the displayed rows show type 'title')
    abstract = passages[1]['text'][0]  # second passage; presumably the abstract — confirm against the data
    for passage_text in (text, abstract):
        unique_text_spans.update(_sentence_token_spans(passage_text, max_span))

generated_text_span = sorted(unique_text_spans)

In [None]:
# Write the three artifacts produced above to Drive as JSON, one file each
for output_path, payload in (
    ('/content/drive/MyDrive/analytics/fewshot_medical/generated_text_span.json', generated_text_span),
    ('/content/drive/MyDrive/analytics/fewshot_medical/type_ids.json', type_ids),
    ('/content/drive/MyDrive/analytics/fewshot_medical/named_entity_to_type_id.json', named_entity_to_type_id),
):
    with open(output_path, 'w') as f:
        json.dump(payload, f)

In [None]:
# Last cell of the notebook: call Colab's `runtime.unassign()` after the outputs have been written.
# NOTE(review): presumably this disconnects and releases the Colab runtime so it stops
# consuming resources — confirm against the Colab documentation.
from google.colab import runtime

runtime.unassign()