In [None]:
import ast
import copy
import json
import os
import random
import re
from collections import deque

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive (requires a Colab runtime)
drive.mount('/content/drive')

In [None]:
# Root directory for every input and output file of this notebook.
# All paths are built as f'{dir}/<name>', so with the empty default they resolve to
# the filesystem root (e.g. '/medmentions.csv'); set this before running the cells below.
# NOTE(review): the name shadows the builtin dir(); it is kept because every later cell reads it.
dir = ''

In [None]:
# Load the small English spaCy pipeline; the cells below use it to tokenize
# entity mentions and to split titles/abstracts into sentences.
nlp = spacy.load("en_core_web_sm")

# Fix the seeds of the random number generators used in this notebook
def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and NumPy's global generator with ``seed``."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


set_seed(42)

In [None]:
# Read the MedMentions corpus; the 'passages' and 'entities' columns hold
# Python-literal strings that later cells parse with ast.literal_eval
medmentions_csv_path = f'{dir}/medmentions.csv'
df = pd.read_csv(medmentions_csv_path)
df

Unnamed: 0,pmid,passages,entities
0,25763772,"[{'type': 'title', 'text': ['DCTN4 as a modifi...","[{'text': ['DCTN4'], 'offsets': [[0, 5]], 'con..."
1,26316050,"[{'type': 'title', 'text': ['Prevascularized s...","[{'text': ['Prevascularized'], 'offsets': [[0,..."
2,26406200,"[{'type': 'title', 'text': ['Seated maximum fl...","[{'text': ['Seated'], 'offsets': [[0, 6]], 'co..."
3,26424709,"[{'type': 'title', 'text': ['The Relationship ...","[{'text': ['Relationship'], 'offsets': [[4, 16..."
4,26476440,"[{'type': 'title', 'text': ['Promoting lifesty...","[{'text': ['Promoting'], 'offsets': [[0, 9]], ..."
...,...,...,...
4387,28545217,"[{'type': 'title', 'text': ['A miniature bird ...","[{'text': ['bird'], 'offsets': [[12, 16]], 'co..."
4388,28545437,"[{'type': 'title', 'text': ['Post endodontic p...","[{'text': ['Post'], 'offsets': [[0, 4]], 'conc..."
4389,28546554,"[{'type': 'title', 'text': ['High throughput r...","[{'text': ['resistance'], 'offsets': [[16, 26]..."
4390,28549399,"[{'type': 'title', 'text': ['A systematic revi...","[{'text': ['systematic review'], 'offsets': [[..."


In [None]:
# Distribution of entity-mention lengths, measured in spaCy tokens.
# Produces:
#   total_entity_num                            - number of annotated mentions
#   token_length_frequency                      - {token_len: relative frequency}
#   token_length_cumulative_relative_frequency  - {token_len: cumulative relative frequency}
total_entity_num = 0
token_length_counts = {}

for i in tqdm(range(len(df))):
    named_entities = ast.literal_eval(df.loc[i, 'entities'])
    for named_entity in named_entities:
        total_entity_num += 1

        # Preprocess for named entity: drop (), [], {}, <> groups, split on hyphens,
        # collapse whitespace (same cleaning that is applied to the passages later on)
        text = named_entity['text'][0]
        text = re.sub(r'\([^)]*\)', ' ', text)
        text = re.sub(r'\[[^\]]*\]', ' ', text)
        text = re.sub(r'\{[^\}]*\}', ' ', text)
        text = re.sub(r'\<[^>]*\>', ' ', text)
        text = text.replace('-', ' ')
        text = re.sub(r'\s+', ' ', text)

        # len() of a spaCy Doc is its number of tokens
        token_len = len(nlp(text))
        # Plain dict counting; a bare `except:` here would also swallow
        # KeyboardInterrupt and silently reset a counter to 1
        token_length_counts[token_len] = token_length_counts.get(token_len, 0) + 1

# Relative frequency per token length (raw counts stay available in token_length_counts)
token_length_frequency = {
    token_len: count / total_entity_num
    for token_len, count in token_length_counts.items()
}

# Make cumulative relative frequency distribution, accumulated in increasing token length
token_length_cumulative_relative_frequency = {}
cumulative_frequency = 0
for token_len in sorted(token_length_frequency):
    cumulative_frequency += token_length_frequency[token_len]
    token_length_cumulative_relative_frequency[token_len] = cumulative_frequency

token_length_cumulative_relative_frequency

In [None]:
# Category integration rule
# Rule 1: If level is deeper than 3 (or 4), integrate to upper category
# Rule 2: If the category has fewer than 50 samples, then integrate it into the upper category regardless of the level
# Rule 3: Continue rule 1, 2 until there is no sub category less than 100 samples, and delete the utmost category if it has less than 50 samples
# Rule 4: If the utmost category has less than 100 samples, delete it

# Number of target classes; it selects which mapping file is read
class_num = 19
# class_num = 50

# Lookup table built from the mapping file: 'TypeID' -> whitespace-trimmed 'TypeName'
type_id_map_df = pd.read_csv(f'{dir}/semantic_type_ids_{class_num}_way.csv')
type_id_map = {
    type_id: type_name.strip()
    for type_id, type_name in zip(type_id_map_df['TypeID'], type_id_map_df['TypeName'])
}

# Convert type id into name
def convert_type_system(type_id_list):
    """Map semantic type ids to their integrated type names.

    Ids whose mapped name is 'for_delete' are dropped and duplicate names are
    collapsed.

    Args:
        type_id_list: iterable of type ids; every id must be a key of ``type_id_map``.

    Returns:
        Alphabetically sorted list of the unique type names (possibly empty).

    Raises:
        KeyError: if an id is not present in ``type_id_map``.
    """
    type_names = {type_id_map[type_id] for type_id in type_id_list}
    type_names.discard('for_delete')
    # Return a sorted list instead of list(set(...)): the iteration order of a set of
    # strings depends on per-process hash randomization, which made the later
    # random tie-breaking and the list-equality duplicate check irreproducible
    # across kernel restarts even with a fixed seed.
    return sorted(type_names)

In [None]:
# Hierarchical tagging for named entities that carry multiple semantic types.
# The hierarchy is the PageRank centrality of each type in the type co-occurrence graph.
multiple_semantic_type_id_set = []   # one type list per distinct multi-type combination
node_set = []
seen_combinations = set()            # frozensets, so a combination is counted once whatever its order

for i in range(len(df)):
    named_entities = ast.literal_eval(df.loc[i, 'entities'])
    for named_entity in named_entities:
        semantic_type_id_list = convert_type_system(named_entity['semantic_type_id'])
        if len(semantic_type_id_list) <= 1:
            continue
        combination = frozenset(semantic_type_id_list)
        if combination in seen_combinations:
            continue
        seen_combinations.add(combination)
        multiple_semantic_type_id_set.append(semantic_type_id_list)
        node_set.extend(semantic_type_id_list)

node_set = sorted(set(node_set))

# Symmetric co-occurrence counts: +1 for every ordered pair of distinct types
# that appear together in a combination
matrix = pd.DataFrame(0, index=node_set, columns=node_set)
for multiple_semantic_id in multiple_semantic_type_id_set:
    for source_type in multiple_semantic_id:
        for target_type in multiple_semantic_id:
            if source_type != target_type:
                matrix.loc[source_type, target_type] += 1

# Weighted directed graph with one edge per non-zero matrix cell
graph = nx.DiGraph()
for row in matrix.index:
    for col in matrix.columns:
        if matrix.loc[row, col] > 0:
            graph.add_edge(row, col, weight=matrix.loc[row, col])

pagerank = nx.pagerank(graph)
# Type name -> pagerank score, ordered from the highest score to the lowest
hierarchy = dict(sorted(pagerank.items(), key=lambda item: item[1], reverse=True))

# Build the table in one go; growing it with pd.concat from an empty frame is
# quadratic and triggers a pandas FutureWarning
hierarchy_table = pd.DataFrame(list(hierarchy.items()), columns=['semantic_type_id', 'pagerank'])
hierarchy_table['rank'] = hierarchy_table['pagerank'].rank(ascending=False, method='min').astype(int)

hierarchy_table

  hierarchy_table = pd.concat([hierarchy_table, pd.DataFrame({'semantic_type_id': [key], 'pagerank': [value]})], ignore_index=True)


Unnamed: 0,semantic_type_id,pagerank,rank
0,Substance,0.156045,1
1,Occupational Activity,0.12459,2
2,Manufactured Object,0.12456,3
3,Intellectual Product,0.118775,4
4,Idea or Concept,0.117113,5
5,Natural Phenomenon or Process,0.087571,6
6,Anatomical Structure,0.085376,7
7,Phenomenon or Process,0.085376,7
8,Activity,0.050301,9
9,Organization,0.050292,10


In [None]:
# Build the span-typing samples.
# For every sentence of every title/abstract, all spans of 1..MAX_SPAN_LEN tokens are
# enumerated as [context_tokens, span_text], where context_tokens is the sentence with
# the span replaced by MARK_TOKEN. A span whose text equals a preprocessed entity
# mention of the same document is stored under that entity's type; all remaining
# spans are stored under 'UnknownType'.
MAX_SPAN_LEN = 8
MARK_TOKEN = "[MARK_POSITION]"


def _clean_text(text):
    """Delete all texts in (), [], {}, or <>, turn hyphens into spaces, collapse whitespace."""
    text = re.sub(r'\([^)]*\)', ' ', text)
    text = re.sub(r'\[[^\]]*\]', ' ', text)
    text = re.sub(r'\{[^\}]*\}', ' ', text)
    text = re.sub(r'\<[^>]*\>', ' ', text)
    text = text.replace('-', ' ')
    return re.sub(r'\s+', ' ', text)


def _pick_type(semantic_types):
    """Reduce a list of type names to one: highest pagerank wins, ties are broken randomly.

    Returns '' for an empty list. Raises KeyError if a type of a multi-type list
    is missing from ``hierarchy``.
    """
    if not semantic_types:
        return ''
    if len(semantic_types) == 1:
        return semantic_types[0]
    best_score = max(hierarchy[semantic_type] for semantic_type in semantic_types)
    # Filter into a new list; removing from the list while iterating over it
    # skipped elements and let lower-ranked types survive.
    top_types = [t for t in semantic_types if hierarchy[t] == best_score]
    if len(top_types) > 1:
        return random.choice(top_types)
    return top_types[0]


def _span_mark_pairs(text):
    """Return [context_tokens, span_text] for every span of up to MAX_SPAN_LEN tokens per sentence."""
    pairs = []
    for sent in nlp(text).sents:
        tokens = [token.text for token in sent]
        for span_len in range(1, MAX_SPAN_LEN + 1):
            # range() is empty when the sentence is shorter than span_len
            for start in range(len(tokens) - span_len + 1):
                end = start + span_len
                # The slice after the span must use the span position (the body loop
                # previously used the document index here, corrupting the context)
                context = tokens[:start] + [MARK_TOKEN] + tokens[end:]
                pairs.append([context, ' '.join(tokens[start:end])])
    return pairs


type_id_samples_list = {}

for i in tqdm(range(len(df))):
    try:
        passages = ast.literal_eval(df.loc[i, 'passages'])
        title = passages[0]['text'][0]
        body = passages[1]['text'][0]

        # Make sure a non-empty title ends with a period
        stripped_title = title.strip()
        if stripped_title and not stripped_title.endswith('.'):
            title = stripped_title + '.'

        title = _clean_text(title)
        body = _clean_text(body)

        # Unique [preprocessed mention text, type] pairs of this document
        named_entity_pairs = []
        for named_entity in ast.literal_eval(df.loc[i, 'entities']):
            type_id = _pick_type(convert_type_system(named_entity['semantic_type_id']))
            if type_id == '':
                continue

            # Preprocess the mention exactly like the passages, then re-join spaCy
            # tokens so it is comparable with the span texts below
            token_doc = nlp(_clean_text(named_entity['text'][0]))
            processed_text = ' '.join([token.text for token in token_doc])

            named_entity_pair = [processed_text, type_id]
            if named_entity_pair not in named_entity_pairs:
                named_entity_pairs.append(named_entity_pair)

        spans_mark_pairs = _span_mark_pairs(title) + _span_mark_pairs(body)

        # Index spans by their text so matching is a dict lookup instead of a nested scan
        spans_by_text = {}
        for spans_mark_pair in spans_mark_pairs:
            spans_by_text.setdefault(spans_mark_pair[1], []).append(spans_mark_pair)

        matched_texts = set()
        for processed_text, type_id in named_entity_pairs:
            matching_spans = spans_by_text.get(processed_text)
            if matching_spans:
                type_id_samples_list.setdefault(type_id, []).extend(matching_spans)
                matched_texts.add(processed_text)

        # A span is unknown exactly when its text matched no entity mention
        unknowns = [pair for pair in spans_mark_pairs if pair[1] not in matched_texts]
        type_id_samples_list.setdefault('UnknownType', []).extend(unknowns)

    except Exception as err:
        # Best effort: skip documents that cannot be processed (e.g. missing abstract,
        # unmapped type id) but report why; KeyboardInterrupt is no longer swallowed
        print(f'Error at index {i}: {err!r}')
        continue

100%|██████████| 4392/4392 [24:10<00:00,  3.03it/s]


In [None]:
# Frequency of type id of span candidates
type_id_frequency = {}
for key, value in type_id_samples_list.items():
    type_id_frequency[key] = len(value)
type_id_frequency_df = pd.DataFrame(columns=['semantic_type', 'frequency'])
for key, value in type_id_frequency.items():
    type_id_frequency_df = pd.concat([type_id_frequency_df, pd.DataFrame({'semantic_type': [key], 'frequency': [value]})], ignore_index=True)

# Sort named_entity_frequency_df by frequency, except that row 0 should be a row with value 'UnknownType' in column 'semantic_type'
type_id_frequency_df = type_id_frequency_df.sort_values(by=['frequency'], ascending=False)
target_row = type_id_frequency_df[type_id_frequency_df['semantic_type'] == 'UnknownType']
remaining_rows = type_id_frequency_df[type_id_frequency_df['semantic_type'] != 'UnknownType']
type_id_frequency_df = pd.concat([target_row, remaining_rows])
type_id_frequency_df = type_id_frequency_df.reset_index(drop=True)

type_id_frequency_df

Unnamed: 0,semantic_type,frequency
0,UnknownType,6947708
1,Idea or Concept,112603
2,Natural Phenomenon or Process,51014
3,Substance,44679
4,Occupational Activity,40945
5,Anatomical Structure,24858
6,Group,19616
7,Finding,19594
8,Organism,14160
9,Intellectual Product,11092


In [None]:
# Legend of type ids: the unique semantic type names in alphabetical order
type_ids = sorted(set(type_id_frequency_df['semantic_type'].tolist()))

type_ids

['Activity',
 'Anatomical Structure',
 'Behavior',
 'Conceptual Entity',
 'Finding',
 'Group',
 'Idea or Concept',
 'Injury or Poisoning',
 'Intellectual Product',
 'Manufactured Object',
 'Natural Phenomenon or Process',
 'Occupation or Discipline',
 'Occupational Activity',
 'Organism',
 'Organism Attribute',
 'Organization',
 'Phenomenon or Process',
 'Substance',
 'UnknownType']

In [None]:
# Persist the frequency table as CSV and the remaining artifacts as JSON
type_id_frequency_df.to_csv(f'{dir}/type_id_frequency.csv', index=False)

json_outputs = [
    (f'{dir}/type_ids.json', type_ids),
    (f'{dir}/type_id_samples_list.json', type_id_samples_list),
    (f'{dir}/token_length_cumulative_relative_frequency.json', token_length_cumulative_relative_frequency),
]
for json_path, payload in json_outputs:
    with open(json_path, 'w') as f:
        json.dump(payload, f)

In [None]:
# Task / validation / test split.
# For every task ratio, each type's samples are shuffled and cut into
# [task | validation | test]; the test part is whatever remains. The three splits
# are written as JSON dicts keyed by the position of the type in `type_ids`.
task_sample_ratios = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3]
validation_sample_ratio = 0.1
for task_sample_ratio in task_sample_ratios:
    print(f'Task sample ratio: {task_sample_ratio}')
    task_set_type_id_list_of_samples = {}
    validation_set_type_id_list_of_samples = {}
    test_set_type_id_list_of_samples = {}

    for i in tqdm(range(len(type_ids))):
        # Shuffle a shallow copy: shuffling the stored list in place would reorder
        # type_id_samples_list itself on every pass and on every re-run of this cell
        shuffled = list(type_id_samples_list[type_ids[i]])
        random.shuffle(shuffled)

        task_sample_number = int(len(shuffled) * task_sample_ratio)
        validation_sample_number = int(len(shuffled) * validation_sample_ratio)
        validation_end = task_sample_number + validation_sample_number

        task_set_type_id_list_of_samples[i] = shuffled[:task_sample_number]
        validation_set_type_id_list_of_samples[i] = shuffled[task_sample_number:validation_end]
        test_set_type_id_list_of_samples[i] = shuffled[validation_end:]

    # Create the per-ratio output directory if it does not exist yet
    output_dir = f'{dir}/train_test_ratio/{len(type_ids)}_way/{task_sample_ratio}'
    os.makedirs(output_dir, exist_ok=True)

    # Save the three splits into json files
    split_outputs = [
        ('task_set_type_id_list_of_samples.json', task_set_type_id_list_of_samples),
        ('validation_set_type_id_list_of_samples.json', validation_set_type_id_list_of_samples),
        ('test_set_type_id_list_of_samples.json', test_set_type_id_list_of_samples),
    ]
    for file_name, split in split_outputs:
        with open(f'{output_dir}/{file_name}', 'w') as f:
            json.dump(split, f)

In [None]:
from google.colab import runtime

# Unassign the Colab runtime once all cells above have run.
# NOTE(review): presumably this frees the VM and discards in-memory variables, so
# everything needed later must already be written to disk — confirm against the Colab docs.
runtime.unassign()