In [24]:
import re
from tqdm import tqdm

In [38]:
import torch

In [25]:
# Read the whole file into memory. The context manager closes the handle
# as soon as the read finishes (or fails) instead of leaving it open for the
# lifetime of the kernel, and the explicit encoding keeps the decoded text
# independent of the platform default.
with open('cl.obo.txt', 'r', encoding='utf8') as f:
    content = f.read()

In [27]:
print(len(content))

7091106


In [16]:
# Split the text into blocks separated by blank lines
term_blocks = content.split('\n\n')

In [17]:
print(len(term_blocks))

10038


In [18]:
print(term_blocks[8])

[Term]
id: BFO:0000024
name: fiat object part
is_a: BFO:0000040 ! material entity


## Parsing each block into a name and its synonyms

In [19]:
def parse_block(term_block):
    """Extract the name and the synonyms from one text block.

    Args:
        term_block: Text of a single block, one ``tag: value`` row per line.

    Returns:
        ``(None, None)`` when the block does not contain the text ``name:``
        at all. Otherwise ``(name, synonyms)`` where ``name`` is the value of
        the first row starting with ``name:`` (``None`` if there is no such
        row) and ``synonyms`` is a list of ``(text, relation)`` tuples with
        ``relation`` being ``'EXACT'`` or ``'RELATED'``.
    """
    if 'name:' not in term_block:
        return None, None
    name = None
    synonyms = []
    # Anchored at the start of the row so that "name: ..." appearing inside
    # another row (for example inside a quoted definition) is not mistaken
    # for a name row.
    name_regx = re.compile(r'^\s*name: (.*)')
    synonym_regx = re.compile(r'"(.*)" (EXACT|RELATED)')
    for line in term_block.splitlines():
        # Here try to parse the name row
        matching_result = name_regx.match(line)
        if matching_result is not None:
            name_new = matching_result.group(1)
            if name is None:
                name = name_new
            elif name_new != name:
                print('-' * 100)
                print('Multiple lines of name in same block!')
                print(term_block)
                # Keep the additional name, stored in the same
                # (text, relation) shape as every other synonym so that
                # consumers indexing entry[0] get the full text rather than
                # its first character.
                if all(text != name_new for text, _ in synonyms):
                    print('add new name to synonyms')
                    synonyms.append((name_new, 'EXACT'))
            continue
        # Here try to parse the synonym row
        matching_result = synonym_regx.search(line)
        if matching_result is not None:
            synonym = matching_result.group(1)
            # group(2) is always 'EXACT' or 'RELATED' when the pattern matches.
            relation = matching_result.group(2)
            if not synonym:
                print('get empty string when parsing synonyms')
            synonyms.append((synonym, relation))
    return name, synonyms

In [20]:
# Parse every block: keep name -> synonym list for the blocks that parse,
# and collect the blocks that do not.
data_dict = {}
block_ignored = []
for term_block in term_blocks:
    name, synonym_list = parse_block(term_block)
    if name is not None and synonym_list is not None:
        # A repeated name only triggers a notice; the later block's
        # synonyms replace the earlier entry.
        if name in data_dict:
            print('different blocks contains same data name')
        data_dict[name] = synonym_list
    else:
        print('-' * 100)
        print('parsing result is none, ignore this block')
        block_ignored.append(term_block)

----------------------------------------------------------------------------------------------------
parsing result is none, ignore this block
----------------------------------------------------------------------------------------------------
Multiple lines of name in same block!
[Term]
id: GO:0005622
name: intracellular
name: intracellular anatomical structure
def: "A component of a cell contained within (but not including) the plasma membrane. In eukaryotes it includes the nucleus and cytoplasm." [ISBN:0198506732]
def: "The living contents of a cell; the matter contained within (but not including) the plasma membrane, usually taken to exclude large vacuoles and masses of secretory or ingested material. In eukaryotes it includes the nucleus and cytoplasm." [ISBN:0198506732]
synonym: "internal to cell" EXACT []
synonym: "intracellular" EXACT []
synonym: "nucleocytoplasm" RELATED [GOC:mah]
synonym: "protoplasm" EXACT []
synonym: "protoplast" RELATED [GOC:mah]
xref: Wikipedia:Intracellu

In [21]:
print(len(data_dict))
print(len(block_ignored))

10032
2


In [35]:
import spacy
import string

# Special marker words/characters.
START_W, END_W = '<w>', '</w>'
START_S, END_S = '<s>', '</s>'
DICT_SW = '</dsw>'
EOL_TK = '»'
UNK = '</unk>'
PAD = '</pad>'
RESERVE_TKS = [START_W, END_W, START_S, END_S, DICT_SW, UNK, PAD, EOL_TK]

# Stop tokens: every ASCII punctuation character plus a short list of
# filler words.
PUCT_SET = set(string.punctuation)
STOP_WORDS = {
    'an', 'the', 'and', 'of', 'in', 'on', 'at',
    'other', 'others', 'The', 'nos', 'NOS',
}
STOP_TOKENS = PUCT_SET.union(STOP_WORDS)

spacy_model = spacy.load("en_core_web_sm")


def preprocess_name(name):
    """Return a lower-cased copy of ``name`` with stop tokens removed.

    The text is run through ``spacy_model`` with the parser, tagger and ner
    components disabled. Tokens that are blank or whose lower-cased form is
    in ``STOP_TOKENS`` are dropped and the rest are joined with single
    spaces. If nothing is left, the original ``name`` is returned
    lower-cased instead.
    """
    doc = spacy_model(name, disable=['parser', 'tagger', 'ner'])
    kept = []
    for token in doc:
        text = str(token)
        if text.strip() == '' or text.lower() in STOP_TOKENS:
            continue
        kept.append(text)
    cleaned = ' '.join(kept)
    if cleaned == '':
        return name.lower()
    return cleaned.lower()

In [36]:
# Vocabulary: every whitespace-separated word of every preprocessed name
# and synonym text.
all_words = set()
punct = '!(),'  # not used in this cell
for name, synos in data_dict.items():
    phrases = [name] + [syno[0] for syno in synos]
    for phrase in phrases:
        all_words.update(preprocess_name(phrase).split())

In [37]:
print(len(all_words))

6422


In [43]:
def load_pretrained_we(file_name, word_set=None):
    """Load word vectors from a text embedding file.

    Each usable line holds a word followed by its vector components; the
    word is separated from the rest by a tab or a space. Raw lines shorter
    than 20 characters are skipped.

    Args:
        file_name: Path of the UTF-8 text file to read.
        word_set: Optional collection of words to keep. ``None`` keeps every
            word found in the file.

    Returns:
        dict mapping each kept word to its vector as a list of floats.

    Raises:
        AssertionError: If a line has no tab/space separator, or if its
            number of fields differs from that of the first loaded line.
    """
    print('Read pretrained embeddings from {}.'.format(file_name))
    dim = None
    emb_dict = {}
    with open(file_name, encoding='utf8') as f:
        for line in tqdm(f):
            if len(line) < 20:
                continue

            # Strip BEFORE locating the separator: the offset is used to
            # slice the stripped line, so computing it on the raw line gives
            # a wrong word prefix whenever the line has leading whitespace.
            line = line.strip()
            if not line:
                # Whitespace-only line: nothing to load.
                continue

            k = line.find('\t')
            if k == -1:
                k = line.find(' ')
            assert k > -1

            # Cheap prefix test first so that lines for unwanted words are
            # never split into fields.
            if word_set is None or line[:k] in word_set:
                values = line.split()
                if dim is None:
                    # The first loaded line fixes the vector dimension.
                    dim = len(values) - 1
                else:
                    assert len(values) == dim + 1
                w = values[0]
                if word_set is None or w in word_set:
                    v = [float(_) for _ in values[-dim:]]
                    emb_dict[w] = v
    return emb_dict

In [44]:
fn = './BioEmb/Emb_SGw.txt'
embed_dict = load_pretrained_we(fn, all_words)

0it [00:00, ?it/s]Read pretrained embeddings from ./BioEmb/Emb_SGw.txt.
1650582it [00:04, 331236.36it/s]


In [45]:
print(len(embed_dict))
print(len(all_words))

5723
6422


0.8911554033011523


In [53]:
# Print every vocabulary word for which embed_dict has no entry (or only an
# empty one), i.e. the words without a pretrained vector.
for word in all_words:
    if not embed_dict.get(word):
        print(word)

tetraspanin-29
has_gene_template
gmlp
lamé
vestibulocochlearis
gr1-high
aromatische
dc.4
atomkern
ventrc
vesiculæ
cavitus
neurofibrarum
t.8sp24int
ciliarus
t(h)-22
multigutturalis
somited
interlobulary
cd11c+cd123-
fcer2a
branching_part_of
pharyngeae
retinachoroid
foramina@fr
precementoblast
erythoblast
hly9-beta
r3g1
germinosum
cuboideometatarsal
coelemic
premacula
musculari
develops_in
cd217-positive
systematis
kohlenstoff
deferen
lymporeticular
chalcogene
genicularum
mf.bm
atrabiliary
brustdruese
dn2b
v(pre)b
t.dp69+.th
preenamel
nkp46-positive
adventitia)(venae
coeruleu
macrophagocytus
chordablast
pret.etp.th
hidrogeno
ckr-7
purkinji
müller
beta7-high
colligens
develops_from_part_of
hypercoracocïde@fr
vdelta6.3-positive
metanephron
protein_coding
metales
haemopoietica
opistonephros
neuaral
cleidooccipital
nk.g2a+.sp
pilosebacea
granuloblast
adventitia)(arteriae
propulsus
lyt10
splen)(lien
nematoblast
periperhal
irx-6
ky411
parathyoid
atomo
in_lateral_side_of
atmungssystem
dc.103
mf

## Pickle the data and embeddings used for training

In [60]:
import pickle
with open('embeddings.pickle','wb') as fe_data_file:
     pickle.dump(embed_dict, fe_data_file)

In [59]:
print(len(embed_dict))

5723


In [61]:
with open('embeddings.pickle','rb') as fe_data_file:
     dict_load_back = pickle.load(fe_data_file)

In [70]:
def isSub(dict1, dict2):
    """Return True if every key/value pair of ``dict1`` is also in ``dict2``.

    Args:
        dict1: Mapping whose items must all be present in ``dict2``.
        dict2: Mapping to check against; it may hold additional keys.

    Returns:
        ``True`` when each key of ``dict1`` exists in ``dict2`` with an equal
        value (trivially true for an empty ``dict1``), ``False`` otherwise.
    """
    for key, value in dict1.items():
        # A missing key means "not a sub-dict", not an error.
        if key not in dict2 or dict2[key] != value:
            return False
    return True

In [71]:
print(isSub(embed_dict, dict_load_back))

True


In [73]:
# Persist the name -> synonyms mapping. pickle.dump always returns None, so
# its result is not assigned: binding it to dict_load_back would overwrite
# the embeddings loaded back earlier with None.
with open('data.pickle','wb') as fe_data_file:
    pickle.dump(data_dict, fe_data_file)

In [6]:
a = [[1, 2], [3, 4], [5 ,6]]
print([number for l in a for number in l])

[1, 2, 3, 4, 5, 6]


In [21]:
import itertools
import random
def generate_data(syno_sets, num_negative_sets=3, seed=None):
    """Build labelled phrase pairs from groups of synonyms.

    For every group, all ordered pairs drawn from that group (including a
    phrase paired with itself) are emitted with label 1. The group is then
    crossed with every phrase of up to ``num_negative_sets`` randomly chosen
    other groups, and those pairs are emitted with label 0.

    Args:
        syno_sets: Sequence of groups; each group is a sequence of phrases.
        num_negative_sets: Maximum number of other groups sampled as
            negatives for each group. Fewer are used when fewer exist.
        seed: Optional seed for reproducible sampling. ``None`` uses the
            module-level ``random`` state.

    Returns:
        dict with three equally long lists under the keys ``'name1'``,
        ``'name2'`` and ``'label'``.
    """
    rng = random if seed is None else random.Random(seed)
    name1_list = []
    name2_list = []
    label_list = []
    for syno_set in syno_sets:
        for name1, name2 in itertools.product(syno_set, repeat=2):
            name1_list.append(name1)
            name2_list.append(name2)
            label_list.append(1)
        # Groups equal in content to the current one are not valid negatives.
        candidates = [x for x in syno_sets if x != syno_set]
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the request at what is available.
        sample_size = max(0, min(num_negative_sets, len(candidates)))
        non_synonym_sets = rng.sample(candidates, sample_size)
        non_synonym_list = [phrase for phrases in non_synonym_sets for phrase in phrases]
        for name1, name2 in itertools.product(syno_set, non_synonym_list):
            name1_list.append(name1)
            name2_list.append(name2)
            label_list.append(0)
    return {'name1': name1_list, 'name2': name2_list, 'label': label_list}


In [8]:
import pandas as pd

pd.DataFrame.from_dict(generate_data(a)).head()

Unnamed: 0,name1,name2,label
0,1,1,1
1,1,2,1
2,2,1,1
3,2,2,1
4,1,3,0


In [31]:
import pickle
from tqdm import tqdm

# Reload the pickled name -> synonym list mapping.
with open('data.pickle', 'rb') as data_file:
    data = pickle.load(data_file)

# One group per entry: its preprocessed synonym texts followed by its
# preprocessed name.
synonym_groups = [
    [preprocess_name(synonym[0]) for synonym in synonyms] + [preprocess_name(name)]
    for name, synonyms in tqdm(data.items())
]

# Expand the groups into labelled pairs and load them into a DataFrame.
df_data = pd.DataFrame.from_dict(generate_data(synonym_groups))

100%|██████████| 10032/10032 [00:00<00:00, 12201.28it/s]


In [32]:
len(df_data[df_data['label'] == 1])

228830

In [34]:
df_data.to_csv('pairwise_data.csv', index = False)

In [35]:
read_data = pd.read_csv('pairwise_data.csv')
read_data.head()

Unnamed: 0,name1,name2,label
0,continuant,continuant,1
1,continuant,promonocyte,0
2,continuant,acropodium wagner,0
3,continuant,digitopodium region,0
4,continuant,extraembryonic venous system,0


In [36]:
df_data.head()

Unnamed: 0,name1,name2,label
0,continuant,continuant,1
1,continuant,promonocyte,0
2,continuant,acropodium wagner,0
3,continuant,digitopodium region,0
4,continuant,extraembryonic venous system,0


In [2]:
from collections import Counter
counter = Counter()
counter.update(['first', 'last'])

In [3]:
print(counter)

Counter({'first': 1, 'last': 1})


In [4]:
counter.update(['second', 'third'])

In [5]:
print(counter)

Counter({'first': 1, 'last': 1, 'second': 1, 'third': 1})
