# Weak-supervised learning models for relation extraction

__Goal:__ Train weak-supervised models, get accuracy level

__Method:__ Test of weak-supervised DL models:
1. Train DL models on CORD-19 dataset
2. Extract relations from papers that was used in test dataset
3. Convert relations to BEL format
4. Compare with relations from covid-19 dataset, calculate accuracy
5. Run error analysis


__Data:__ covid-19-kg dataset, [CORD-19 processed by CoronaWhy](https://console.cloud.google.com/storage/browser/coronawhy/NLPDatasets/)

__Tools:__ [PyTorch](https://pytorch.org/), [OpenNRE](https://github.com/thunlp/OpenNRE), [Snorkel](https://www.snorkel.org/) [PyBEL](https://github.com/pybel/pybel)

__Result:__ Trained weak-supervised models, accuracy of weak-supervised models

In [117]:
#!python3 --version
#!echo $PYTHONPATH
# Update PYTHONPATH, by setting <USERNAME> below.  This is to ensure access to OpenNRE frameworks and models
#!export PYTHONPATH=/home/<USERNAME>/local/lib/python:/home/<USERNAME>/OpenNRE:/usr/local/lib/python3.7/site-packages


In [115]:
#!pip install requests pybel pandas requests indra pybel[jupyter] spacy pyyaml
#python3 -m spacy download en-core-web-sm

In [116]:
import requests
import pybel
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import re
import yaml

import tqdm
from tqdm import tqdm # not sure why you need both

import os
import json
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

#from indra.processors import bel
from indra.sources import bel
from indra.util import batch_iter
from indra.sources import indra_db_rest

### Helper functions
These are copied from other task-vt notebooks (Protein Co-Occurrence)

In [4]:
def load_files(dirname):
    filenames = os.listdir(dirname)
    raw_files = []

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        raw_files.append(file)
    
    return raw_files


def get_all_files(dirname):
    all_files = []
    
    filenames = os.listdir(dirname)

    for filename in tqdm(filenames):
        filename = dirname + filename
        file = json.load(open(filename, 'rb'))
        all_files.append(file)
    
    return all_files

def get_cat_vocab(cat):
    df_cat = df[cat]
    items = df_cat.dropna().tolist()

    vocab_list = []

    for element in items:
        item = element.split(",")
        for e in item:
            vocab_list.append(e)
    
    c = collections.Counter()

    for word in vocab_list:
        c[word] += 1
        
    result_dic = dict(c)
    
    return result_dic



## Train CORD-19 neural relation extraction model 

 ###  Step 1 - Load and Pre-Process CORD-19 Annnotated Data
This dataset is taken from: https://github.com/SciBiteLabs/CORD19.

In [5]:
#load all files
# update rootpath to location of CORD19 data set
root_path='/mount_disk/CORD19/annotated-CORD-19/1.4/CORD19'
#path=os.path.join(root_path, 'benchmark/nyt10/nyt10_rel2id.json')))

dirs = [
    os.path.join(root_path, 'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/')
    #os.path.join(root_path, 'comm_use_subset/comm_use_subset/pdf_json/'),
    #os.path.join(root_path, 'custom_license/custom_license/pdf_json/'),
    #os.path.join(root_path, 'noncomm_use_subset/noncomm_use_subset/pdf_json/')
]

files_stack = []
for dir_ in dirs:
    files = get_all_files(dir_)
    files_stack.append(files)

100%|██████████| 1342/1342 [00:03<00:00, 343.38it/s]


In [6]:
#build list of entities types
# not sure if this blob of code is needed for neural relation extraction
c = collections.Counter()

cat_vocab = []

for files in tqdm(files_stack):
    for file in files:
        for block in file['body_text']:
            dict_file = block['termite_hits'].keys()
            for key in dict_file:
                cat_vocab.append(key)

for word in cat_vocab:
    c[word] += 1
   
vocab_list = (set(list(c.elements())))

100%|██████████| 1/1 [00:00<00:00, 28.52it/s]


In [102]:
#build dataframe: entity mentions by blocks ignoring hint count
features = []
for files in tqdm(files_stack):
    for file in files:
        paper_id = file['paper_id']
        
        i = 0
        sections = ['abstract', 'body_text']
        for section in sections:
            for block in file[section]:

                block_id = section + '_' + str(i)
                
                block_features = []
                block_features.append(paper_id)
                block_features.append(block_id)
                
                termite_hits = block['termite_hits']
                
                block_categories = termite_hits.keys()
                block_categories = list(block_categories)
                for cat in vocab_list:
        
                    if cat in block_categories:
                        cat_entities = []
                        for hit in termite_hits[cat]:
                            entity = hit.get('name')
                            if entity not in cat_entities:
                                cat_entities.append(entity)
                                
                        cat_entities = ",".join(cat_entities)

                    else:
                        cat_entities = None

                    block_features.append(cat_entities)

                features.append(block_features)
                i += 1


col_names = ['paper_id', 'block_id']
for cat in vocab_list:
    col_names.append(cat)
df = pd.DataFrame(features, columns=col_names)
df.head()

100%|██████████| 1/1 [00:00<00:00,  3.43it/s]


Unnamed: 0,paper_id,block_id,INDICATION,CVPROT,COUNTRY,HPO,GENE,SPECIES,GOONTOL,SARSCOV,DRUG
0,96ef1767754a53f792951ba1752440ae94e90c60,abstract_0,,,,,,"Lymphocytic choriomeningitis virus,Viruses",,,
1,96ef1767754a53f792951ba1752440ae94e90c60,body_text_1,,,,,interferon gamma,Viruses,"peptide binding,MHC protein binding,T cell rec...",,
2,50a217a2dacfe1364383ec8c681f64f2fd76dbe7,abstract_0,Coronavirus Infections,,"Korea, Republic of,Italy",,,,,Severe acute respiratory syndrome coronavirus ...,
3,50a217a2dacfe1364383ec8c681f64f2fd76dbe7,abstract_1,Coronavirus Infections,,,,,,,Severe acute respiratory syndrome coronavirus ...,
4,50a217a2dacfe1364383ec8c681f64f2fd76dbe7,abstract_2,Coronavirus Infections,,"Korea, Republic of,China,Italy",,,,,Severe acute respiratory syndrome coronavirus 2,


In [103]:
# explore block categories
#block_categories

#### Explore data that will be used for labelling

In [104]:
#file['body_text'][0]['text'] # ok - so each text is already in a block

### Step 2 - Load Covid19kg - manually annotated kg to get labelling data

This will be used for training data. 


In [105]:
#load graph pre-procesed by Charlie Hoyt: https://github.com/CoronaWhy/bel4corona/tree/master/data/covid19kg
url = 'https://github.com/CoronaWhy/bel4corona/raw/master/data/covid19kg/covid19-fraunhofer-grounded.bel.nodelink.json'
res = requests.get(url)
pybel_graph = pybel.from_nodelink(res.json())

# view graph in jupyter (not displaying)
#jupyter.to_html(pybel_graph)

In [106]:
#!pip install pybel-tools
import pybel_tools
from pybel_tools import summary

In [107]:
# returns a dict of key_val pairs 
edges=pybel_tools.summary.get_edge_relations(pybel_graph)

In [108]:
# Explore edges
#for key,value in edges.items():
#    print(key)
#    print(value)

In [109]:
# Explore relation types
relations_pybel=pybel.struct.summary.count_relations(pybel_graph)
relations=[]
for i in relations_pybel.keys():
    relations.append(i)

#### Load pre-processed covid19 frauenhofer manual annotations
We can actually use processed annotated data in the form of a dataframe for ease of use

In [110]:
# set the correct path location
pybel_pd=pd.read_csv('/home/<USRENAME>/covid19_frauenhofer_annotations.csv')

FileNotFoundError: [Errno 2] File b'/home/<USRENAME>/covid19_frauenhofer_annotations.csv' does not exist: b'/home/<USRENAME>/covid19_frauenhofer_annotations.csv'

In [25]:
# Explore head 
pybel_pd.head(2)

Unnamed: 0.1,Unnamed: 0,sentence,source,relation,target,link,pmc_id,doi_id
0,0,"While blocking TPC2 activity by tetrandrine, a...","{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...",negativeCorrelation,"{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...","{'annotations': {}, 'citation': {'authors': ['...",32221306.0,
1,1,Chemoinformatics searches yielded 15 approved ...,"{'(S)-verapamil': {'namespace': 'chebi', 'name...",negativeCorrelation,"{'hypertension': {'namespace': 'doid', 'name':...","{'annotations': {}, 'citation': {'db': 'DOI', ...",,https://doi.org/10.1101/2020.03.22.002386


### Step3 - Label CORD-19 data with snorkel for Weak Supervision
Create label functions here to label data heuristically

Can also look at distant supervision - that is using a knowledge base (e.g. covid19kg) to label new unseen examples in the corpus.

In [None]:
import snorkel

In [None]:
# Define labeling function.. 
from snorkel.labeling import labeling_function

@labeling_function()
def lf_keyword_my(x):
    """Many spam comments talk about 'my channel', 'my video', etc."""
    return SPAM if "my" in x.text.lower() else ABSTAIN

### Step 4 - Train a Classifer with openNRE


For training
- need to have train,test, validate data (see examples in openNRE/pretrain/wiki80 for format)
- need to use a LM for encoding e.g. BERT of glove 
- need to select model (e.g. CNN)

See example of model training at:

https://github.com/thunlp/OpenNRE/blob/master/example/train_wiki80_bert_softmax.py 

To do:
(1) Get pybel dataset into format of openNRE training dataset (use wiki_train.txt as an example)

(2) Decide which model (e.g. CNN) and sentence embedding LM (e.g. BERT) we will use to train the openNRE classifier. 


Note - start with simple model e.g. CNN, later, we can  try Few Shot Classifier with Meta-Learner, to deal with small amount of labelled data.

https://www.aclweb.org/anthology/D18-1514.pdf

https://github.com/ProKil/FewRel/blob/master/train_demo.py

In [90]:
import opennre

INFO: [2020-06-13 12:24:27] transformers.file_utils - PyTorch version 1.3.0 available.


In [91]:
# Example - check example works
model = opennre.get_model('wiki80_cnn_softmax')
model.infer({'text': 'He was the son of Máel Dúin mac Máele Fithrich, and grandson of the high king Áed Uaridnach (died 612).', 'h': {'pos': (18, 46)}, 't': {'pos': (78, 91)}})

INFO: [2020-06-13 12:26:07] root - Initializing word embedding with word2vec.


('father', 0.7500484585762024)

In [92]:
# Exploration of OpenNRE
# Explore wiki80 training data (to get an idea of training data format)
# Load the file
path='/home/<USERNAME>/.opennre/benchmark/wiki80/wiki80_train.txt'
f = open(path)
data = []
for line in f.readlines():
    line = line.rstrip()
    if len(line) > 0:
        data.append(eval(line))
      

FileNotFoundError: [Errno 2] No such file or directory: '/home/<USERNAME>/.opennre/benchmark/wiki80/wiki80_train.txt'

In [86]:
# explore data
# identify keys in the dictionary required for OpenNRE training data set 
type(data[0]) # data is a list of dicts 
keys=[]
for key in data[0].keys():
    keys.append(key)

print(keys)

['token', 'h', 't', 'relation']


In [94]:
# Explore data
for value in data[0].values():
    print(type(value))

# token is a list of tokens
# h,t is a dict, with keys name,id, pos
# relation is a str

<class 'list'>
<class 'dict'>
<class 'dict'>
<class 'str'>


In [95]:
# Explore wiki_train_data as training_data
wiki_data=pd.DataFrame(data)
wiki_data.columns
wiki_data.shape
wiki_data.head()

Unnamed: 0,token,h,t,relation
0,"[Merpati, flight, 106, departed, Jakarta, (, C...","{'name': 'tjq', 'id': 'Q1331049', 'pos': [16, ...","{'name': 'tanjung pandan', 'id': 'Q3056359', '...",place served by transport hub
1,"[The, name, was, at, one, point, changed, to, ...","{'name': 'east midlands airport', 'id': 'Q8977...","{'name': 'nottingham', 'id': 'Q41262', 'pos': ...",place served by transport hub
2,"[It, is, a, four, -, level, stack, interchange...",{'name': 'fort lauderdale-hollywood internatio...,"{'name': 'fort lauderdale, florida', 'id': 'Q1...",place served by transport hub
3,"[It, is, the, main, alternate, of, Jinnah, Int...","{'name': 'jinnah international airport', 'id':...","{'name': 'karachi', 'id': 'Q8660', 'pos': [10,...",place served by transport hub
4,"[Nearby, airports, include, Akwa, Ibom, Airpor...",{'name': 'margaret ekpo international airport'...,"{'name': 'calabar', 'id': 'Q844091', 'pos': [1...",place served by transport hub


#### Convert pybel dataframe of relations to correct training data format for OpenNRE

In [31]:
pybel_pd.shape # 5236 training points
pybel_pd.head() # sentence, source, target, relation

Unnamed: 0.1,Unnamed: 0,sentence,source,relation,target,link,pmc_id,doi_id
0,0,"While blocking TPC2 activity by tetrandrine, a...","{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...",negativeCorrelation,"{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...","{'annotations': {}, 'citation': {'authors': ['...",32221306.0,
1,1,Chemoinformatics searches yielded 15 approved ...,"{'(S)-verapamil': {'namespace': 'chebi', 'name...",negativeCorrelation,"{'hypertension': {'namespace': 'doid', 'name':...","{'annotations': {}, 'citation': {'db': 'DOI', ...",,https://doi.org/10.1101/2020.03.22.002386
2,2,Thyroid stimulating hormone and free triiodoth...,"{""3,3',5'-triiodothyronine"": {'namespace': 'ch...",negativeCorrelation,"{'COVID-19': {'namespace': 'doid', 'name': 'CO...","{'annotations': {'mesh': {'D044967': True}}, '...",32217556.0,
3,3,"Based on these results, we performed virtual d...","{""4'-epidoxorubicin"": {'namespace': 'chebi', '...",decreases,"{'3.4.22.69': {'namespace': 'eccode', 'name': ...","{'annotations': {}, 'citation': {'authors': ['...",32173287.0,
4,4,Doctors can also use a clinically approved bil...,{'4-methylumbelliferone': {'namespace': 'chebi...,decreases,"{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...","{'annotations': {'mesh': {'D008168': True}}, '...",32205856.0,


In [83]:
# sentence and source
pybel_pd['sentence'][1]
pybel_pd['source'][1]

"{'(S)-verapamil': {'namespace': 'chebi', 'name': '(S)-verapamil', 'identifier': '77736'}}"

In [111]:
#!pip install spacy
#!python3 -m spacy download en_core_web_sm # consider scispacy
# use spacy to process sentence in pybel_pd - to tokenize sentence and get details such as source/target name, and position in text 

import spacy
nlp = spacy.load("en_core_web_sm")

In [62]:
# keys are token, h, t, relation
# token is a list of str tokens (spacy sep?)
# h, t are both dicts with elements keys name,id, pos
# relation is a str of type "relation"

pairs=[]
#for i in range(len(pybel_pd)):
for i in range(3):

    row=pybel_pd.iloc[i]
    sentence=row['sentence']
    sentence_spacy=nlp(sentence)
    token_value=[tok.text for tok in sentence_spacy]
    for token in sentence_spacy:
        print(token.text, token.idx)
        
    pairs.append(token_value) # first element in pairs list
    
    # now get head
    # seem to be in 'chebi' namespace
    head_value_dict=row['source']
    print(head_value_dict)
    
    #json_acceptable_string = head_value_dict.replace("'", "\" ")
    #print(json_acceptable_string)
    #d = json.loads(json_acceptable_string) # json not working 
    
    d = yaml.load(head_value_dict,Loader=yaml.FullLoader)
    for value_dict in d.values():
        head_namespace=value_dict['namespace']
        head_name=value_dict['name']
        head_identifier=dict['identifier']
     
    keys_head_dict= ['name', 'id', 'pos']
    #vals_head_dict=[head_name_value, head_identifier, ]

    #print(head_value_dict)
    #print(d)
    #
    head_value_dict['namespace']
    #dlist = [{k: v} for k, v in zip(keys, pairs)]
    

While 0
blocking 6
TPC2 15
activity 20
by 29
tetrandrine 32
, 43
an 45
inhibitor 48
for 58
TPC237 62
, 68
decreased 70
entry 80
of 86
SARS 89
- 93
CoV-2 94
S 100
pseudovirions 102
( 116
Fig 117
. 120
3f 122
) 124
, 125
treatment 127
of 137
cells 140
with 146
130 151
, 154
a 156
TRPML1 158
inhibitor 165
, 174
had 176
no 180
effect 183
( 190
Supplementary 191
Fig 205
. 208
1 210
) 211
, 212
indicating 214
that 225
TPC2 230
, 234
not 236
TRPML1 240
, 246
is 248
important 251
for 261
SARS 265
- 269
CoV-2 270
entry 276
. 281
{'(+)-Tetrandrine': {'namespace': 'chebi', 'name': '(+)-Tetrandrine', 'identifier': '49'}}


TypeError: 'type' object is not subscriptable

In [112]:
# how to get tok.text if it matches sentence id.. 
print(head_name)
sentence_spacy
for tok in sentence_spacy:
    print(tok.text)
    print(head_name)
    if tok.text == head_name:
        a = tok.idx 

print(a)

(+)-Tetrandrine
While
(+)-Tetrandrine
blocking
(+)-Tetrandrine
TPC2
(+)-Tetrandrine
activity
(+)-Tetrandrine
by
(+)-Tetrandrine
tetrandrine
(+)-Tetrandrine
,
(+)-Tetrandrine
an
(+)-Tetrandrine
inhibitor
(+)-Tetrandrine
for
(+)-Tetrandrine
TPC237
(+)-Tetrandrine
,
(+)-Tetrandrine
decreased
(+)-Tetrandrine
entry
(+)-Tetrandrine
of
(+)-Tetrandrine
SARS
(+)-Tetrandrine
-
(+)-Tetrandrine
CoV-2
(+)-Tetrandrine
S
(+)-Tetrandrine
pseudovirions
(+)-Tetrandrine
(
(+)-Tetrandrine
Fig
(+)-Tetrandrine
.
(+)-Tetrandrine
3f
(+)-Tetrandrine
)
(+)-Tetrandrine
,
(+)-Tetrandrine
treatment
(+)-Tetrandrine
of
(+)-Tetrandrine
cells
(+)-Tetrandrine
with
(+)-Tetrandrine
130
(+)-Tetrandrine
,
(+)-Tetrandrine
a
(+)-Tetrandrine
TRPML1
(+)-Tetrandrine
inhibitor
(+)-Tetrandrine
,
(+)-Tetrandrine
had
(+)-Tetrandrine
no
(+)-Tetrandrine
effect
(+)-Tetrandrine
(
(+)-Tetrandrine
Supplementary
(+)-Tetrandrine
Fig
(+)-Tetrandrine
.
(+)-Tetrandrine
1
(+)-Tetrandrine
)
(+)-Tetrandrine
,
(+)-Tetrandrine
indicating
(+)-Tetra

NameError: name 'a' is not defined

### Example of OpenNRE model training
Our code will look something like this with the frameowork, model and training data customised. 

In [113]:

#https://github.com/thunlp/OpenNRE/blob/master/example/train_nyt10_pcnn_att.py
    
import sys, json, 
import torch
import os
import numpy as np
import opennre
from opennre import encoder, model, framework
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bag_size', type=int, default=0)
args = parser.parse_args()

# Some basic settings
root_path = '.'
if not os.path.exists('ckpt'):
    os.mkdir('ckpt')
ckpt = 'ckpt/nyt10_pcnn_att.pth.tar'

# Check data
opennre.download('nyt10', root_path=root_path)
opennre.download('glove', root_path=root_path)
rel2id = json.load(open(os.path.join(root_path, 'benchmark/nyt10/nyt10_rel2id.json')))
wordi2d = json.load(open(os.path.join(root_path, 'pretrain/glove/glove.6B.50d_word2id.json')))
word2vec = np.load(os.path.join(root_path, 'pretrain/glove/glove.6B.50d_mat.npy'))

# Define the sentence encoder
sentence_encoder = opennre.encoder.PCNNEncoder(
    token2id=wordi2d,
    max_length=120,
    word_size=50,
    position_size=5,
    hidden_size=230,
    blank_padding=True,
    kernel_size=3,
    padding_size=1,
    word2vec=word2vec,
    dropout=0.5
)

# Define the model
model = opennre.model.BagAttention(sentence_encoder, len(rel2id), rel2id)

# Define the whole training framework
framework = opennre.framework.BagRE(
    train_path='benchmark/nyt10/nyt10_train.txt',
    val_path='benchmark/nyt10/nyt10_val.txt',
    test_path='benchmark/nyt10/nyt10_test.txt',
    model=model,
    ckpt=ckpt,
    batch_size=160,
    max_epoch=60,
    lr=0.5,
    weight_decay=0,
    opt='sgd',
    bag_size=args.bag_size)

# Train the model
framework.train_model()

# Test the model
framework.load_state_dict(torch.load(ckpt)['state_dict'])
result = framework.eval_model(framework.test_loader)

# Print the result
print('AUC on test set: {}'.format(result['auc']))

usage: ipykernel_launcher.py [-h] [--bag_size BAG_SIZE]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/lani_lichtenstein/.local/share/jupyter/runtime/kernel-c4d2179c-7d7a-4863-83f8-b81292d6e989.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
