# Weak-supervised learning models for relation extraction

__Goal:__ Train weak-supervised models, get accuracy level

__Method:__ Test of weak-supervised DL models:
1. Train DL models on CORD-19 dataset
2. Extract relations from the papers that were used in the test dataset
3. Convert relations to BEL format
4. Compare with relations from covid-19 dataset, calculate accuracy
5. Run error analysis


__Data:__ covid-19-kg dataset, [CORD-19 processed by CoronaWhy](https://console.cloud.google.com/storage/browser/coronawhy/NLPDatasets/)

__Tools:__ [PyTorch](https://pytorch.org/), [OpenNRE](https://github.com/thunlp/OpenNRE), [Snorkel](https://www.snorkel.org/), [PyBEL](https://github.com/pybel/pybel)

__Result:__ Trained weak-supervised models, accuracy of weak-supervised models

In [None]:
#!python3 --version
#!echo $PYTHONPATH
# Update PYTHONPATH by setting <USERNAME> below. This ensures access to the OpenNRE frameworks and models.
#!export PYTHONPATH=/home/<USERNAME>/local/lib/python:/home/<USERNAME>/OpenNRE:/usr/local/lib/python3.7/site-packages

In [None]:
#!pip install requests pybel pandas requests

In [None]:
import requests
import pybel
import pandas as pd
import matplotlib.pyplot as plt

import re

import tqdm
from tqdm import tqdm # not sure why you need both

import os
import json
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns



### Helper functions
These are copied from other task-vt notebooks (Protein Co-Occurrence)

In [None]:
def load_files(dirname):
    """Parse every entry of a directory as JSON.

    Args:
        dirname: Path of a directory whose entries are all JSON files.
            A trailing path separator is optional.

    Returns:
        list: The parsed content of each entry, in ``os.listdir`` order.
    """
    raw_files = []

    for filename in tqdm(os.listdir(dirname)):
        # os.path.join copes with a missing trailing separator on dirname,
        # which plain string concatenation does not.
        with open(os.path.join(dirname, filename), 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files


def get_all_files(dirname):
    """Parse every entry of a directory as JSON.

    Args:
        dirname: Path of a directory whose entries are all JSON files.
            A trailing path separator is optional.

    Returns:
        list: The parsed content of each entry, in ``os.listdir`` order.
    """
    all_files = []

    for filename in tqdm(os.listdir(dirname)):
        # Join the path properly so dirname works with or without a
        # trailing separator.
        full_path = os.path.join(dirname, filename)
        # The context manager closes each file as soon as it is parsed.
        with open(full_path, 'rb') as handle:
            all_files.append(json.load(handle))

    return all_files

def get_cat_vocab(cat, data=None):
    """Count how often each entity occurs in one category column.

    Every non-null cell of the column is split on "," and each resulting
    piece is counted as one occurrence.

    Args:
        cat: Name of the column to read.
        data: DataFrame holding the column. When omitted, the module-level
            ``df`` is used, which is what the function always did before
            this parameter existed.

    Returns:
        dict: Maps each piece to its number of occurrences, ordered by
        first appearance.
    """
    # Only touch the global when no frame was passed in explicitly.
    frame = df if data is None else data
    cells = frame[cat].dropna().tolist()

    counts = collections.Counter(
        piece for cell in cells for piece in cell.split(",")
    )

    return dict(counts)

#https://www.kaggle.com/rtatman/co-occurrence-matrix-plot-in-python
def df_co_occurrance(df, strain_group):
    """Add one match column per strain to a copy of ``df``.

    Args:
        df: DataFrame with a string column named ``SARS_COV``.
        strain_group: Iterable of patterns; each one becomes a new column
            named after the pattern.

    Returns:
        A copy of ``df`` in which column ``p`` holds the result of
        ``df.SARS_COV.str.contains(p)`` for every ``p`` in ``strain_group``.
        The input frame is left untouched.
    """
    flagged = df.copy()
    for strain in strain_group:
        flagged[strain] = df.SARS_COV.str.contains(strain)
    return flagged


## Train CORD-19 neural relation extraction model 

### Step 1 - Load and Pre-Process CORD-19 Annotated Data
This dataset is taken from: https://github.com/SciBiteLabs/CORD19.

In [None]:
# Load every annotated CORD-19 paper, keeping one list of parsed files per subset.
# Point root_path at the local copy of the annotated CORD-19 data set.
root_path = '/mount_disk/CORD19/annotated-CORD-19/1.4/CORD19'

# Each subset lives under <root_path>/<subset>/<subset>/pdf_json/.
subsets = [
    'biorxiv_medrxiv',
    'comm_use_subset',
    'custom_license',
    'noncomm_use_subset',
]
dirs = [
    os.path.join(root_path, '{0}/{0}/pdf_json/'.format(subset_name))
    for subset_name in subsets
]

# files_stack[k] holds the parsed JSON documents found in dirs[k].
files_stack = []
for dir_ in dirs:
    files = get_all_files(dir_)
    files_stack.append(files)

In [None]:
# Collect the entity-type names (the keys of termite_hits) seen in body_text blocks.
# NOTE(review): the original author was unsure whether this step is needed for
# neural relation extraction.
# NOTE(review): only body_text is scanned here, so an entity type that occurs
# solely in abstracts never makes it into vocab_list - confirm this is intended.
cat_vocab = []

for files in tqdm(files_stack):
    for file in files:
        for block in file['body_text']:
            cat_vocab.extend(block['termite_hits'].keys())

# Number of body_text blocks that mention each entity type.
c = collections.Counter(cat_vocab)

# The distinct entity types.
vocab_list = set(c.elements())

In [None]:
# Build one row per text block: paper id, block id, then for every entity type
# the comma-joined distinct entity names found in that block (None when the
# type is absent). How often a name is hit inside a block is ignored.
features = []
sections = ['abstract', 'body_text']

for files in tqdm(files_stack):
    for file in files:
        paper_id = file['paper_id']

        # The counter runs across both sections of a paper, so body_text
        # numbering continues where the abstract left off.
        i = 0
        for section in sections:
            for block in file[section]:
                block_features = [paper_id, section + '_' + str(i)]
                termite_hits = block['termite_hits']

                for cat in vocab_list:
                    if cat not in termite_hits:
                        block_features.append(None)
                        continue

                    # dict.fromkeys keeps the first occurrence of each name,
                    # in the order the hits are listed.
                    distinct_names = dict.fromkeys(
                        hit.get('name') for hit in termite_hits[cat]
                    )
                    block_features.append(",".join(distinct_names))

                features.append(block_features)
                i += 1


# Column order follows the iteration order of vocab_list, matching the rows above.
col_names = ['paper_id', 'block_id'] + list(vocab_list)
df = pd.DataFrame(features, columns=col_names)
df.head()

#### Explore data that will be used for labelling

In [None]:
# Show the text of the second body_text block of `file`, the variable left
# bound by the loop in the DataFrame-building cell.
file['body_text'][1]['text'] # ok - so each body_text entry is already one block of text

### Step 2 - Label CORD-19 data with snorkel for Weak Supervision
Create label functions here to label data heuristically


In [None]:
import snorkel

### Step 3 - Train a Classifier with OpenNRE

For training:
- need to have train, test, and validation data (see examples in openNRE/pretrain/wiki80 for the format)
- need to use an LM for encoding, e.g. BERT or GloVe
- need to set up the code (see example below)

In [None]:
import opennre

In [None]:
# Smoke test: load a pretrained OpenNRE model and run it on a single sentence.
model = opennre.get_model('wiki80_cnn_softmax')

# 'h' and 't' carry the character spans, within 'text', of the two entities
# whose relation is to be predicted.
example = {
    'text': 'He was the son of Máel Dúin mac Máele Fithrich, and grandson of the high king Áed Uaridnach (died 612).',
    'h': {'pos': (18, 46)},
    't': {'pos': (78, 91)},
}
model.infer(example)

In [None]:
# Exploration of OpenNRE
# Explore the wiki80 training data to get an idea of the training data format.
# The file sits in the .opennre directory of a user's home; resolve it against
# the current user's home instead of hard-coding one particular account.
path = os.path.expanduser('~/.opennre/benchmark/wiki80/wiki80_train.txt')

data = []
# The context manager closes the file even if a line fails to parse.
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip()
        if len(line) > 0:
            # SECURITY: eval() executes whatever the line contains. It is kept
            # because the benchmark stores one Python-literal dict per line,
            # but only run this on a file you trust; for untrusted input use
            # ast.literal_eval (or json.loads if the lines are valid JSON).
            data.append(eval(line))
      

In [None]:
# explore data
# Inspect the first parsed training record. A notebook cell displays only its
# last expression by default, so just the 't' entry is shown.
type(data[0]) # type of one record; the author notes that data is a list of dicts
data[0]['t'] # the 't' (tail) entry of the first record


In [None]:
# Put the parsed wiki80 records into a DataFrame for a quick look.
# A notebook cell displays only its last expression by default; run the
# other lines one at a time to see their output.
wiki_data=pd.DataFrame(data)
wiki_data.columns  # column labels taken from the record keys
wiki_data.shape  # (number of records, number of columns)
wiki_data.iloc[0]  # first record as a Series
wiki_data.head()  # first rows of the table

### Train - use BERT as the model, on our training data

In [None]:

# Adapted from the OpenNRE example script:
# https://github.com/thunlp/OpenNRE/blob/master/example/train_nyt10_pcnn_att.py
# Trains a PCNN sentence encoder with bag-level attention on the NYT10
# benchmark, then reports the AUC on the test split.
# NOTE(review): the section heading mentions BERT on our own data, but this
# cell still trains the PCNN example on NYT10.

import sys
import json
import torch
import os
import numpy as np
import opennre
from opennre import encoder, model, framework
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bag_size', type=int, default=0)
# Inside a notebook, sys.argv holds the kernel's own arguments (for example
# "-f <connection file>"), which parse_args() rejects with SystemExit.
# parse_known_args() ignores them and still honours --bag_size when this
# code runs as a script.
args, _unknown_args = parser.parse_known_args()

# Some basic settings
# NOTE: this rebinds root_path, which the CORD-19 loading cell also uses;
# re-run that cell before loading CORD-19 data again.
root_path = '.'
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs('ckpt', exist_ok=True)
ckpt = 'ckpt/nyt10_pcnn_att.pth.tar'

# Check data
opennre.download('nyt10', root_path=root_path)
opennre.download('glove', root_path=root_path)
# Use context managers so the JSON files are closed once parsed.
with open(os.path.join(root_path, 'benchmark/nyt10/nyt10_rel2id.json')) as rel2id_file:
    rel2id = json.load(rel2id_file)
with open(os.path.join(root_path, 'pretrain/glove/glove.6B.50d_word2id.json')) as word2id_file:
    wordi2d = json.load(word2id_file)
word2vec = np.load(os.path.join(root_path, 'pretrain/glove/glove.6B.50d_mat.npy'))

# Define the sentence encoder
sentence_encoder = opennre.encoder.PCNNEncoder(
    token2id=wordi2d,
    max_length=120,
    word_size=50,
    position_size=5,
    hidden_size=230,
    blank_padding=True,
    kernel_size=3,
    padding_size=1,
    word2vec=word2vec,
    dropout=0.5
)

# Define the model
# NOTE: from here on `model` and `framework` name the instances below and no
# longer the opennre submodules imported above; the submodules stay reachable
# as opennre.model / opennre.framework.
model = opennre.model.BagAttention(sentence_encoder, len(rel2id), rel2id)

# Define the whole training framework
# The data files are resolved against root_path, the same place the download
# calls above were pointed at.
framework = opennre.framework.BagRE(
    train_path=os.path.join(root_path, 'benchmark/nyt10/nyt10_train.txt'),
    val_path=os.path.join(root_path, 'benchmark/nyt10/nyt10_val.txt'),
    test_path=os.path.join(root_path, 'benchmark/nyt10/nyt10_test.txt'),
    model=model,
    ckpt=ckpt,
    batch_size=160,
    max_epoch=60,
    lr=0.5,
    weight_decay=0,
    opt='sgd',
    bag_size=args.bag_size)

# Train the model
framework.train_model()

# Test the model, using the weights stored in the checkpoint file
framework.load_state_dict(torch.load(ckpt)['state_dict'])
result = framework.eval_model(framework.test_loader)

# Print the result
print('AUC on test set: {}'.format(result['auc']))

### Compare with relations from covid19kg dataset

In [None]:
#load graph pre-processed by Charlie Hoyt: https://github.com/CoronaWhy/bel4corona/tree/master/data/covid19kg
url = 'https://github.com/CoronaWhy/bel4corona/raw/master/data/covid19kg/covid19-fraunhofer-grounded.bel.nodelink.json'
res = requests.get(url)