# Training Roberta for relation classification
This is a notebook based on the code from https://www.kaggle.com/xhlulu/jigsaw-tpu-xlm-roberta. Our goal is to train a binary classification model to determine if drug-treatments are relevant.

In [0]:
!pip install transformers

In [0]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

In [0]:
# this data was made with the make-train notebook
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_train_data.csv drug_train_data.csv
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_dev_data.csv drug_dev_data.csv 
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_test_data.csv drug_test_data.csv
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_train_data.csv synth_train_data.csv
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_dev_data.csv synth_dev_data.csv 
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_test_data.csv synth_test_data.csv

Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_train_data.csv...
/ [1 files][ 50.6 KiB/ 50.6 KiB]                                                
Operation completed over 1 objects/50.6 KiB.                                     
Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_dev_data.csv...
- [1 files][  1.5 KiB/  1.5 KiB]                                                
Operation completed over 1 objects/1.5 KiB.                                      
Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_test_data.csv...
/ [1 files][ 35.2 KiB/ 35.2 KiB]                                                
Operation completed over 1 objects/35.2 KiB.                                     
Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_train_data.csv...
- [1 files][386.1 KiB/386.1 KiB]                                                
Operation completed over 1 objects/386.1 KiB.                                    
Copying gs://coronaviruspublic

In [0]:
def create_train_dev_test(prefix):
  train = pd.read_csv(prefix + "_train_data.csv")
  dev = pd.read_csv(prefix + "_dev_data.csv")
  test = pd.read_csv(prefix + "_test_data.csv")
  return train, test, dev

transfer_train, transfer_test, transfer_dev = create_train_dev_test("synth")
drug_train, drug_test, drug_dev = create_train_dev_test("drug")
drug_full = pd.concat([drug_train, drug_test])


In [0]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    return np.array(enc_di['input_ids'])

In [0]:
def build_model(transformer, max_len=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-6), loss='binary_crossentropy', metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision(), 'accuracy'])
    return model

In [0]:
max_len = int(int(drug_train.sentence.str.len().max()))
avg_len = int(drug_train.sentence.str.len().median())

# Configuration
EPOCHS = 4
MAX_LEN = 512

In [0]:
MODEL = "allenai/biomed_roberta_base" #'xlnet-base-cased'

# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(IntProgress(value=0, description='Downloading', max=495, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=898822, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=2, style=ProgressStyle(description_width='i…




HBox(children=(IntProgress(value=0, description='Downloading', max=150, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=185, style=ProgressStyle(description_width=…




In [0]:
%%time 
x_train = regular_encode(transfer_train.sentence.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(transfer_dev.sentence.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(transfer_test.sentence.values, tokenizer, maxlen=MAX_LEN)
y_train = transfer_train.label.values
y_valid = transfer_dev.label.values
y_test  = transfer_test.label.values

CPU times: user 978 ms, sys: 0 ns, total: 978 ms
Wall time: 977 ms


In [0]:
es = EarlyStopping(monitor='val_accuracy', 
                    min_delta=0.001, 
                    patience=3,
                    verbose=1, 
                    mode='max', 
                    restore_best_weights=True)

In [0]:
!pip install wandb
!wandb login
import wandb
from wandb.keras import WandbCallback
wandb.init(project="vt-relation-extract", sync_tensorboard=True)

[34m[1mwandb[0m: You can find your API key in your browser here: https://app.wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 4c616b51e6e88012c20dc6adcf90d05172185490
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


W&B Run: https://app.wandb.ai/igodfried/vt-relation-extract/runs/1kfdjv7w

In [0]:
from transformers import AutoModel
strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
model = AutoModel.from_pretrained("allenai/biomed_roberta_base")
!mkdir biomed_roberta_base
model.save_pretrained("biomed_roberta_base")
with strategy.scope():
  model = TFAutoModel.from_pretrained("biomed_roberta_base", from_pt=True)
  model = build_model(model)
BATCH_SIZE = 2 * strategy.num_replicas_in_sync


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
mkdir: cannot create directory ‘biomed_roberta_base’: File exists


In [0]:
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_roberta_model_4 (TFRobert ((None, 512, 768), (None, 124645632 
_________________________________________________________________
tf_op_layer_strided_slice_4  [(None, 768)]             0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 769       
Total params: 124,646,401
Trainable params: 124,646,401
Non-trainable params: 0
_________________________________________________________________


In [0]:
# This may look like a bug but in reality we only care about the performance on
# the annotated drug data and not what we are training on.
x_test = regular_encode(drug_full.sentence.values, tokenizer, maxlen=MAX_LEN)
y_test  = drug_full.label.values

train_history = model.fit(
                        x_train, y_train,
                        batch_size = BATCH_SIZE,
                        validation_data=(x_test, y_test),
                        callbacks=[es, WandbCallback()],
                        epochs=EPOCHS
                        )

Epoch 1/4

[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: 


Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 00004: early stopping


In [0]:
x_train = regular_encode(drug_train.sentence.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(drug_dev.sentence.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(drug_test.sentence.values, tokenizer, maxlen=MAX_LEN)
y_train = drug_train.label.values
y_valid = drug_dev.label.values
y_test  = drug_test.label.values

In [80]:
train_history = model.fit(
                        x_train, y_train,
                        batch_size=BATCH_SIZE,
                        validation_data=(x_test, y_test),
                        callbacks=[es, WandbCallback()],
                        epochs=8
                        )

Epoch 1/8

[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: 


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 00004: early stopping


### Saving/Exporting
A model isn't useful if it cannot be used in a production pipeline.

In [0]:
from google.colab import auth
from datetime import datetime
auth.authenticate_user()
!gsutil cp -r best_epoch_roberta gs://coronaviruspublicdata/temp_data/snapshots

Copying file://best_epoch_roberta/config.json [Content-Type=application/json]...
Copying file://best_epoch_roberta/saved_model.pb [Content-Type=application/octet-stream]...
Copying file://best_epoch_roberta/variables/variables.index [Content-Type=application/octet-stream]...
Copying file://best_epoch_roberta/variables/variables.data-00001-of-00002 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite obje

In [0]:
import pickle 
pickle.dump(model, open( "model.pickle", "wb" ) )
!gsutil cp model.pickle gs://coronaviruspublicdata/model.pickle

Copying file://model.pickle [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/475.6 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\
Operation completed over 1 objects/475.6 MiB.                                    


In [85]:
import pickle
def save_model(model, transformer_dir='transformer3'):
    """
    Special function to load a keras model that uses a transformer layer
    """
    transformer = model.layers[1]
    transformer.save_pretrained(transformer_dir)
    sigmoid = model.get_layer(index=3).get_weights()
    pickle.dump(sigmoid, open('sigmoid3.pickle', 'wb'))

def load_model(transformer_dir='transformer3', max_len=256):
    """
    Special function to load a keras model that uses a transformer layer
    """
    transformer = TFAutoModel.from_pretrained(transformer_dir)
    model = build_model(transformer, max_len=max_len)
    sigmoid = pickle.load(open('sigmoid3.pickle', 'rb'))
    model.get_layer('sigmoid').set_weights(sigmoid)

    return model
!mkdir transformer3
save_model(model)


mkdir: cannot create directory ‘transformer3’: File exists


In [0]:
model.summary()
model.get_layer(index=3)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_roberta_model_2 (TFRobert ((None, 512, 768), (None, 124645632 
_________________________________________________________________
tf_op_layer_strided_slice_2  [(None, 768)]             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 769       
Total params: 124,646,401
Trainable params: 124,646,401
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.layers.core.Dense at 0x7f8ea9462cf8>

In [86]:
!gsutil cp -r transformer3 gs://coronaviruspublicdata/re_final_best2/s
!gsutil cp sigmoid3.pickle gs://coronaviruspublicdata/re_final_best2/s

Copying file://transformer3/tf_model.h5 [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/475.7 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://transformer3/config.json [Content-Type=application/json]...
\ [2 files][475.7 MiB/475.7 MiB]                                                
Oper

### Qualitative Evaluation
We will now qualitatively look at a few examples.

In [0]:
test_examples = regular_encode(["As with Acacia and PAN, the LAC prospectus affirms that digital technologies and ICTbased solutions provide a powerful tool to change the ways in which health services are managed and delivered to the population at large, and to low-income and marginalized communities in particular"], tokenizer, maxlen=MAX_LEN)
model.predict(test_examples)

array([[0.16207048]], dtype=float32)

In [0]:
test_examples = regular_encode(["Glatiramer acetate (Copaxone) therapy induces an oligoclonal CD8+ T cell response with cytotoxic ability R"], tokenizer, maxlen=MAX_LEN)
model.predict(test_examples)

array([[0.81116337]], dtype=float32)

In [0]:
predictions = model.predict(x_valid)
drug_dev.sentence.values[0]
drug_dev.label.values[0]

In [0]:
print(predictions[1])
print("sentence is " + drug_dev.sentence.values[1])
print("real label is " + str(drug_dev.label.values[1]))

[0.58099353]
sentence is 17 Exposure to various therapeutic agents (antibiotics, acetaminophen, and bronchodilators) during the week before admission appeared to have a "protective" association against apnea in unadjusted analyses
real label is 1


In [0]:
print(predictions[2])
print("sentence is: " + drug_dev.sentence.values[2])
print("real label is " + str(drug_dev.label.values[2]))

[0.5998223]
sentence is Infants with apnea were significantly less likely to have been exposed to several therapeutic agents (antibiotics, acetaminophen, and bronchodilators) during the week before admission and were less likely to have attended day care (Table 1) 
real label is 1


### Tests for RAM usage
Basic check to determine how much RAM is available.

In [0]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7413 sha256=ff7cab81de177738d24f0afe86e1f346009de0524a88f0340b3eebfa2eaaec04
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 26.2 GB  | Proc size: 589.4 MB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total 16280MB


In [0]:
from google.colab import auth
from datetime import datetime
auth.authenticate_user()


Copying file://transformer/tf_model.h5 [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/475.7 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://transformer/config.json [Content-Type=application/json]...
\ [2 files][475.7 MiB/475.7 MiB]                                                
Operat

In [0]:
!gsutil cp -r transformer gs://coronaviruspublicdata/re_snapshot/4_13_2020
!gsutil cp sigmoid.pickle gs://coronaviruspublicdata/re_snapshot/4_13_2020

Copying file://transformer/tf_model.h5 [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/475.7 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://transformer/config.json [Content-Type=application/json]...
\ [2 files][475.7 MiB/475.7 MiB]                                                
Operat