In [1]:
# Identifier of the architecture trained in this notebook; handed to Config further down.
model_name = 'HAN_opt'

In [2]:
# Load IPython's autoreload extension in mode 2 so edited modules are re-imported
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys 
# Add the parent directory to the import path.
# NOTE(review): presumably this is where the wrappers/preprocessing/mlearning packages live — confirm.
sys.path.append('../')

In [4]:
import os
import tensorflow 
import numpy as np
import random

# Single seed shared by every RNG seeded below (and passed to Config later on).
seed_value = 11111
#seed_value = None

# Report the environment the kernel runs in: third-from-last component of the
# interpreter path.
environment_name = sys.executable.split('/')[-3]
print('Environment:', environment_name)

# Fix: the seed used to be stored in an environment variable named after the
# environment itself, which nothing reads. The variable Python consults is
# PYTHONHASHSEED. The running interpreter chose its hash seed at start-up, so this
# only reaches Python processes launched from this kernel.
if seed_value is not None:
    os.environ['PYTHONHASHSEED'] = str(seed_value)

# Seed NumPy, the stdlib RNG and TensorFlow with the same value.
np.random.seed(seed_value)
random.seed(seed_value)
tensorflow.random.set_seed(seed_value)

# Create a TF1-compat interactive session with the allow_growth GPU option enabled
# and register it with the compat Keras backend.
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
import tensorflow.compat.v1.keras.backend as K
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
K.set_session(session)

tensorflow.__version__

Environment: biotmpygpu


'2.2.0'

In [5]:
# GPU indices to mirror the model across. The commented alternative (None) makes the
# `if multiple_gpus:` checks below take their single-GPU branch instead.
multiple_gpus = [0,1,2,3]
#multiple_gpus = None

In [6]:
import os
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

if multiple_gpus:
    # Multi-GPU mode: one '/gpu:<index>' device string per requested index,
    # all handed to a MirroredStrategy.
    devices = ['/gpu:' + str(gpu) for gpu in multiple_gpus]
    strategy = tensorflow.distribute.MirroredStrategy(devices=devices)

else:
    # Single-GPU mode: require TensorFlow's default GPU device to be GPU 0.
    device_name = tensorflow.test.gpu_device_name()
    if device_name == '/device:GPU:0':
        print('Using GPU: {}'.format(device_name))
    else:
        raise SystemError('GPU device not found')

    # Fix: CUDA_VISIBLE_DEVICES takes device indices (e.g. "0"), not a TensorFlow
    # device name such as '/device:GPU:0', so export only the trailing index.
    # NOTE(review): TensorFlow has already enumerated the GPUs above, so these
    # variables likely only affect processes started afterwards — confirm.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = device_name.rsplit(':', 1)[-1]

Num GPUs Available:  4
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [7]:
from wrappers.bioc_wrapper import bioc_to_docs, bioc_to_relevances
from wrappers.pandas_wrapper import relevances_to_pandas, docs_to_pandasdocs
from preprocessing.dl import DL_preprocessing
from mlearning.dl_models import Hierarchical_Attention_GRU, Hierarchical_Attention_LSTM,Hierarchical_Attention_LSTM2, Hierarchical_Attention_LSTM3
from mlearning.dl_models import Hierarchical_Attention_Context, HAN_opt
from mlearning.dl_models import DeepDTA
from preprocessing.embeddings import compute_embedding_matrix, glove_embeddings_2
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score, auc, roc_curve, precision_recall_curve
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from preprocessing.dl import plot_training_history
from preprocessing.config import Config
from preprocessing.dl import average_precision
from tensorflow.keras.preprocessing import text
from preprocessing.dl import plot_roc_n_pr_curves
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
import seaborn as sns
import pandas as pd
import os
from keras import backend as K
import pickle

[nltk_data] Downloading package stopwords to /home/malves/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/malves/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/malves/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Locations of the training and test XML files, relative to this notebook.
train_dataset_path = '../datasets/PMtask_Triage_TrainingSet.xml'
test_dataset_path = '../datasets/PMtask_Triage_TestSet.xml'

## Load Data

In [9]:
# Experiment configuration. Note that this rebinds `config`, which previously held
# the TensorFlow ConfigProto used to create the session.
config = Config(model_name=model_name, seed_value=seed_value)

# Text-normalisation switches, applied in the order listed.
text_options = {
    'stop_words': set(stopwords.words('english')),   # alternative: None
    'lower': True,
    'remove_punctuation': False,
    'split_by_hyphen': True,
    'lemmatization': False,
    'stems': False,
}
for option_name, option_value in text_options.items():
    setattr(config, option_name, option_value)

# Parse the training file into documents and their 'protein-protein' relevance labels.
docs_train = bioc_to_docs(train_dataset_path, config=config)
relevances_train = bioc_to_relevances(train_dataset_path, 'protein-protein')

# Wrap documents and labels as pandas objects.
x_train_df = docs_to_pandasdocs(docs_train)
y_train_df = relevances_to_pandas(x_train_df, relevances_train)

In [10]:
# Display the training documents frame (one Document object per row).
x_train_df

Unnamed: 0,Document
9685346,<data_structures.document.Document object at 0...
10364224,<data_structures.document.Document object at 0...
10688642,<data_structures.document.Document object at 0...
12059041,<data_structures.document.Document object at 0...
12897151,<data_structures.document.Document object at 0...
...,...
22521144,<data_structures.document.Document object at 0...
25759389,<data_structures.document.Document object at 0...
19887646,<data_structures.document.Document object at 0...
23486661,<data_structures.document.Document object at 0...


In [11]:
# Display the training labels built by relevances_to_pandas.
y_train_df

9685346     0
10364224    0
10688642    0
12059041    0
12897151    0
           ..
22521144    1
25759389    1
19887646    1
23486661    1
22992732    1
Name: Label, Length: 4082, dtype: int64

In [12]:
# Title text of the first training document. `.iloc[0]` makes the positional lookup
# explicit instead of relying on the Series treating the integer key 0 as a position.
x_train_df['Document'].iloc[0].title_string

'molecular basis rieger syndrome analysis pitx2 homeodomain protein activities rieger syndrome autosomal - dominant developmental disorder includes glaucoma mild craniofacial dysmorphism humans'

In [13]:
# Abstract text of the first training document. `.iloc[0]` makes the positional lookup
# explicit instead of relying on the Series treating the integer key 0 as a position.
x_train_df['Document'].iloc[0].abstract_string

'mutations pitx2 homeobox gene linked rieger syndrome characterized wild type mutant pitx2 activities using electrophoretic mobility shift assays protein binding transient transfection assays pitx2 preferentially binds bicoid homeodomain binding site transactivates reporter genes containing site combination pitx2 another homeodomain protein pit - 1 yielded synergistic 55 - fold activation prolactin promoter transfection assays addition pit - 1 increased pitx2 binding bicoid element electrophoretic mobility shift assays furthermore demonstrate specific binding pit - 1 pitx2 vitro thus wild type pitx2 dna binding activity modulated protein - protein interactions next studied two rieger mutants threonine proline mutation t68p second helix homeodomain retained dna binding activity apparent kd 2 - fold reduction bmax however mutant transactivate reporter genes containing bicoid site mutant pitx2 protein binds pit - 1 detectable synergism prolactin promoter second mutation l54q highly conser

### Embeddings and Deep Learning

In [14]:
#Parameters
config.padding = 'post'            #'pre' -> default; 'post' -> alternative
config.truncating = 'post'         #'pre' -> default; 'post' -> alternative      #####
config.oov_token = 'OOV'

config.epochs = 50
config.batch_size = 32     # e aumentar o batch
config.learning_rate = 0.001   #experimentar diminuir

config.max_sent_len = 50      #sentences will have a maximum of "max_sent_len" words    #400/500
config.max_nb_words = 100_000      #it will only be considered the top "max_nb_words" words in the dataset
config.max_nb_sentences = 15    # set only for the hierarchical attention model!!!

config.embeddings = 'biowordvec'

config.validation_percentage = 10

if not os.path.isdir('./embeddings'):
    !mkdir embeddings

if config.embeddings == 'glove':
    if not os.path.isfile('./embeddings/glove.6B.zip'):
        # !wget -P ./embeddings http://nlp.stanford.edu/data/glove.6B.zip
        !unzip  ./embeddings/glove.6B.zip  -d ./embeddings
    config.embedding_path = '/embeddings/glove/glove.6B.200d.txt'
    config.embedding_dim = 200
    config.embedding_format = 'glove'

elif config.embeddings == 'biowordvec':   #200 dimensions
    if not os.path.isfile('./embeddings/biowordvec'):
        !wget -O ./embeddings/biowordvec https://ndownloader.figshare.com/files/12551780
    config.embedding_path = './embeddings/biowordvec'
    config.embedding_dim = 200
    config.embedding_format = 'word2vec'

elif config.embeddings == 'pubmed_pmc':   #200 dimensions
    if not os.path.isfile('./embeddings/pubmed_pmc.bin'):
        !wget -O ./embeddings/pubmed_pmc.bin http://evexdb.org/pmresources/vec-space-models/PubMed-and-PMC-w2v.bin
    config.embedding_path = '/embeddings/pubmed_pmc.bin'
    config.embedding_dim = 200
    config.embedding_format = 'word2vec'

elif config.embeddings == 'pubmed_ncbi':   #100 dimensions
    if not os.path.isfile('./embeddings/pubmed_ncbi.bin.gz'):
        !wget -O ./embeddings/pubmed_ncbi.bin.gz ftp://ftp.ncbi.nlm.nih.gov/pub/wilbur/EMBED/pubmed_s100w10_min.bin.gz
    config.embedding_path = '/embeddings/pubmed_ncbi.bin.gz'
    config.embedding_dim = 100
    config.embedding_format = 'word2vec'    

else: 
    raise Exception("Please Insert Embeddings Type")

### Keras Callbacks

In [15]:
# Toggle for early stopping + best-model checkpointing during training.
config.keras_callbacks = True

# Stays None when callbacks are disabled.
keras_callbacks = None
if config.keras_callbacks:
    config.patience = 5   # early-stopping patience
    checkpoint_path = str(config.model_id_path) + '/checkpoint.hdf5'

    # Both callbacks monitor the validation loss (lower is better).
    early_stopping = EarlyStopping(
        monitor='val_loss',
        mode='min',
        verbose=1,
        patience=config.patience,
    )
    best_checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        mode='min',
        verbose=1,
        save_best_only=True,
    )
    keras_callbacks = [early_stopping, best_checkpoint]

In [16]:
# Preprocessing for Training Data

# Keras tokenizer limited to the configured vocabulary size, with an explicit
# out-of-vocabulary token.
config.tokenizer = text.Tokenizer(num_words=config.max_nb_words, oov_token=config.oov_token)

# Sanity check on the first training document: print its full text, then its tokens
# one per line. `.iloc[0]` makes the positional lookup explicit instead of relying
# on the Series treating the integer key 0 as a position.
first_document = x_train_df['Document'].iloc[0]
print(first_document.fulltext_string)
for i, tok in enumerate(first_document.fulltext_tokens):
    print(i, ': ', tok)

molecular basis rieger syndrome analysis pitx2 homeodomain protein activities rieger syndrome autosomal - dominant developmental disorder includes glaucoma mild craniofacial dysmorphism humans mutations pitx2 homeobox gene linked rieger syndrome characterized wild type mutant pitx2 activities using electrophoretic mobility shift assays protein binding transient transfection assays pitx2 preferentially binds bicoid homeodomain binding site transactivates reporter genes containing site combination pitx2 another homeodomain protein pit - 1 yielded synergistic 55 - fold activation prolactin promoter transfection assays addition pit - 1 increased pitx2 binding bicoid element electrophoretic mobility shift assays furthermore demonstrate specific binding pit - 1 pitx2 vitro thus wild type pitx2 dna binding activity modulated protein - protein interactions next studied two rieger mutants threonine proline mutation t68p second helix homeodomain retained dna binding activity apparent kd 2 - fold

In [17]:
# Turn the training frames into model inputs, holding out `validation_percentage`
# percent of the samples for validation.
x_train, y_train, x_val, y_val = DL_preprocessing(
    x_train_df,
    y_train_df,
    config,
    dataset='train',
    validation_percentage=config.validation_percentage,
    seed_value=config.seed_value,
)

Found 30020 unique tokens.
Index of Unknown Words: 1
Training set with 3674 samples
Validation set with 408 samples


In [18]:
# Encoded first sentence of the first training document.
x_train[0][0]

array([   6,  786,    1,  115,  786,    1,   17,  786,    1,  327,   72,
       3282,    8,  225,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [19]:
# Dimensions of the encoded training set.
print('Number of Documents:', len(x_train))
print('Number of Sentences: ', len(x_train[0]))
print('Max number of words in a Sentence:', len(x_train[0][0]))

# Share of zero entries, counted with NumPy instead of a Python loop over every
# single element.
x_train_array = np.asarray(x_train)
total_words = x_train_array.size
count_zeros = total_words - np.count_nonzero(x_train_array)
print('Percentage of Zeros: {:.2%}'.format(count_zeros/total_words))


# Show every encoded sentence of the first document.
for sentence in x_train[0]:
    print(sentence)

Number of Documents: 3674
Number of Sentences:  15
Max number of words in a Sentence: 50
Percentage of Zeros: 79.14%
[   6  786    1  115  786    1   17  786    1  327   72 3282    8  225
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
[ 616  364  114 6607 1603  746   40  189   83  370    8  225  628    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
[ 786    1   17  786    1  115   23 7584    7  155  786    1  327 1737
    7  225    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
[ 786    1  327  212  833  386    1 1798  347  386    1   14   62    8
  225  661   56   96  489   23  786    1  115  

In [20]:
# Build the embedding matrix from the configured embedding file/format and keep it on config.
config.embedding_matrix = compute_embedding_matrix(config, embeddings_format = config.embedding_format)

Creating Embedding Matrix...
Embedding Matrix Created 
------------------------
number of null word embeddings: 3025 in a total of 30020 words (10.08%)
words not found: 0


In [21]:
# Shape of the embedding matrix.
config.embedding_matrix.shape

(30021, 200)

In [22]:
# Spot check: look up the tokenizer index of 'protein' and the length of its embedding row.
protein_index = config.tokenizer.word_index['protein']
print("Dimension of the word embedding for the word 'protein': ", len(config.embedding_matrix[protein_index]))
print("Index for the word 'protein': ", protein_index)

Dimension of the word embedding for the word 'protein':  200
Index for the word 'protein':  2


In [23]:
# Keyword arguments shared by both construction paths.
han_kwargs = dict(learning_rate=config.learning_rate, seed_value=config.seed_value)

if not multiple_gpus:
    # Single-device path: build the model directly.
    model = HAN_opt(config.embedding_matrix, config, **han_kwargs)
else:
    # Multi-GPU path: build the model inside the distribution strategy's scope.
    with strategy.scope():
        model = HAN_opt(config.embedding_matrix, config, **han_kwargs)
    

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 15, 50)]          0         
_________________________________________________________________
time_distributed (TimeDistri (None, 15, 256)           6407144   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 15, 256)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 15, 512)           1050624   
_________________________________________________________________
attention_with_context_1 (At (None, 512)               263168    
_________________________________________________________________
dense (Dense)                (None, 1)                 513       
Total params: 7,721,449
Trainable params: 1,717,249
Non-trainable params: 6,004,200
_________________________________________

In [None]:
# Train with the configured epochs/batch size, validating on the held-out split.
fit_options = dict(
    epochs=config.epochs,
    batch_size=config.batch_size,
    validation_data=(x_val, y_val),
    callbacks=keras_callbacks,
)
history = model.fit(x_train, y_train, **fit_options)

# With callbacks enabled, reload the weights stored at the checkpoint path
# (the ModelCheckpoint callback is configured with save_best_only=True).
if config.keras_callbacks:
    model.load_weights(checkpoint_path)

Epoch 1/50
INFO:tensorflow:batch_all_reduce: 20 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 20 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/t

## Evaluation - Training Set

In [None]:
# Loss and accuracy on the training split; the accuracy is kept on config.
train_metrics = model.evaluate(x_train, y_train, batch_size=config.batch_size, verbose=0)
train_loss, config.train_acc = train_metrics

print('Training Loss: %.3f' % train_loss)
print('Training Accuracy: %.3f' % config.train_acc)

# Plot the curves recorded during fit().
plot_training_history(history_dict=history, config=config)

# Test Set

### Load Data

In [None]:
# Parse the test file into documents and their 'protein-protein' relevance labels.
docs_test = bioc_to_docs(test_dataset_path, config=config)
relevances_test = bioc_to_relevances(test_dataset_path, 'protein-protein')

x_test_df = docs_to_pandasdocs(docs_test)
y_test_df = relevances_to_pandas(x_test_df, relevances_test)

x_test, y_test = DL_preprocessing(x_test_df, y_test_df, config, dataset = 'test')

# Out-of-vocabulary rate, counted with NumPy instead of a Python loop over every element.
# Index 0 entries are excluded from the total; index 1 is the unknown-word index
# (training preprocessing printed "Index of Unknown Words: 1").
x_test_array = np.asarray(x_test)
total_words = np.count_nonzero(x_test_array)
nmr_unknown_words = np.count_nonzero(x_test_array == 1)

# Guard against a test set with no non-zero tokens, which used to raise ZeroDivisionError.
unknown_ratio = nmr_unknown_words / total_words if total_words else 0.0

print('Percentage of Unknown Words on the Test Set: {:.2%} ({} in {})'.format(unknown_ratio, nmr_unknown_words,total_words))

### Predictions

In [None]:
# Predicted probabilities for the test set; keep the single output column as a 1-D vector.
yhat_probs = model.predict(x_test, verbose=0)
yhat_probs = yhat_probs[:, 0]

# Hard labels: 1 when the probability is strictly above 0.5, else 0.
# This replaces two chained np.where passes with one comparison. It gives the same
# labels (exactly 0.5 still maps to 0) and never casts a leftover float or NaN to int64.
yhat_classes = (yhat_probs > 0.5).astype(np.int64)

## Evaluation - Test Set

### ROC and Precision-Recall curves

In [None]:
# Plot the ROC and precision-recall curves; the helper returns the two areas under them.
curve_areas = plot_roc_n_pr_curves(y_test, yhat_probs, config=config)
config.test_roc_auc, config.test_pr_auc = curve_areas

In [None]:
# Test-set metrics. Each value is stored on config and printed.

# NOTE(review): this call receives y_test_df while every other metric uses y_test —
# confirm that is what average_precision expects.
config.test_avg_prec  = average_precision(y_test_df, yhat_probs)
print('Average Precision: %f' % config.test_avg_prec)

# accuracy: (tp + tn) / (p + n)
config.test_acc = accuracy_score(y_test, yhat_classes)
print('Accuracy: %f' % config.test_acc)

# precision tp / (tp + fp)
config.test_prec = precision_score(y_test, yhat_classes)
print('Precision: %f' % config.test_prec)

# recall: tp / (tp + fn)
config.test_recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % config.test_recall)

# f1: 2 tp / (2 tp + fp + fn)
config.test_f1_score = f1_score(y_test, yhat_classes)
print('F1 score: %f' % config.test_f1_score)

# ROC AUC (computed earlier by plot_roc_n_pr_curves)
print('ROC AUC: %f' % config.test_roc_auc)

# PR AUC (computed earlier by plot_roc_n_pr_curves)
print('PR AUC: %f' % config.test_pr_auc)

# kappa
config.test_kappa = cohen_kappa_score(y_test, yhat_classes)
print('Cohens kappa: %f' % config.test_kappa)

config.test_mcc = matthews_corrcoef(y_test, yhat_classes)
print('MCC: %f' % config.test_mcc)

# Confusion matrix, computed once and reused for the four counts.
# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
# y_test/yhat_classes; without it the 4-way unpack below raises in that case.
matrix = confusion_matrix(y_test, yhat_classes, labels=[0, 1])
print('Confusion Matrix:\n %s \n' % matrix)

# Row-major order of the 2x2 matrix: tn, fp, fn, tp.
config.test_true_neg, config.test_false_pos, config.test_false_neg, config.test_true_pos = matrix.ravel()

### Model ID

In [None]:
# Display the identifier held by config for this run.
config.model_id

### Save config

In [None]:
# Persist the configuration via Config.save().
# NOTE(review): presumably this writes to config.path — confirm in Config.
config.save()

In [None]:
# Display config.path.
config.path

### Write Results

In [None]:
# Write the results report via Config.write_report().
# NOTE(review): presumably this uses the metrics stored on config above — confirm in Config.
config.write_report()

### Model Save

In [None]:
# Save the trained model in TensorFlow SavedModel format under the run's directory.
# The path is converted to str, as the checkpoint cell already does, because the
# TensorFlow version this notebook reports (2.2.0) may not accept path-like objects here.
model.save(str(config.model_id_path / 'model_tf'), save_format='tf')

In [None]:
# Export the model's YAML description next to the other run artifacts.
yaml_path = config.model_id_path / "model.yaml"
model_yaml = model.to_yaml()
with open(yaml_path, "w") as yaml_file:
    yaml_file.write(model_yaml)

In [None]:
# Display the seed used for this run.
seed_value