In [1]:
# Identifier of this experiment; handed to DLConfig further down.
model_name = 'bert_dense_ft'

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys

# Make the parent directory importable (presumably where the project packages
# imported later in this notebook live).
# Guarded so that re-running this cell does not pile up duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

In [4]:
import os
import tensorflow 
import numpy as np
import random
import torch
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
import tensorflow.compat.v1.keras.backend as K

# Single seed shared by every random number generator below.
# Set to None for an unseeded (non-reproducible) run.
seed_value = 123123
#seed_value = None

# Third-from-last component of the interpreter path (printed as 'biotmpygpu'
# in the recorded run).
environment_name = sys.executable.split('/')[-3]
print('Environment:', environment_name)

# `is not None` (rather than truthiness) so that a seed of 0 is honoured too,
# and so that the literal string 'None' is never exported when unseeded.
if seed_value is not None:
    # NOTE(review): exports the seed under an environment variable named after
    # the Python environment itself; kept from the original cell for backward
    # compatibility -- confirm whether anything reads it.
    os.environ[environment_name] = str(seed_value)
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only affects processes spawned from here on.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Seed each library's global generator exactly once.
    np.random.seed(seed_value)
    random.seed(seed_value)
    tensorflow.random.set_seed(seed_value)

    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

os.environ['TF_DETERMINISTIC_OPS'] = '1'

# TF1-compat session that allocates GPU memory on demand instead of
# reserving it all up front; registered as the Keras backend session.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
K.set_session(session)

tensorflow.__version__

Environment: biotmpygpu


'2.2.0'

In [5]:
# Indices of the GPUs to train on; switch to the None variant for a single-GPU run.
multiple_gpus = list(range(4))
#multiple_gpus = None

In [6]:
import os
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

if multiple_gpus:
    # One device string per requested GPU index, e.g. [0, 1] -> ['/gpu:0', '/gpu:1'].
    devices = ['/gpu:' + str(gpu) for gpu in multiple_gpus]
    strategy = tf.distribute.MirroredStrategy(devices=devices)
    # NOTE(review): this empties CUDA_VISIBLE_DEVICES after the strategy has
    # been created; presumably meant to hide the GPUs from anything that
    # initialises CUDA later -- confirm this is intended.
    os.environ["CUDA_VISIBLE_DEVICES"] = ''

else:
    # Get the GPU device name; only the first GPU is accepted.
    device_name = tf.test.gpu_device_name()
    if device_name != '/device:GPU:0':
        raise SystemError('GPU device not found')
    print('Using GPU: {}'.format(device_name))

    # CUDA_VISIBLE_DEVICES expects device indices (or UUIDs), not TensorFlow
    # device names, so export the trailing index of '/device:GPU:0' ('0')
    # instead of the whole name.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = device_name.rsplit(':', 1)[-1]

Num GPUs Available:  4
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [7]:
# Project-local helpers: corpus readers, pandas adapters, preprocessing and model builders.
from wrappers.bioc_wrapper import bioc_to_docs, bioc_to_relevances
from wrappers.pandas_wrapper import relevances_to_pandas, docs_to_pandasdocs
from preprocessing.dl import DL_preprocessing
from mlearning.dl_models import Bert_Dense, Bert_LSTM, Bert_CLS, Bert_Sequence
from preprocessing.dl import Bert_preprocessing
from preprocessing.embeddings import compute_embedding_matrix
# Evaluation metrics used in the test-set section.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import matthews_corrcoef, cohen_kappa_score
from sklearn.metrics import roc_auc_score, auc, roc_curve, precision_recall_curve
from sklearn.metrics import confusion_matrix
# Keras training utilities.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from preprocessing.dl_config import DLConfig
from preprocessing.dl import average_precision
from preprocessing.dl import plot_roc_n_pr_curves, plot_training_history
from transformers import BertTokenizer
# NLTK resources are downloaded at import time; the recorded output shows
# they were already up to date on the machine that ran this notebook.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
import pandas as pd
import os
from transformers import AutoTokenizer

[nltk_data] Downloading package stopwords to /home/malves/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/malves/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/malves/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:

# Training and test corpora (XML files), given relative to the notebook's working directory.
train_dataset_path, test_dataset_path = (
    '../datasets/PMtask_Triage_TrainingSet.xml',
    '../datasets/PMtask_Triage_TestSet.xml',
)

## Load Data

In [9]:
dl_config = DLConfig(model_name=model_name, seed_value=seed_value)

# Text-cleaning switches stored on dl_config, applied in the order listed.
# Every one of them is switched off for this run.
# (Alternative stop-word setting: set(stopwords.words('english')).)
text_cleaning_options = (
    ('stop_words', None),
    ('lower', False),
    ('remove_punctuation', False),
    ('split_by_hyphen', False),
    ('lemmatization', False),
    ('stems', False),
)
for option_name, option_value in text_cleaning_options:
    setattr(dl_config, option_name, option_value)

# Parse the training corpus: documents first, then the 'protein-protein'
# relevance labels read from the same file.
docs_train = bioc_to_docs(train_dataset_path, dl_config=dl_config)
relevances_train = bioc_to_relevances(train_dataset_path, 'protein-protein')

# Wrap both in pandas objects; the labels are built against x_train_df.
x_train_df = docs_to_pandasdocs(docs_train)
y_train_df = relevances_to_pandas(x_train_df, relevances_train)

In [10]:
# Display the training documents frame (the recorded output shows one Document object per row).
x_train_df

Unnamed: 0,Document
9685346,<data_structures.document.Document object at 0...
10364224,<data_structures.document.Document object at 0...
10688642,<data_structures.document.Document object at 0...
12059041,<data_structures.document.Document object at 0...
12897151,<data_structures.document.Document object at 0...
...,...
22521144,<data_structures.document.Document object at 0...
25759389,<data_structures.document.Document object at 0...
19887646,<data_structures.document.Document object at 0...
23486661,<data_structures.document.Document object at 0...


In [11]:
# Display the training labels (recorded output: 4082 int64 values named 'Label').
y_train_df

9685346     0
10364224    0
10688642    0
12059041    0
12897151    0
           ..
22521144    1
25759389    1
19887646    1
23486661    1
22992732    1
Name: Label, Length: 4082, dtype: int64

In [12]:
# Title text of the Document looked up with key 0 in the 'Document' column.
x_train_df['Document'][0].title_string

'The molecular basis of Rieger syndrome . Analysis of Pitx2 homeodomain protein activities .'

In [13]:
# Abstract text of the same Document (key 0 in the 'Document' column).
x_train_df['Document'][0].abstract_string

'Rieger syndrome is an autosomal-dominant developmental disorder that includes glaucoma and mild craniofacial dysmorphism in humans . Mutations in the Pitx2 homeobox gene have been linked to Rieger syndrome . We have characterized wild type and mutant Pitx2 activities using electrophoretic mobility shift assays , protein binding , and transient transfection assays . Pitx2 preferentially binds the bicoid homeodomain binding site and transactivates reporter genes containing this site . The combination of Pitx2 and another homeodomain protein , Pit-1 , yielded a synergistic 55-fold activation of the prolactin promoter in transfection assays . Addition of Pit-1 increased Pitx2 binding to the bicoid element in electrophoretic mobility shift assays . Furthermore , we demonstrate specific binding of Pit-1 to Pitx2 in vitro . Thus , wild type Pitx2 DNA binding activity is modulated by protein-protein interactions . We next studied two Rieger mutants . A threonine to proline mutation ( T68P ) i

### Parameters

In [14]:
# Hyper-parameters, written onto dl_config in the order listed.
training_settings = (
    ('padding', 'post'),
    ('truncating', 'post'),
    ('epochs', 3),                 # recommended number of epochs: 2, 3, 4
    ('batch_size', 16),            # recommended batch-size: 16 or 32
    ('learning_rate', 2e-5),       # recommended learning rate for Adam: 5e-5, 3e-5, 2e-5   # 3e-4, 1e-4,
    ('max_sent_len', 512),         # sentences will have a maximum of "max_sent_len" words
    ('nmr_sentences', 1),          # [1 or 2]
    ('validation_percentage', 10),
)
for setting_name, setting_value in training_settings:
    setattr(dl_config, setting_name, setting_value)

### Keras Callbacks

In [15]:
# Master switch: when False, training runs without any Keras callbacks.
dl_config.keras_callbacks = False

keras_callbacks = None
if dl_config.keras_callbacks:
    dl_config.patience = 2   #early-stopping patience
    checkpoint_path = str(dl_config.model_id_path) + '/checkpoint.hdf5'
    # Stop once val_loss has not improved for `patience` epochs ...
    stop_on_plateau = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                                    patience=dl_config.patience)
    # ... and keep only the weights with the lowest val_loss on disk.
    keep_best_weights = ModelCheckpoint(checkpoint_path, monitor='val_loss', mode='min',
                                        verbose=1, save_best_only=True)
    keras_callbacks = [stop_on_plateau, keep_best_weights]

## Preprocessing

## BERT

In [16]:
# Load the tokenizer for the uncased base checkpoint (lower-casing enabled)
# and keep it on dl_config.
tokenizer_checkpoint = 'bert-base-uncased'
dl_config.tokenizer = BertTokenizer.from_pretrained(
    tokenizer_checkpoint,
    do_lower_case=True,
)


In [17]:
# Vocabulary id of the "[SEP]" token (102 in the recorded output).
dl_config.tokenizer.convert_tokens_to_ids("[SEP]")

102

In [18]:
# Encode the training frame and split it into training and validation parts
# (3674 / 408 samples in the recorded run).
split_options = dict(
    nmr_sentences=dl_config.nmr_sentences,
    validation_percentage=dl_config.validation_percentage,
    seed_value=dl_config.seed_value,
)
x_train, y_train, x_val, y_val = Bert_preprocessing(x_train_df, y_train_df, dl_config,
                                                    **split_options)

Training set with 3674 samples
Validation set with 408 samples


In [19]:
# Length of x_train[0][0]; the recorded output is 512, the same value as dl_config.max_sent_len.
len(x_train[0][0])

512

In [20]:
# Map the ids in x_train[0][0] back to token strings to inspect the encoding
# (presumably the input-id row of the first training sample -- confirm against Bert_preprocessing).
dl_config.tokenizer.convert_ids_to_tokens(x_train[0][0])

['[CLS]',
 'crystal',
 'structure',
 'analysis',
 'of',
 'the',
 'phd',
 'domain',
 'of',
 'the',
 'transcription',
 'co',
 '-',
 'act',
 '##iva',
 '##tor',
 'p',
 '##y',
 '##go',
 '##pus',
 '.',
 'the',
 'w',
 '##nt',
 '/',
 'beta',
 '-',
 'cat',
 '##eni',
 '##n',
 'signaling',
 'pathway',
 'plays',
 'important',
 'roles',
 'in',
 'animal',
 'development',
 'and',
 'cancer',
 '.',
 'p',
 '##y',
 '##go',
 '##pus',
 '(',
 'p',
 '##y',
 '##go',
 ')',
 'and',
 'leg',
 '##less',
 '(',
 'l',
 '##gs',
 ')',
 'are',
 'recently',
 'discovered',
 'core',
 'components',
 'of',
 'the',
 'w',
 '##nt',
 '/',
 'beta',
 '-',
 'cat',
 '##eni',
 '##n',
 'transcription',
 'machinery',
 'complex',
 ',',
 'and',
 'are',
 'crucial',
 '##ly',
 'involved',
 'in',
 'the',
 'regulation',
 'of',
 'the',
 'transcription',
 'of',
 'the',
 'arm',
 '/',
 'beta',
 '-',
 'cat',
 '##eni',
 '##n',
 'and',
 't',
 'cell',
 'factors',
 '(',
 'tc',
 '##f',
 ')',
 '.',
 'l',
 '##gs',
 '/',
 'bc',
 '##l',
 '##9',
 'functions

In [21]:
# Same inspection for the validation split: ids in x_val[0][0] mapped back to token strings.
dl_config.tokenizer.convert_ids_to_tokens(x_val[0][0])

['[CLS]',
 'the',
 'structure',
 'of',
 'mouse',
 'hp',
 '##1',
 'suggests',
 'a',
 'unique',
 'mode',
 'of',
 'single',
 'peptide',
 'recognition',
 'by',
 'the',
 'shadow',
 'ch',
 '##rom',
 '##o',
 'domain',
 'dime',
 '##r',
 '.',
 'the',
 'het',
 '##ero',
 '##ch',
 '##rom',
 '##atin',
 'protein',
 '1',
 '(',
 'hp',
 '##1',
 ')',
 'family',
 'of',
 'proteins',
 'is',
 'involved',
 'in',
 'gene',
 'si',
 '##len',
 '##cing',
 'via',
 'the',
 'formation',
 'of',
 'het',
 '##ero',
 '##ch',
 '##romatic',
 'structures',
 '.',
 'they',
 'are',
 'composed',
 'of',
 'two',
 'related',
 'domains',
 ':',
 'an',
 'n',
 '-',
 'terminal',
 'ch',
 '##rom',
 '##o',
 'domain',
 'and',
 'a',
 'c',
 '-',
 'terminal',
 'shadow',
 'ch',
 '##rom',
 '##o',
 'domain',
 '.',
 'present',
 'results',
 'suggest',
 'that',
 'ch',
 '##rom',
 '##o',
 'domains',
 'may',
 'function',
 'as',
 'protein',
 'interaction',
 'motifs',
 ',',
 'bringing',
 'together',
 'different',
 'proteins',
 'in',
 'multi',
 '-',
 'pro

## Deep Learning

In [22]:
from mlearning.dl_models import Bert_Dense_opt

bert_name = "bert-base-uncased"


def build_model():
    """Build the Bert_Dense model from dl_config and the `bert_name` checkpoint.

    Alternative builders kept from the original cell (same keyword arguments,
    without `static_bert`):
        Bert_FE(dl_config, learning_rate=dl_config.learning_rate, bert_name_or_path=bert_name)
        Bert_Sequence_FT(dl_config, learning_rate=dl_config.learning_rate, bert_name_or_path=bert_name)
        Bert_Sequence_FE(dl_config, learning_rate=dl_config.learning_rate, bert_name_or_path=bert_name)
    """
    # static_bert=False: presumably leaves the BERT weights trainable -- confirm in Bert_Dense.
    return Bert_Dense(dl_config,
                      learning_rate=dl_config.learning_rate,
                      static_bert=False,
                      bert_name_or_path=bert_name)


if multiple_gpus:
    # Build inside the distribution strategy's scope for the multi-GPU run.
    with strategy.scope():
        model = build_model()
else:
    model = build_model()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_idx (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_segments (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 512, 768), ( 109482240   input_idx[0][0]                  
                                                                 input_masks[0][0]            

In [23]:
# Train with the hyper-parameters stored on dl_config, validating on the
# held-out split after every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=dl_config.batch_size,
    epochs=dl_config.epochs,
    callbacks=keras_callbacks,
    validation_data=(x_val, y_val),
)

# With callbacks enabled, reload the weights written to the checkpoint file.
if dl_config.keras_callbacks:
    model.load_weights(checkpoint_path)

Epoch 1/3
INFO:tensorflow:batch_all_reduce: 198 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/tas

## Evaluation

In [None]:
#train_loss, dl_config.train_acc = model.evaluate(x_train, y_train, verbose=0)

In [None]:
#print('Training Loss: %.3f' % (train_loss))
#print('Training Accuracy: %.3f' % (dl_config.train_acc))


# Plot the training history returned by model.fit.
plot_training_history(dl_config=dl_config, history_dict=history)

# Test Set

### Load Data

In [None]:
# Documents: parse the test corpus and wrap it in a pandas frame.
docs_test = bioc_to_docs(test_dataset_path, dl_config=dl_config)
x_test_df = docs_to_pandasdocs(docs_test)

# Labels: 'protein-protein' relevances from the same file, built against x_test_df.
relevances_test = bioc_to_relevances(test_dataset_path, 'protein-protein')
y_test_df = relevances_to_pandas(x_test_df, relevances_test)

## Preprocessing

In [None]:
# Encode the test frame. No validation_percentage is passed here, and the
# result is unpacked into two values rather than the four of the training call.
x_test, y_test = Bert_preprocessing(
    x_test_df,
    y_test_df,
    dl_config,
    nmr_sentences=dl_config.nmr_sentences,
)

### Predictions

In [None]:
# Column 0 of the prediction matrix is used as the positive-class score.
# NOTE(review): assumes model.predict returns a 2-D array with one score
# column per sample -- confirm against the model's output layer.
yhat_probs = model.predict(x_test, verbose=0)[:, 0]

# Scores strictly above the threshold become class 1, everything else class 0.
# A single comparison replaces the previous two-pass np.where, which relied on
# integer truncation for scores equal to 0.5 and cast NaN scores to int64.
decision_threshold = 0.5
yhat_classes = (yhat_probs > decision_threshold).astype(np.int64)

## Evaluation - Test Set

### ROC and Precision-Recall curves

In [None]:
# Plot both curves and keep the two values the helper returns on dl_config
# (first -> test_roc_auc, second -> test_pr_auc).
roc_and_pr_auc = plot_roc_n_pr_curves(y_test, yhat_probs, dl_config=dl_config)
dl_config.test_roc_auc, dl_config.test_pr_auc = roc_and_pr_auc

In [None]:
# NOTE(review): average precision is scored against y_test_df, whereas every
# other metric in this cell uses y_test -- confirm this is intended.
dl_config.test_avg_prec = average_precision(y_test_df, yhat_probs)
print('Average Precision: %f' % dl_config.test_avg_prec)

# (report template, dl_config attribute, scorer) for the class-based metrics:
#   accuracy  = (tp + tn) / (p + n)
#   precision = tp / (tp + fp)
#   recall    = tp / (tp + fn)
#   f1        = 2 tp / (2 tp + fp + fn)
class_metrics = (
    ('Accuracy: %f', 'test_acc', accuracy_score),
    ('Precision: %f', 'test_prec', precision_score),
    ('Recall: %f', 'test_recall', recall_score),
    ('F1 score: %f', 'test_f1_score', f1_score),
)
for report_line, config_field, scorer in class_metrics:
    setattr(dl_config, config_field, scorer(y_test, yhat_classes))
    print(report_line % getattr(dl_config, config_field))

# ROC AUC and PR AUC are read back from dl_config, where an earlier cell stored them.
print('ROC AUC: %f' % dl_config.test_roc_auc)
print('PR AUC: %f' % dl_config.test_pr_auc)

# Cohen's kappa and Matthews correlation coefficient.
agreement_metrics = (
    ('Cohens kappa: %f', 'test_kappa', cohen_kappa_score),
    ('MCC: %f', 'test_mcc', matthews_corrcoef),
)
for report_line, config_field, scorer in agreement_metrics:
    setattr(dl_config, config_field, scorer(y_test, yhat_classes))
    print(report_line % getattr(dl_config, config_field))

# Confusion matrix; its flattened cells are stored as tn, fp, fn, tp.
matrix = confusion_matrix(y_test, yhat_classes)
print('Confusion Matrix:\n %s \n' % matrix)

(dl_config.test_true_neg,
 dl_config.test_false_pos,
 dl_config.test_false_neg,
 dl_config.test_true_pos) = matrix.ravel()

## Model ID

In [None]:
# Show the identifier dl_config holds for this model run.
dl_config.model_id

### Save DL_Config

In [None]:
# Presumably persists the configuration object, including the metrics stored
# on it above -- see DLConfig.save for what is written and where.
dl_config.save()

In [None]:
# Show the `path` attribute of dl_config.
dl_config.path

### Write Results

In [None]:
# Presumably writes the results stored on dl_config to a report -- see DLConfig.write_report.
dl_config.write_report()

## Model Save

In [None]:
#model.save(dl_config.model_id_path / 'model_tf', save_format = 'tf')

In [None]:
from IPython.lib.display import Audio
import numpy as np

# Two-tone chime (300 Hz + 240 Hz) that autoplays when this cell runs;
# presumably an audible "notebook finished" signal.
framerate = 4410
play_time_seconds = 1

# int(): np.linspace needs an integer sample count even for a fractional duration.
n_samples = int(framerate * play_time_seconds)
# endpoint=False keeps the spacing between samples at exactly 1/framerate;
# with the endpoint included the grid is stretched by n/(n-1) and the tones
# come out slightly off their nominal frequency.
t = np.linspace(0, play_time_seconds, n_samples, endpoint=False)
audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)
Audio(audio_data, rate=framerate, autoplay=True)