In [1]:
import os

os.chdir('/scratch1/aalamel')

os.getcwd()

'/scratch1/aalamel'

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
from transformers import (
		AutoModel, 
		BertTokenizerFast, 
		BertTokenizer, 
		BertForSequenceClassification, 
		AdamW, 
		BertConfig,
		get_linear_schedule_with_warmup,
    )

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
import evaluate
from sklearn.utils import shuffle

import time
import datetime
import math
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import random
from torch.utils.data import TensorDataset, random_split

In [3]:
data = pd.read_csv('pubmed.csv')

In [4]:
data.shape

(6726, 2)

In [5]:
data['label'].value_counts()

0    4458
1    2268
Name: label, dtype: int64

In [6]:
#activate this cell in case you will work with GPU

In [7]:
#import tensorflow as tf

# Get the GPU device name.
#device_name = tf.test.gpu_device_name()

# The device name should look like the following:
#if device_name == '/device:GPU:0':
    #print('Found GPU at: {}'.format(device_name))
#else:
    #raise SystemError('GPU device not found')

In [8]:
#works with any BERT family models except DistilBERT, need a small modifications

tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

In [9]:
# Function to calculate the accuracy of our predictions vs labels

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [10]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [11]:
# List of proportions to use in each iteration
#dataset ==> 80% train (500) + 20% valid
#{625 / 1250 / 1875 / 2500 / 3750 / 5000}
training_set_proportions = [5000] 

In [None]:
for proportion in training_set_proportions:
    # Tokenize all of the sentences and map the tokens to thier word IDs.

#####################################################################################################################################    
################################################# Preparing the Data
#####################################################################################################################################    
	#creating lists to store the metrics for all iterations in one dataframe for each
	mcc_result = []
	glue_result = []
	auc_result = []

	proportion = int(proportion)  
	data_test = data[data['label'] == 1]  

	num = 25
	for _ in range (num) :

		print(f"Iteration Number {_}"+ '\n')    
        
		data = shuffle(data)   

		test_data = data_test.sample(n = 800)    
      
		indices = list(test_data.index.values)        
		train_data = data[~ data.index.isin(indices)] 
		train_valid_data = train_data.sample(n = proportion)           
#####################################################################################################################################    
################################################# Preparing the Data
#####################################################################################################################################


#####################################################################################################################################    
################################################# Start of Training   
#####################################################################################################################################
    
		sentences = train_valid_data.text.values
		labels = train_valid_data.label.values  
        
		input_ids = []
		attention_masks = []
        
		# For every sentence...
		for sent in sentences:
		# `encode_plus` will:
		#   (1) Tokenize the sentence.
		#   (2) Prepend the `[CLS]` token to the start.
		#   (3) Append the `[SEP]` token to the end.
		#   (4) Map tokens to their IDs.
		#   (5) Pad or truncate the sentence to `max_length`
		#   (6) Create attention masks for [PAD] tokens.
			encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        truncation = True,
                        return_tensors = 'pt')     # Return pytorch tensors.
        
    
			# Add the encoded sentence to the list.    
			input_ids.append(encoded_dict['input_ids'])
        
    
			# And its attention mask (simply differentiates padding from non-padding).
			attention_masks.append(encoded_dict['attention_mask'])
        
		# Convert the lists into tensors.
		input_ids = torch.cat(input_ids, dim=0)
		attention_masks = torch.cat(attention_masks, dim=0)
		labels = torch.tensor(labels)

		# Combine the training inputs into a TensorDataset.
		dataset = TensorDataset(input_ids, attention_masks, labels)
		print(f"Running on {proportion} of data set..."+ '\n')
    
		# Divide the dataset by randomly selecting samples.       
		train_size = int(0.8 * len(dataset))
		val_size = len(dataset) - train_size    
        
		train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
		print('{:>5,} training samples'.format(train_size)+'\n')
		print('{:>5,} validation samples'.format(val_size)+'\n')

		# The DataLoader needs to know our batch size for training, so we specify it 
		# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
		# size of 16 or 32.
		batch_size = 32

		# Create the DataLoaders for our training and validation sets.
		# We'll take training samples in random order. 
		train_dataloader = DataLoader(
					train_dataset,  # The training samples.
					sampler = RandomSampler(train_dataset), # Select batches randomly
					batch_size = batch_size  # Trains with this batch size.
				)

		# For validation the order doesn't matter, so we'll just read them sequentially.
		validation_dataloader = DataLoader(
					val_dataset,  # The validation samples.
					sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
					batch_size = batch_size # Evaluate with this batch size.
				)

		# Load BertForSequenceClassification, the pretrained BERT model with a single 
		# linear classification layer on top. 
		model = BertForSequenceClassification.from_pretrained(
			"emilyalsentzer/Bio_ClinicalBERT",
			num_labels = 2, # The number of output labels--2 for binary classification.
						# You can increase this for multi-class tasks.   
			output_attentions = False, # Whether the model returns attentions weights.
			output_hidden_states = False, # Whether the model returns all hidden-states.
		)

		#device = "cuda:0" if torch.cuda.is_available() else "cpu"    #<---------------------------- activate 

		# Tell pytorch to run this model on the GPU.
		#model.to(device)       #<---------------------------- activate 

		# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
		# I believe the 'W' stands for 'Weight Decay fix"

		optimizer = AdamW(model.parameters(),
						lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
						eps = 1e-8) # args.adam_epsilon  - default is 1e-8.

############################################################################  
		# Number of training epochs. The BERT authors recommend between 2 and 4. 
		epochs = 4
############################################################################

		# Total number of training steps is [number of batches] x [number of epochs]. 
		# (Note that this is not the same as the number of training samples).
		total_steps = len(train_dataloader) * epochs

		# Create the learning rate scheduler.
		scheduler = get_linear_schedule_with_warmup(optimizer, 
													num_warmup_steps = 0, # Default value in run_glue.py
													num_training_steps = total_steps)

		# This training code is based on the `run_glue.py` script here:
		# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

		# Set the seed value all over the place to make this reproducible.
		seed_val = 42

		random.seed(seed_val)
		np.random.seed(seed_val)
		torch.manual_seed(seed_val)
		torch.cuda.manual_seed_all(seed_val)

		# We'll store a number of quantities such as training and validation loss, 
		# validation accuracy, and timings.
		training_stats = []

		# Measure the total training time for the whole run.
		total_t0 = time.time()

		# For each epoch...
		#print(train_dataset[0])
    
		for epoch_i in range(0, epochs):
		
		# ========================================
		#               Training
		# ========================================
		
		# Perform one full pass over the training set.

			print(""+'\n')
			print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)+'\n')
			print('Training...'+'\n')

			# Measure how long the training epoch takes.
			t0 = time.time()

			# Reset the total loss for this epoch.
			total_train_loss = 0

			# Put the model into training mode. Don't be mislead--the call to 
			# `train` just changes the *mode*, it doesn't *perform* the training.
			# `dropout` and `batchnorm` layers behave differently during training
			# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
			model.train()

			# For each batch of training data...
			for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):

			# Progress update every 40 batches.
				if step % 40 == 0 and not step == 0:
                    
					# Calculate elapsed time in minutes.
					elapsed = format_time(time.time() - t0)

					# Report progress.
					print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

			# Unpack this training batch from our dataloader. 
			# As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
			# `batch` contains three pytorch tensors:
			#   [0]: input ids 
			#   [1]: attention masks
			#   [2]: labels             
				b_input_ids = batch[0]#.to(device)     #<---------------------------- activate 
				b_input_mask = batch[1]#.to(device)     #<---------------------------- activate 
				b_labels = batch[2]#.to(device)     #<---------------------------- activate 

				# Always clear any previously calculated gradients before performing a
				# backward pass. PyTorch doesn't do this automatically because 
				# accumulating the gradients is "convenient while training RNNs". 
				# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
				model.zero_grad()        

				# Perform a forward pass (evaluate the model on this training batch).
				# The documentation for this `model` function is here: 
				# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
				# It returns different numbers of parameters depending on what arguments
				# arge given and what flags are set. For our useage here, it returns
				# the loss (because we provided labels) and the "logits"--the model
				# outputs prior to activation. 
				outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
				loss = outputs[0]
				logits = outputs[1]

				# Accumulate the training loss over all of the batches so that we can
				# calculate the average loss at the end. `loss` is a Tensor containing a
				# single value; the `.item()` function just returns the Python value 
				# from the tensor.
				total_train_loss += loss.item()
				outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
				loss = outputs[0]
				logits = outputs[1]

				# Perform a backward pass to calculate the gradients.
				loss.backward()

				# Clip the norm of the gradients to 1.0.
				# This is to help prevent the "exploding gradients" problem.
				torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

				# Update parameters and take a step using the computed gradient.
				# The optimizer dictates the "update rule"--how the parameters are
				# modified based on their gradients, the learning rate, etc.
				optimizer.step()

				# Update the learning rate.
				scheduler.step()

		# Calculate the average loss over all of the batches.
			avg_train_loss = total_train_loss / len(train_dataloader)            

		# Measure how long this epoch took.
			training_time = format_time(time.time() - t0)

			print("")
			print("  Average training loss: {0:.2f}".format(avg_train_loss)+'\n')
			print("  Training epcoh took: {:}".format(training_time)+'\n')

		# ========================================
		#               Validation
		# ========================================
		# After the completion of each training epoch, measure our performance on
		# our validation set.

			print("")
			print("Running Validation..."+'\n')

			t0 = time.time()

			# Put the model in evaluation mode--the dropout layers behave differently
			# during evaluation.
			model.eval()

			# Tracking variables 
			total_eval_accuracy = 0
			total_eval_loss = 0
			nb_eval_steps = 0

			# Evaluate data for one epoch
			for batch in tqdm(validation_dataloader, desc="Iteration"):
			
			# Unpack this training batch from our dataloader. 
			#
			# As we unpack the batch, we'll also copy each tensor to the GPU using the `to` method.
			# `batch` contains three pytorch tensors:
			#   [0]: input ids 
			#   [1]: attention masks
			#   [2]: labels 
				b_input_ids = batch[0]#.to(device)    #<---------------------------- activate 
				b_input_mask = batch[1]#.to(device)    #<---------------------------- activate 
				b_labels = batch[2]#.to(device)    #<---------------------------- activate 

				# Tell pytorch not to bother with constructing the compute graph during
				# the forward pass, since this is only needed for backprop (training).
				with torch.no_grad():        

				# Forward pass, calculate logit predictions.
				# token_type_ids is the same as the "segment ids", which 
				# differentiates sentence 1 and 2 in 2-sentence tasks.
				# The documentation for this `model` function is here: 
				# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
				# Get the "logits" output by the model. The "logits" are the output
				# values prior to applying an activation function like the softmax.

					outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
					loss = outputs[0]
					logits = outputs[1]
				# Accumulate the validation loss.
				total_eval_loss += loss.item()

				# Move logits and labels to CPU
				logits = logits.detach().cpu().numpy()
				label_ids = b_labels.to('cpu').numpy()

				# Calculate the accuracy for this batch of test sentences, and
				# accumulate it over all batches.
				total_eval_accuracy += flat_accuracy(logits, label_ids)


			# Report the final accuracy for this validation run.
			avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
			print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

			# Calculate the average loss over all of the batches.
			avg_val_loss = total_eval_loss / len(validation_dataloader)

			# Measure how long the validation run took.
			validation_time = format_time(time.time() - t0)

			print("  Validation Loss: {0:.2f}".format(avg_val_loss))
			print("  Validation took: {:}".format(validation_time))

			# Record all statistics from this epoch.
			training_stats.append(
				{
					'epoch': epoch_i + 1,
					'Training Loss': avg_train_loss,
					'Valid. Loss': avg_val_loss,
					'Valid. Accur.': avg_val_accuracy,
					'Training Time': training_time,
					'Validation Time': validation_time})

		print("")
		print("Training complete!"+'\n')

		print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0))+'\n')
#####################################################################################################################################    
################################################# End of Training   
#####################################################################################################################################


#####################################################################################################################################    
################################################# Saving the Training Metrics 
#####################################################################################################################################
		# Display floats with two decimal places.
		pd.options.display.precision   

		# Create a DataFrame from our training statistics.
		df_stats = pd.DataFrame(data=training_stats)

		# Use the 'epoch' as the row index.
		df_stats = df_stats.set_index('epoch')

		# Display the table.

		print(f"Metrics of{train_size} of the training data set"+'\n')
		print(df_stats)
    
		df_stats.to_csv('Bio_ClinicalBERT_Metrics_of_'+str(train_size)+"_iteration_"+str(_)+'.csv')
#####################################################################################################################################    
################################################# Saving the Training Process  
#####################################################################################################################################    


#####################################################################################################################################    
################################################# Preparing Test data for Testing    
#####################################################################################################################################
		print(len(test_data))
		sentences = test_data.text.values
		labels = test_data.label.values    
    
		# Report the number of sentences.
		#print('Number of test sentences: {:,}\n'.format(df.shape[0]))

		# Tokenize all of the sentences and map the tokens to thier word IDs.
		input_ids = []
		attention_masks = []

		# For every sentence...
		for sent in sentences:
		# `encode_plus` will:
		#   (1) Tokenize the sentence.
		#   (2) Prepend the `[CLS]` token to the start.
		#   (3) Append the `[SEP]` token to the end.
		#   (4) Map tokens to their IDs.
		#   (5) Pad or truncate the sentence to `max_length`
		#   (6) Create attention masks for [PAD] tokens.
			encoded_dict = tokenizer.encode_plus(
							sent,                      # Sentence to encode.
							add_special_tokens = True, # Add '[CLS]' and '[SEP]'
							max_length = 512,           # Pad & truncate all sentences.
							pad_to_max_length = True,
							return_attention_mask = True,   # Construct attn. masks.
							truncation = True,
							return_tensors = 'pt')     # Return pytorch tensors.

			# Add the encoded sentence to the list.    
			input_ids.append(encoded_dict['input_ids'])

			# And its attention mask (simply differentiates padding from non-padding).
			attention_masks.append(encoded_dict['attention_mask'])

		# Convert the lists into tensors.
		input_ids = torch.cat(input_ids, dim=0)
		attention_masks = torch.cat(attention_masks, dim=0)
		labels = torch.tensor(labels)

		# Set the batch size.  
		batch_size = 32  

		# Create the DataLoader.
		prediction_data = TensorDataset(input_ids, attention_masks, labels)
		prediction_sampler = SequentialSampler(prediction_data)
		prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
#####################################################################################################################################    
################################################# Preparing Test data for Testing    
#####################################################################################################################################


#####################################################################################################################################    
################################################# Start of Testing    
#####################################################################################################################################
		print('Predicting labels for {:,} test sentences...'.format(len(input_ids))+'\n')

		# Put model in evaluation mode
		model.eval()

		# Tracking variables 
		predictions , true_labels = [], []
		prob = []
        
		# Predict 
		for batch in tqdm(prediction_dataloader):
            
			# Add batch to GPU if you have by moving device between the ()
			batch = tuple(t.to() for t in batch) #device   #<---------------------------- activate ( to(device) )

			# Unpack the inputs from our dataloader
			b_input_ids, b_input_mask, b_labels = batch

			# Telling the model not to compute or store gradients, saving memory and 
			# speeding up prediction
			with torch.no_grad():
                
				# Forward pass, calculate logit predictions
				outputs = model(b_input_ids, token_type_ids=None, 
								attention_mask=b_input_mask)

			logits = outputs[0]

			# Move logits and labels to CPU
			logits = logits.detach().cpu().numpy()
			label_ids = b_labels.to('cpu').numpy()

			# Store the logits as predictions 
			predictions.append(logits)
            
			# Store the true labels            
			true_labels.append(label_ids)           
        
			# Calculate the probabilitites from the logits and store them    
			probabilities = tf.math.sigmoid(logits)        
			prob.append(probabilities)        
            
		print('..........DONE..........'+'\n')        
#####################################################################################################################################    
################################################# End of Testing    
#####################################################################################################################################


#####################################################################################################################################    
################################################# Caluclating MCC   
#####################################################################################################################################
		#not working with one class prediction
		print('Positive samples: %d of %d (%.2f%%)' % (data.label.sum(), len(data.label), (data.label.sum() / len(data.label) * 100.0)))
		matthews_set = []

        
		# Evaluate each test batch using Matthew's correlation coefficient
		print('Calculating Matthews Corr. Coef. for each batch...'+'\n')

	# For each input batch...
		for i in range(len(true_labels)):

			# The predictions for this batch are a 2-column ndarray (one column for "0" 
			# and one column for "1"). Pick the label with the highest value and turn this
			# in to a list of 0s and 1s.
			pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
            
			# Calculate and store the coef for this batch.  
			matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
			matthews_set.append(matthews)

		# Combine the results across all batches. 
		flat_predictions = np.concatenate(predictions, axis=0)  
        
		# For each sample, pick the label (0 or 1) with the higher score.
		final_predictions = np.argmax(flat_predictions, axis=1).flatten() 
        
		# Combine the correct labels for each batch into a single list.
		flat_true_labels = np.concatenate(true_labels, axis=0)               
        
		# Calculate the MCC
		mcc = matthews_corrcoef(flat_true_labels, final_predictions)        
		print('Total MCC: %.3f' % mcc)
        
		# We need to save the output in dataframe for the whole 25 runs 
		mcc_result.append(float(mcc))           
#####################################################################################################################################    
################################################# Caluclating MCC   
#####################################################################################################################################   


#####################################################################################################################################    
################################################# Retreiving the Probabilities for Each Iteration   
#####################################################################################################################################
		# Combine the probabilities across all batches. 
		prob_predictions = np.concatenate(prob, axis=0)  
        
		# For each sample, pick the higher in score.
		prob_predictions = np.max(prob_predictions, axis=1).flatten()  
        
		# Storing the the propalities in dataframe    
		probability_df = pd.DataFrame({
                                       "True Labels" :flat_true_labels,      
                                       "Predicted Labels" :final_predictions,
                                       "Probabilities of Predicted Labels" :prob_predictions})    
        
		probability_df.to_csv('Bio_ClinicalBERT_Probabilities_'+str(train_size)+"_iteration_"+str(_)+'.csv', index = False)         
#####################################################################################################################################    
################################################# Retreiving the Probabilities for Each Iteration 
#####################################################################################################################################


#####################################################################################################################################    
################################################# Caluclating GLUE  
##################################################################################################################################### 
		metric = evaluate.load("glue", "mrpc")
		glue = metric.compute(predictions= final_predictions, references=flat_true_labels) 
           
		print(f"GLUE score of {train_size} training data for iteration {_}: ", glue)
        
		#we need to save the output in dataframe for the whole 25 runs 
		glue_result.append(glue)                
#####################################################################################################################################    
################################################# Caluclating GLUE  
#####################################################################################################################################      


#####################################################################################################################################    
################################################# Caluclating AUC
##################################################################################################################################### 
		#not working with one class prediction
		#auc = roc_auc_score(flat_true_labels, final_predictions)       
		#print(f"ROC AUC score of {train_size} training data for iteration {_}: ", auc)  
        
		#we need to save the output in dataframe for the whole 25 runs 
		#auc_result.append(float(auc))         
#####################################################################################################################################    
################################################# Caluclating AUC
##################################################################################################################################### 


#####################################################################################################################################    
################################################# Saving Metrics in Dataframe for each Sample Size
#####################################################################################################################################   
	#for MCC
	#MCC_df = pd.DataFrame(mcc_result, columns = ['MCC'])
	#MCC_df.to_csv('BERT_MCC_'+str(train_size)+'.csv', index = False)   

	#for GLUE    
	GLUE_df = pd.DataFrame(glue_result, columns = ['accuracy', 'f1'])      
	GLUE_df.to_csv('Bio_ClinicalBERT_GLUE_'+str(train_size)+'.csv', index = False)  

	#for AUC
	#AUC_df = pd.DataFrame(auc_result, columns = ['AUC'])
	#AUC_df.to_csv('BERT_AUC_'+str(train_size)+'.csv', index = False) 
#####################################################################################################################################    
################################################# Saving Metrics in Dataframe for each Sample Size
#####################################################################################################################################                                     

Iteration Number 0





Running on 5000 of data set...

4,000 training samples

1,000 validation samples



Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model




Training...



Iteration:  32%|███▏      | 40/125 [08:57<18:11, 12.84s/it]

  Batch    40  of    125.    Elapsed: 0:08:57.


Iteration:  64%|██████▍   | 80/125 [17:40<10:49, 14.42s/it]

  Batch    80  of    125.    Elapsed: 0:17:41.


Iteration:  96%|█████████▌| 120/125 [26:30<01:03, 12.65s/it]

  Batch   120  of    125.    Elapsed: 0:26:31.


Iteration: 100%|██████████| 125/125 [27:34<00:00, 13.23s/it]



  Average training loss: 0.24

  Training epcoh took: 0:27:34


Running Validation...



Iteration: 100%|██████████| 32/32 [01:01<00:00,  1.91s/it]


  Accuracy: 0.96
  Validation Loss: 0.13
  Validation took: 0:01:01



Training...



Iteration:  32%|███▏      | 40/125 [08:31<18:21, 12.96s/it]

  Batch    40  of    125.    Elapsed: 0:08:32.


Iteration:  64%|██████▍   | 80/125 [17:10<09:26, 12.60s/it]

  Batch    80  of    125.    Elapsed: 0:17:11.


Iteration:  96%|█████████▌| 120/125 [25:36<01:05, 13.14s/it]

  Batch   120  of    125.    Elapsed: 0:25:37.


Iteration: 100%|██████████| 125/125 [26:38<00:00, 12.79s/it]



  Average training loss: 0.12

  Training epcoh took: 0:26:39


Running Validation...



Iteration: 100%|██████████| 32/32 [01:00<00:00,  1.90s/it]


  Accuracy: 0.97
  Validation Loss: 0.10
  Validation took: 0:01:01



Training...



Iteration:  32%|███▏      | 40/125 [08:28<18:09, 12.82s/it]

  Batch    40  of    125.    Elapsed: 0:08:28.


Iteration:  64%|██████▍   | 80/125 [17:07<10:36, 14.15s/it]

  Batch    80  of    125.    Elapsed: 0:17:07.


Iteration:  96%|█████████▌| 120/125 [25:16<01:03, 12.72s/it]

  Batch   120  of    125.    Elapsed: 0:25:16.


Iteration: 100%|██████████| 125/125 [26:19<00:00, 12.63s/it]



  Average training loss: 0.09

  Training epcoh took: 0:26:19


Running Validation...



Iteration: 100%|██████████| 32/32 [01:06<00:00,  2.08s/it]


  Accuracy: 0.96
  Validation Loss: 0.11
  Validation took: 0:01:07



Training...



Iteration:  32%|███▏      | 40/125 [08:18<16:39, 11.76s/it]

  Batch    40  of    125.    Elapsed: 0:08:19.


Iteration:  64%|██████▍   | 80/125 [16:49<08:58, 11.96s/it]

  Batch    80  of    125.    Elapsed: 0:16:50.


Iteration:  96%|█████████▌| 120/125 [25:19<00:58, 11.68s/it]

  Batch   120  of    125.    Elapsed: 0:25:20.


Iteration: 100%|██████████| 125/125 [26:25<00:00, 12.69s/it]



  Average training loss: 0.07

  Training epcoh took: 0:26:26


Running Validation...



Iteration: 100%|██████████| 32/32 [01:00<00:00,  1.90s/it]


  Accuracy: 0.97
  Validation Loss: 0.10
  Validation took: 0:01:01

Training complete!

Total training took 1:51:08 (h:mm:ss)

Metrics of4000 of the training data set

       Training Loss  Valid. Loss  Valid. Accur. Training Time Validation Time
epoch                                                                         
1           0.236475     0.129589       0.958984       0:27:34         0:01:01
2           0.123799     0.100925       0.970703       0:26:39         0:01:01
3           0.093389     0.114627       0.961914       0:26:19         0:01:07
4           0.069561     0.098223       0.966797       0:26:26         0:01:01
800
Predicting labels for 800 test sentences...



  0%|          | 0/25 [00:00<?, ?it/s]2023-01-03 14:58:05.955695: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-03 14:58:08.454424: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13697 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
100%|██████████| 25/25 [00:49<00:00,  1.97s/it]


..........DONE..........

Positive samples: 2268 of 6726 (33.72%)
Calculating Matthews Corr. Coef. for each batch...

Total MCC: 0.000
GLUE score of 4000 training data for iteration 0:  {'accuracy': 0.925, 'f1': 0.961038961038961}
Iteration Number 1





Running on 5000 of data set...

4,000 training samples

1,000 validation samples



Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model




Training...



Iteration:  32%|███▏      | 40/125 [08:42<18:37, 13.14s/it]

  Batch    40  of    125.    Elapsed: 0:08:43.


Iteration:  64%|██████▍   | 80/125 [17:27<09:37, 12.83s/it]

  Batch    80  of    125.    Elapsed: 0:17:27.


Iteration:  96%|█████████▌| 120/125 [26:15<01:10, 14.09s/it]

  Batch   120  of    125.    Elapsed: 0:26:15.


Iteration: 100%|██████████| 125/125 [27:15<00:00, 13.09s/it]



  Average training loss: 0.26

  Training epcoh took: 0:27:16


Running Validation...



Iteration: 100%|██████████| 32/32 [00:56<00:00,  1.78s/it]


  Accuracy: 0.96
  Validation Loss: 0.13
  Validation took: 0:00:57



Training...



Iteration:  32%|███▏      | 40/125 [08:34<18:02, 12.74s/it]

  Batch    40  of    125.    Elapsed: 0:08:34.


Iteration:  64%|██████▍   | 80/125 [17:07<09:34, 12.77s/it]

  Batch    80  of    125.    Elapsed: 0:17:07.


Iteration:  96%|█████████▌| 120/125 [25:50<01:06, 13.27s/it]

  Batch   120  of    125.    Elapsed: 0:25:51.


Iteration: 100%|██████████| 125/125 [26:55<00:00, 12.93s/it]



  Average training loss: 0.13

  Training epcoh took: 0:26:56


Running Validation...



Iteration: 100%|██████████| 32/32 [01:04<00:00,  2.01s/it]


  Accuracy: 0.96
  Validation Loss: 0.12
  Validation took: 0:01:04



Training...



Iteration:  32%|███▏      | 40/125 [08:45<18:33, 13.10s/it]

  Batch    40  of    125.    Elapsed: 0:08:46.


Iteration:  64%|██████▍   | 80/125 [17:22<10:27, 13.95s/it]

  Batch    80  of    125.    Elapsed: 0:17:22.


Iteration:  85%|████████▍ | 106/125 [22:51<03:54, 12.32s/it]

In [None]:
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
# Make new save directory for each model (i.e. proportion size)
#output_dir = './model_save_/'

# Create output directory if needed
#if not os.path.exists(output_dir):
#os.makedirs(output_dir)

#print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
#model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
#model_to_save.save_pretrained(output_dir)
#tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

# !ls -l --block-size=K ./model_save/

# !ls -l --block-size=M ./model_save/pytorch_model.bin

# Load a trained model and vocabulary that you have fine-tuned
# model = model_class.from_pretrained(output_dir)
# tokenizer = tokenizer_class.from_pretrained(output_dir)

# # Copy the model to the GPU.
# model.to(device) #<---------------------------- activate 