In [None]:
!pip install transformers

In [None]:
# Show the GPU(s) visible to this runtime
!nvidia-smi

In [None]:
import tensorflow as tf
# Ask TensorFlow for the name of a GPU device, if one is available
tf.test.gpu_device_name()

In [None]:
from tensorflow.python.client import device_lib
# List the devices TensorFlow can see in this process
device_lib.list_local_devices()

In [None]:
# Print the machine's memory statistics
!cat /proc/meminfo

In [None]:
import torch

# Prefer the GPU when CUDA is usable, and report what was found.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")
if cuda_ready:
  print(torch.cuda.device_count())
  print(torch.cuda.get_device_name(0))


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
#import pytorch_lightning as pl
#from pytorch_lightning.metrics.functional.classification import auroc
#from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so files stored in "My Drive" can be read
drive.mount("/content/drive/")

In [None]:
txt_path='/content/drive/My Drive/Colab Notebooks/AnonymizedClinicalAbbreviationsAndAcronymsDataSet.txt'

# The file is pipe-delimited with no header row, so pandas numbers the columns by
# position; the first seven positions are given readable names.
column_names = ['Abbreviation', 'Expansion', 'ABB_frm', "start_pos", "end_pos", "info", "context"]
data_pd = pd.read_csv(txt_path, sep="|", header=None, encoding='cp1252')
data_pd = data_pd.rename(columns=dict(enumerate(column_names)))
print(data_pd.shape)

In [None]:
# Keep only the rows for the abbreviation "AB". The explicit copy makes data_pd an
# independent frame, so later modifications do not act on a slice of the full table
# (the situation pandas reports with SettingWithCopyWarning).
data_pd = data_pd[data_pd['Abbreviation'] == 'AB'].copy()
print(data_pd.shape)

In [None]:
# Number of distinct expansions before filtering
print(len(data_pd["Expansion"].unique()))

# Remove the rows labelled with any of these three Expansion values. One boolean mask
# replaces three in-place drop-by-index calls: it filters on the values themselves
# (so a repeated index label cannot remove extra rows) and rebinds data_pd to a new
# frame instead of mutating it.
excluded_expansions = ["UNSURED SENSE", "GENERAL ENGLISH", "NAME"]
data_pd = data_pd[~data_pd['Expansion'].isin(excluded_expansions)].copy()

# Dataset size after filtering
print(data_pd.shape)

In [None]:
# Function to assign a numerical value to the expansion
def func(unique_expansion, ex):
  """Return the integer id stored for an expansion, or None if it is unknown.

  Args:
    unique_expansion: dict mapping expansion text to its integer id.
    ex: expansion to look up; it is converted with str() before the lookup.

  Returns:
    The value stored under str(ex), or None when there is no such key.
  """
  # Direct hash lookup instead of scanning every (key, value) pair.
  return unique_expansion.get(str(ex))


In [None]:
Expansion_unique = data_pd["Expansion"].unique()
context = data_pd["context"].values

# One integer id per distinct expansion, in order of first appearance. The ids are
# sized from the data instead of a fixed range of 348, so no expansion is silently
# left without an id when the number of distinct expansions differs.
num = list(range(len(Expansion_unique)))
unique_expansion = dict(zip(Expansion_unique, num))

# Label for every row, in data_pd's row order (one vectorised lookup instead of
# scanning the whole mapping once per row).
all_label = data_pd["Expansion"].map(unique_expansion).tolist()

In [None]:
# Map each distinct expansion to its position in Expansion_unique
word2index_dict = dict(zip(Expansion_unique, range(len(Expansion_unique))))

In [None]:

# Expansion/label lookup table, written to disk pipe-separated
label_df = pd.DataFrame(
    {
        'expansion': list(word2index_dict.keys()),
        'label': list(word2index_dict.values()),
    },
    columns = ['expansion', 'label'],
)
label_df.to_csv('label_expansion.tsv', sep = '|')

In [None]:
# function to clean the text
import re
import string
def cleaning(context_):
  """Strip marker tokens, punctuation and digit-bearing words from a context string.

  Steps, in order:
    1. remove every run of non-space characters that starts with "_%#";
    2. replace each ASCII punctuation character with a space;
    3. delete every word that contains a digit;
    4. collapse runs of two or more whitespace characters into one space.

  Args:
    context_: the text to clean.

  Returns:
    The cleaned string (leading/trailing single spaces are not stripped).
  """
  # Raw strings keep \S, \w, \d and \s as regex escapes; in a normal string literal
  # they are invalid string escapes, which newer Python versions warn about.
  x = re.sub(r"_%#\S+", "", context_)
  x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
  x = re.sub(r'\w*\d+\w*', '', x)
  x = re.sub(r'\s{2,}', " ", x)
  return x


In [None]:
# Modelling table: the raw context text (the cleaning() pass is not applied here),
# the expansion string, and its integer label.
filtered_data = pd.DataFrame({
    'content': data_pd['context'],
    'expansion': data_pd['Expansion'],
    'label': all_label,
})

In [None]:
# Spot-check the label stored under index label 5000
filtered_data.loc[5000, 'label']

In [None]:
# Dataset wrapper around the labelled DataFrame
class DisambiguateDataset(Dataset):
  """Torch Dataset that tokenizes (expansion, context) pairs on access.

  Each item is a dict with `input_ids`, `attention_mask`, `token_type_ids`
  (each flattened to 1-D) and `label_` (a long tensor).

  NOTE(review): the `expansion` text is fed to the tokenizer as the first
  segment while `label` comes from the same row; if the label is derived from
  the expansion, the model can read the answer from its input — confirm this
  is intended.
  """

  def __init__(self, data: pd.DataFrame, tokenizer, max_token_len ):
    """
    Args:
      data: frame with `content`, `expansion` and `label` columns.
      tokenizer: object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
      max_token_len: length every encoded pair is padded/truncated to.
    """
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return self.data.shape[0]

  def __getitem__(self, index: int):
    """Encode the row at positional `index` and return its tensors."""
    data_row = self.data.iloc[index]
    context = data_row['content']
    expansion = data_row['expansion']
    label_ = data_row['label']

    encoding = self.tokenizer.encode_plus(
        expansion, context,
        add_special_tokens = True,
        # Honour the configured length instead of a hard-coded 128.
        max_length = self.max_token_len,
        return_token_type_ids = True,
        padding = "max_length",
        truncation = True,
        return_attention_mask = True,
        return_tensors = "pt"
    )
    # The tokenizer returns a leading batch dimension of 1; flatten() removes it.
    return dict(
      input_ids = encoding["input_ids"].flatten(),
      attention_mask = encoding["attention_mask"].flatten(),
      token_type_ids = encoding["token_type_ids"].flatten(),
      label_ = torch.tensor(label_, dtype = torch.long)
    )



In [None]:
RANDOM_SEED = 42
MAX_LEN = 128
BATCH_SIZE = 8
EPOCHS = 2

# Seed NumPy and PyTorch as well, so weight initialisation, dropout and DataLoader
# shuffling are repeatable — not only the split below.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# 80% goes to training; the remaining 20% is halved into validation and test.
train_df, holdout_df = train_test_split(filtered_data, test_size=0.2, random_state = RANDOM_SEED)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state = RANDOM_SEED)
train_df.shape, val_df.shape, test_df.shape


((29560, 3), (3695, 3), (3695, 3))

In [None]:
# Show the column names of the test split
test_df.columns

Index(['content', 'expansion', 'label'], dtype='object')

In [None]:
# Preview the modelling table. The frame built earlier is named filtered_data;
# `filtered_df` is not defined anywhere in this notebook and raised NameError.
filtered_data.head(5)

In [None]:
# Download Bio_ClinicalBERT
# NOTE(review): the ms_bert cell that follows rebinds both `tokenizer` and `model`, so
# when the notebook is run top to bottom the objects loaded here are replaced —
# confirm which checkpoint is meant to be used.
# NOTE(review): BertForNextSentencePrediction and BertTokenizer are imported but not
# used in the visible part of this notebook.

from transformers import AutoTokenizer, AutoModel, BertForNextSentencePrediction, BertTokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
# Download ms_bert
# Loads the "NLP4H/ms_bert" checkpoint and its tokenizer; these assignments replace
# any `model` / `tokenizer` bound earlier in the session.

from transformers import AutoTokenizer, AutoModel
model_name = "NLP4H/ms_bert"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (heading, slice of the parameter list) for each group that is displayed
sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for heading, group in sections:
    print(heading)
    for name, tensor in group:
        # parameter name left-aligned, shape tuple right-aligned
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
  """Wrap a DataFrame in a DisambiguateDataset and return a DataLoader over it.

  Args:
    df: frame handed to DisambiguateDataset.
    tokenizer: tokenizer handed to DisambiguateDataset.
    max_len: maximum token length handed to DisambiguateDataset.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples on every pass. Defaults to True, which is
      what this function always did; pass False for loaders whose batch order
      must be the same on every iteration.

  Returns:
    A torch.utils.data.DataLoader over the dataset.
  """
  ds = DisambiguateDataset(df, tokenizer, max_len)

  return torch.utils.data.DataLoader(ds, batch_size = batch_size, shuffle = shuffle)

In [None]:
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)

# Validation and test loaders are built without shuffling: the notebook iterates the
# test loader more than once (once for predictions, once to decode the text), and
# those passes only line up row-for-row when the batch order is fixed.
val_data_loader = DataLoader(DisambiguateDataset(val_df, tokenizer, MAX_LEN), batch_size = BATCH_SIZE, shuffle = False)
test_data_loader = DataLoader(DisambiguateDataset(test_df, tokenizer, MAX_LEN), batch_size = BATCH_SIZE, shuffle = False)

In [None]:
# Pull one batch from the validation loader to inspect its keys
data = next(iter(val_data_loader))
data.keys()
  

In [None]:
# Smoke-test the classifier on the sample batch. Keyword arguments are used because
# DisambiguateClassifier.forward declares (input_ids, token_type_ids, attention_mask);
# passing the mask second by position would feed it in as token_type_ids.
# NOTE(review): clinic_model is created in a later cell — that cell must run first.
out = clinic_model(
    input_ids = data['input_ids'].to(device),
    attention_mask = data['attention_mask'].to(device),
    token_type_ids = data['token_type_ids'].to(device),
)
out[0]

In [None]:
# Scratch check of the prediction/loss path on the sample batch.
# NOTE(review): loss_fn is assigned in a later cell, so on a fresh kernel this cell
# fails unless that cell has already been run.
correct_predictions = 0
# torch.max over dim=1 returns (values, indices); the indices are the predicted classes
_, preds = torch.max(out, dim=1)
# This expression is not the cell's last line, so it displays nothing
preds[0] , data['label_']
loss = loss_fn(out,data['label_'].to(device))
loss
#correct_predictions += torch.sum(preds == data['label_'].to(device))
#correct_predictions
#preds[0], data['label_'].shape

In [None]:
# NOTE(review): `out` was last assigned from clinic_model(...), whose forward returns the
# linear stack's output tensor, so the string keys used here would not index it. This
# line presumably dates from when `out` held the raw encoder output — confirm or remove.
out['pooler_output'].shape , out['hidden_states'][0].shape

In [None]:
class DisambiguateClassifier(nn.Module):
  """Encoder plus a two-layer head producing 348 logits from the first token's state.

  NOTE(review): `n_classes` is accepted but not used — the head width is fixed
  at 348 and the encoder is taken from the module-level `model`. The notebook
  instantiates DisambiguateClassifier with the encoder object as its argument,
  so the signature is left unchanged; confirm the intended design.
  """

  def __init__(self, n_classes):
    super(DisambiguateClassifier, self).__init__()
    self.model = model
    self.linear_relu_stack = nn.Sequential(
      nn.Linear(self.model.config.hidden_size, 512),
      nn.ReLU(),
      nn.Linear(512, 348),
    )

  def forward(self, input_ids, token_type_ids, attention_mask ):
    """Return the head's logits for a batch.

    Args:
      input_ids: token ids for the batch.
      token_type_ids: segment ids for the batch.
      attention_mask: padding mask for the batch.

    Returns:
      Output of the linear stack applied to the encoder's `last_hidden_state`
      at sequence position 0.
    """
    # Pass by keyword: the encoder's positional order is not this method's
    # parameter order, so positional passing hands token_type_ids to the
    # attention_mask slot and vice versa.
    output = self.model(
        input_ids = input_ids,
        token_type_ids = token_type_ids,
        attention_mask = attention_mask,
    )
    out = self.linear_relu_stack(output['last_hidden_state'][:,0,:])
    return out
    


In [None]:
# Select the device and move the classifier onto it.
# NOTE(review): in file order this cell runs before the second DisambiguateClassifier
# definition further down, so it instantiates the class defined above (whose __init__
# parameter is named n_classes) — confirm which definition is intended.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
clinic_model = DisambiguateClassifier(model)
clinic_model.to(device)

In [None]:
class DisambiguateClassifier(nn.Module):
  """Pretrained encoder followed by a three-layer MLP head.

  The head takes the encoder's final hidden state at sequence position 0
  (768 wide), passes it through 512 and 340 units — each followed by dropout
  0.3 and ReLU — and ends in 498 raw logits; no softmax is applied.
  """

  def __init__(self, model):
    super().__init__()
    self.model = model
    head_layers = [
        nn.Linear(768, 512),
        nn.Dropout(.3),
        nn.ReLU(),
        nn.Linear(512, 340),
        nn.Dropout(.3),
        nn.ReLU(),
        nn.Linear(340, 498),
    ]
    self.linear_relu_stack = nn.Sequential(*head_layers)

  def forward(self, input_ids, token_type_ids, attention_mask ):
    """Encode the batch and return the head's logits."""
    encoder_kwargs = dict(
        input_ids = input_ids,
        token_type_ids = token_type_ids,
        attention_mask = attention_mask,
    )
    first_token_state = self.model(**encoder_kwargs)['last_hidden_state'][:, 0, :]
    return self.linear_relu_stack(first_token_state)


        
       
       

In [None]:
# AdamW and the warmup scheduler are not imported anywhere above (the transformers
# import in the imports cell is commented out), so bring them into scope here.
# torch.optim.AdamW is used; note its default weight_decay is 0.01.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(clinic_model.parameters(), lr = 1e-5 )
# The scheduler is stepped once per batch, so it spans every batch of every epoch
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
  """Run one optimisation pass over data_loader.

  Args:
    model: module called with input_ids, attention_mask and token_type_ids.
    data_loader: iterable of batches; each batch is a dict with `input_ids`,
      `attention_mask`, `token_type_ids` and `label_` tensors.
    loss_fn: callable taking (outputs, labels) and returning a scalar loss.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a 0-dim float64 CPU tensor equal to
    correct predictions / n_examples; mean_loss is np.mean of the batch losses.
  """
  model = model.train()

  losses = []
  # Kept as a Python int so the returned accuracy lives on the CPU (usable by
  # plotting code) and an empty loader does not fail on `int.double()`.
  correct_predictions = 0

  for d in data_loader:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    token_type_ids = d['token_type_ids'].to(device)
    label = d['label_'].to(device)

    # Keywords, not positions: DisambiguateClassifier.forward declares
    # (input_ids, token_type_ids, attention_mask), so passing the mask second
    # by position would feed it in as token_type_ids.
    outputs = model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        token_type_ids = token_type_ids,
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, label)

    correct_predictions += torch.sum(preds == label).item()
    losses.append(loss.item())

    loss.backward()
    # Clip the gradient norm before the optimizer step
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return torch.tensor(correct_predictions, dtype=torch.double) / n_examples, np.mean(losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate the model over data_loader without updating it.

  Every misclassified example is printed as "predicted , actual , decoded text";
  decoding uses the notebook-level `tokenizer`.

  Args:
    model: module called with input_ids, attention_mask and token_type_ids.
    data_loader: iterable of batches; each batch is a dict with `input_ids`,
      `attention_mask`, `token_type_ids` and `label_` tensors.
    loss_fn: callable taking (outputs, labels) and returning a scalar loss.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a 0-dim float64 CPU tensor equal to
    correct predictions / n_examples; mean_loss is np.mean of the batch losses.
  """
  model = model.eval()
  losses = []
  # Python int accumulator: keeps the returned accuracy on the CPU and avoids
  # failing on `int.double()` when the loader yields nothing.
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d['input_ids'].to(device)
      attention_mask = d['attention_mask'].to(device)
      token_type_ids = d['token_type_ids'].to(device)
      label = d['label_'].to(device)

      # Keywords, not positions: DisambiguateClassifier.forward declares
      # (input_ids, token_type_ids, attention_mask), so passing the mask second
      # by position would feed it in as token_type_ids.
      outputs = model(
          input_ids = input_ids,
          attention_mask = attention_mask,
          token_type_ids = token_type_ids,
      )

      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, label)

      # Report each wrong prediction together with the decoded input
      for p, x, ii in zip(preds, label, input_ids):
        if p.item() != x.item():
          print(p.item(),",", x.item(),",",tokenizer.decode(ii,skip_special_tokens= True))

      correct_predictions += torch.sum(preds == label).item()
      losses.append(loss.item())

  return torch.tensor(correct_predictions, dtype=torch.double) / n_examples, np.mean(losses)

In [None]:
# Train for EPOCHS epochs, recording metrics per epoch and checkpointing the best model.
from collections import defaultdict
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
history = defaultdict(list)

best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    clinic_model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(train_df)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    clinic_model,
    val_data_loader,
    loss_fn,
    device,
    len(val_df)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    # Save the classifier being trained (encoder + head). `model` is only the
    # encoder, so saving its state_dict would lose the classification head.
    torch.save(clinic_model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
# The recorded accuracies are 0-dim tensors that may live on the GPU; matplotlib
# cannot convert those, so turn each into a plain float first.
plt.plot([float(acc) for acc in history['train_acc']], label='train accuracy')
plt.plot([float(acc) for acc in history['val_acc']], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

In [None]:
# Evaluate on the test split; the loss half of the result is discarded.
test_acc, _ = eval_model(clinic_model, test_data_loader, loss_fn, device, len(test_df))

test_acc.item()

In [None]:
def get_predictions(model, data_loader):
  """Run the model over data_loader and collect predictions and true labels.

  Args:
    model: module called with input_ids, attention_mask and token_type_ids.
    data_loader: iterable of batches; each batch is a dict with `input_ids`,
      `attention_mask`, `token_type_ids` and `label_` tensors.

  Returns:
    (predictions, real_values): two 1-D CPU tensors with one entry per example,
    in the order the loader yielded them. Both are empty when the loader
    yields no batches.
  """
  model = model.eval()

  # Use the device the model's parameters live on rather than a notebook global;
  # a model without parameters is run on the CPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  batch_preds = []
  batch_labels = []

  with torch.no_grad():
    for d in data_loader:
      outputs = model(
        input_ids = d["input_ids"].to(device),
        attention_mask = d["attention_mask"].to(device),
        token_type_ids = d["token_type_ids"].to(device),
      )

      # The index of the largest output per row is the predicted class
      _, preds = torch.max(outputs, dim=1)

      batch_preds.append(preds.cpu())
      batch_labels.append(d["label_"].cpu())

  if not batch_preds:
    empty = torch.empty(0, dtype=torch.long)
    return empty, empty.clone()

  # Concatenate whole batches instead of stacking one scalar tensor per example
  predictions = torch.cat(batch_preds)
  real_values = torch.cat(batch_labels)

  return predictions, real_values

In [None]:
# Predicted and true labels for the test loader
y_pred, y_test = get_predictions(clinic_model,test_data_loader)


In [None]:
# Decode every test example's token ids back into text.
# NOTE(review): this is a separate pass over test_data_loader; the rows only line up
# with y_pred / y_test if the loader yields the same order on every pass.
text = [
  tokenizer.decode(token_ids, skip_special_tokens=True)
  for batch in test_data_loader
  for token_ids in batch['input_ids']
]

In [None]:
# Predictions as plain Python numbers, one per test example
prediction = y_pred.tolist()
len(prediction)

3695

In [None]:
# True labels as plain Python numbers, one per test example
real = y_test.tolist()
len(real)

3695

In [None]:
# One row per test example: decoded text, predicted label, true label
test_result_df = pd.DataFrame({
    'text': text,
    'prediction': prediction,
    'real': real,
})


In [None]:
# Write the per-example results to disk, pipe-separated
test_result_df.to_csv('test_result_prediction.tsv', sep = '|')

In [None]:
# Preview the first predictions rather than printing one tensor per test example,
# which floods the cell output with thousands of lines.
print(y_pred[:20].tolist())

In [None]:
d = next(iter(test_data_loader))
print(d.keys())

# Gather the token ids of every test batch into a single tensor with one row per
# example. Concatenating the batches gives the same result as stacking the rows one
# by one, and avoids rebinding the builtin name `input`.
all_input = torch.cat([batch['input_ids'] for batch in test_data_loader]).cpu()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'label_'])


In [None]:
# all_input is already a tensor after the previous cell, and torch.stack expects a
# sequence of tensors; only stack when it is still a list so this cell can be re-run.
if not torch.is_tensor(all_input):
  all_input = torch.stack(all_input).cpu()

In [None]:
predicted_label = [73,298,34,107,148,325,201,79,285,63,68,93,233,143,281,85,45,34,117,281,155,298,89,0,344,149]

# label -> expansion lookup built once from label_df, instead of scanning every row
# of the frame for each predicted label (and without positional access into a
# string-labelled row).
label_to_expansion = dict(zip(label_df['label'], label_df['expansion']))

# Labels with no entry in label_df are skipped, as before.
pred_expansion = [label_to_expansion[i] for i in predicted_label if i in label_to_expansion]
pred_expansion