# GET BERT EMBEDDINGS
- Get Bert embeddings of extracted terms

In [None]:
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt


import numpy as np 
import pandas as pd 
import os
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, fbeta_score, precision_score, recall_score, accuracy_score
import torch
import re

from transformers import CamembertModel, CamembertTokenizer, CamembertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

os.environ['CUDA_VISIBLE_DEVICES'] = "1" 


import torch
from torch.utils.data import Dataset, TensorDataset, RandomSampler, SequentialSampler, DataLoader
from torch.nn import BCEWithLogitsLoss, Sigmoid


from sklearn.model_selection import train_test_split
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device',device)

# the model : 
bert_name = "/export/home/cse200093/word-embedding/training-from-scratch-2021-08-13/"
# bert_name ="/export/home/opt/data/camembert/v0/camembert-base/"

lower = True
batch_size = 32
max_epochs = 50


Max_len = 128 
val_rate = .2
debug = 0

file_path = './out/15_02_22_CRH_VAL_FastText.pkl' 
OUT_FILE = './out/22_02_22_CRH_VAL_FastText_EDS_BERT.pkl' 

In [2]:
# Pre-processing of the text : removing the non-alphanumeric data : 
import unidecode

def strip(text, lower=False):
    # pattern = r"[^a-zA-z0-9\s,']"
    #text = unidecode.unidecode(text)
    text = re.sub(r'^-','',text, count = 1)
    if lower:
        return text.lower()
    else:
        return text


In [3]:
# load the df with fasttext embeddings
df = pd.read_pickle(file_path)
df['term'] = df['term'].apply(strip, args=(lower,)) 
df.sample(5)

Unnamed: 0,source,gender,term,fastext_embeddings,label
23648,CRH_val_sample_130.ann,m,"phénomène de raynaud touchant les deux mains, ...","[13.00034, 9.85812, -14.741971, -18.676361, 21...",cardiovasculaires
21372,CRH_val_sample_7.ann,m,hématome sous-duraux bilatéraux aiguës sur chr...,"[18.716938, -1.0977753, -34.151524, -17.793055...",nerveux
12598,CRH_val_sample_29.ann,f,mauvais contrôle clinico-biologique,"[9.735994, 3.8095098, -10.086082, -15.766283, ...",etatsosy
4138,n_CRH_val_sample_41.ann,m,oma récidivantes,"[6.255917, -0.69583166, -2.5243216, -17.01729,...",etatsosy
18378,CRH_val_sample_54.ann,f,ulcère des membres inférieurs,"[19.14647, -12.786297, -6.4342065, -4.55314, 4...",peau


In [4]:
# load the tokenizer : 
tokenizer = CamembertTokenizer.from_pretrained(bert_name, do_lower_case = lower)
train_texts = list(df['term'])


# tokenization of the train, val and test dataset : 
train_tokenizer_texts = tokenizer(train_texts, return_tensors = 'pt', padding=True, truncation=True, max_length = Max_len)
print('train_size ', train_tokenizer_texts['input_ids'].size())
attention_masks = train_tokenizer_texts['attention_mask']


train_size  torch.Size([24876, 46])


In [47]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [1] * len(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [48]:
# Load pre-trained model (weights)
model = CamembertModel.from_pretrained(bert_name,
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

Some weights of CamembertModel were not initialized from the model checkpoint at /export/home/cse200093/word-embedding/training-from-scratch-2021-08-13/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
model.eval()
with torch.no_grad():
    outputs = model(tokens_tensor, segments_tensors)

In [73]:
print ("Number of layers:", len(hidden_states), "(initial embeddings + 12 BERT layers)")

layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0
print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0
print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

token_embeddings = torch.stack(hidden_states, dim=0)
token_embeddings.shape

# Remove dimension 1, the "batches".
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings.size()

Number of layers: 13 (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 34
Number of hidden units: 768


torch.Size([13, 34, 768])

In [74]:
# Swap dimensions 0 and 1.
token_embeddings = token_embeddings.permute(1,0,2)

token_embeddings.size()

torch.Size([34, 13, 768])

In [76]:
# Stores the token vectors, with shape [22 x 3,072]
token_vecs_cat = []# `token_embeddings` is a [22 x 12 x 768] tensor.# For each token in the sentence...
for token in token_embeddings:
    
    # `token` is a [12 x 768] tensor# Concatenate the vectors (that is, append them together) from the last 
    # four layers.
    # Each layer vector is 768 values, so `cat_vec` is length 3,072.
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    
    # Use `cat_vec` to represent `token`.
    token_vecs_cat.append(cat_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

Shape is: 34 x 3072


In [78]:
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:

    # `token` is a [12 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))


Shape is: 34 x 768


In [7]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('device', device)
batch_size = 16

BERT_embeddings = []

# Load pre-trained model (weights)
model = CamembertModel.from_pretrained(bert_name,
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

model.to(device)

dataset = TensorDataset(train_tokenizer_texts['input_ids'], train_tokenizer_texts['attention_mask'])
sampler = SequentialSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)


model.eval()
for step, batch in enumerate(tqdm(dataloader)): 
    #Transfer batch to GPUs --> not adapted for EDS computation
    batch = tuple(t.to(device) for t in batch)
    #We dont need to update gradients as we are just predicting
    with torch.no_grad():
        #Bring up the next batch of input_texts and attention_masks 
        b_input_ids, b_input_mask = batch
        #Forward propogate the inputs and get output as logits
        outputs = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)
        
        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]
        
        # To get a single vector for our entire sentence we have 
        # multiple application-dependent strategies, 
        # but a simple approach is to average the second to last 
        # hiden layer of each token producing a single 1024 length vector.
        # token_vecs = hidden_states[-2:][0]
        token_vecs = hidden_states[-2:][0]
        sentence_embedding = torch.mean(token_vecs, dim=1).to('cpu').numpy()
        BERT_embeddings.extend(sentence_embedding)
    


device cuda


Some weights of CamembertModel were not initialized from the model checkpoint at /export/home/cse200093/word-embedding/training-from-scratch-2021-08-13/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1555/1555 [00:57<00:00, 26.98it/s]


In [8]:
token_vecs_2 = hidden_states[-2][0]
print(token_vecs.shape, token_vecs_2.shape)
sentence_embedding_2 = torch.mean(token_vecs_2, dim=0)
print ("Our final sentence embedding vector of shape:", sentence_embedding_2.size())


torch.Size([12, 46, 768]) torch.Size([46, 768])
Our final sentence embedding vector of shape: torch.Size([768])


In [9]:
len(BERT_embeddings), len(BERT_embeddings[0])

(24876, 768)

In [10]:
# Sanity checks : check if the embeddings of the same word 'SAPL' idx 8703 and 9214 have the same embeddings 
#BERT_embeddings[9214], BERT_embeddings[8703]

In [10]:
# check the similarity 
from scipy.spatial.distance import cosine
idx_1 = 6518
idx_2 = 9223
close_w = cosine(BERT_embeddings[idx_1], BERT_embeddings[idx_2])
print('similarity between',df['term'][idx_1],'and',df['term'][idx_2],'=', close_w)

similarity between dénutries and aeg = 0.19713497161865234


In [11]:
# check the similarity 
from scipy.spatial.distance import cosine
idx_1 = 9223
idx_2 = 1468
close_w = cosine(BERT_embeddings[idx_1], BERT_embeddings[idx_2])
print('similarity between',df['term'][idx_1],'and',df['term'][idx_2],'=', close_w)

similarity between aeg and échec car rein droit atrophique = 0.11182421445846558


In [12]:
# check the similarity 
from scipy.spatial.distance import cosine
idx_1 = 4514
idx_2 = 12697
close_w = cosine(BERT_embeddings[idx_1], BERT_embeddings[idx_2])
print('similarity between',df['term'][idx_1],'and',df['term'][idx_2],'=', close_w)

similarity between essoufflement and atteinte péricardique = 0.11328792572021484


In [18]:
# check the similarity 
from scipy.spatial.distance import cosine
idx_1 =10644
idx_2 = 13693
close_w = cosine(BERT_embeddings[idx_1], BERT_embeddings[idx_2])
print('similarity between "',df['term'][idx_1],'" and "',df['term'][idx_2],'" =', close_w)

similarity between " lupus " and " lupus érythémateux disséminé " = 0.19288426637649536


In [17]:
# check the similarity 
from scipy.spatial.distance import cosine
idx_1 = 2043
idx_2 = 10644
close_w = cosine(BERT_embeddings[idx_1], BERT_embeddings[idx_2])
print('similarity between',df['term'][idx_1],'and',df['term'][idx_2],'=', close_w)

similarity between lupus cutané et articulaire and lupus = 0.18241572380065918


In [19]:
df.sample(30)

Unnamed: 0,source,gender,term,fastext_embeddings,label
2061,CRH_val_sample_90.ann,f,arthralgies,"[12.895119, -16.360558, -16.207384, -20.181244...",etatsosy
21658,CRH_val_sample_337.ann,m,fébrile,"[-7.7521305, -20.864697, 5.5983663, -30.150362...",etatsosy
1787,CRH_val_sample_4.ann,f,syndrome néphrotique,"[6.8223667, 23.82425, -10.157961, -10.0728445,...",urogen
1310,n_CRH_val_sample_17.ann,m,lupus systémique,"[1.6046201, 15.148919, -20.079283, -1.8821199,...",peau
23514,CRH_val_sample_176.ann,m,ulcération active,"[20.367083, 7.6066194, -33.157093, -49.560497,...",etatsosy
19519,CRH_val_sample_245.ann,f,claudication des membres,"[10.340494, 2.0610487, -23.225489, -33.610744,...",etatsosy
15556,CRH_val_sample_71.ann,f,lupus diagnostiqué en 2000 avec atteinte artic...,"[2.5664775, -3.7606556, 10.448557, -12.96499, ...",urogen
5519,n_CRH_val_sample_74.ann,f,calcinose sous-cutanée,"[7.820586, -22.966867, -38.59979, -30.754332, ...",peau
16070,CRH_val_sample_304.ann,m,altération de l'état général,"[-10.539443, 12.294082, -9.516028, -26.595072,...",etatsosy
23269,CRH_val_sample_332.ann,m,avc ischémique,"[2.2781281, 2.2560482, -20.068064, -1.9047202,...",cardiovasculaires


In [20]:
# add these BERT embeddings to our dataFrame 
# empty list of list to "append" each values
data = []
for i in range(len(df.values)):
    data.append([df.index[i], df.values[i,0],df.values[i,1],df.values[i,2],
                 df.values[i,3],
               BERT_embeddings[i],df.values[i,4]])
    
my_df = pd.DataFrame(data)
my_df.columns = ['idx','source', 'gender','term', 'fastext_embeddings', 'EDS_BERT_embeddings',
                 'label']

In [22]:
my_df[my_df.term == "ostéoporose"]

Unnamed: 0,idx,source,gender,term,fastext_embeddings,EDS_BERT_embeddings,label
649,649,n_CRH_val_sample_19.ann,f,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",nutritionnelles
650,650,n_CRH_val_sample_19.ann,f,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",osteomusculaires
717,717,n_CRH_val_sample_19.ann,f,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",nutritionnelles
718,718,n_CRH_val_sample_19.ann,f,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",osteomusculaires
1890,1890,CRH_val_sample_74.ann,f,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",nutritionnelles
...,...,...,...,...,...,...,...
20696,20696,CRH_val_sample_253.ann,m,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",osteomusculaires
22114,22114,CRH_val_sample_330.ann,f,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",nutritionnelles
22115,22115,CRH_val_sample_330.ann,f,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",osteomusculaires
23560,23560,CRH_val_sample_176.ann,m,ostéoporose,"[-25.041595, 2.1417432, -40.796547, -54.253117...","[-0.1463593, 0.57539636, 0.19702095, -0.132929...",nutritionnelles


In [23]:
# save 
my_df.to_pickle(OUT_FILE)

In [26]:
my_df[my_df['term']=='ostéoporose'].EDS_BERT_embeddings

649      [-0.1463593, 0.57539636, 0.19702095, -0.132929...
650      [-0.1463593, 0.57539636, 0.19702095, -0.132929...
717      [-0.1463593, 0.57539636, 0.19702095, -0.132929...
718      [-0.1463593, 0.57539636, 0.19702095, -0.132929...
1890     [-0.1463593, 0.57539636, 0.19702095, -0.132929...
                               ...                        
20696    [-0.1463593, 0.57539636, 0.19702095, -0.132929...
22114    [-0.1463593, 0.57539636, 0.19702095, -0.132929...
22115    [-0.1463593, 0.57539636, 0.19702095, -0.132929...
23560    [-0.1463593, 0.57539636, 0.19702095, -0.132929...
23561    [-0.1463593, 0.57539636, 0.19702095, -0.132929...
Name: EDS_BERT_embeddings, Length: 66, dtype: object

In [None]:
### TUTO BERT EMBEDDINGS
layer_i = 0
print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0
print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0
print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
# `hidden_states
print('      Type of hidden_states: ', type(hidden_states))
# Each layer in the list is a torch tensor.
print('Tensor shape for each layer: ', hidden_states[0].size())
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
token_embeddings = torch.stack(hidden_states, dim=0)
token_embeddings.size()
# `hidden_states` has shape [25x 1906 x 83 x 1024]
# To get a single vector for our entire sentence we have 
# multiple application-dependent strategies, 
# but a simple approach is to average the second to last 
# hiden layer of each token producing a single 768 length vector.

token_vecs = hidden_states[-2:][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=1)

token_vecs.size(), sentence_embedding.size()
for i, token_str in enumerate(train_tokenizer_texts):
    print (i, token_str)
