# BERT multilabel sequence classification 

- Model : CamembertForSequenceClassification

- Input : should be an extracted entity from clinical text
- Output : one label or more from MeSH labels

- Training set : mesh_term_dataset, UMLS dataset, local dataset etc... 



In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, fbeta_score, precision_score, recall_score, accuracy_score
from datetime import datetime

os.environ['CUDA_VISIBLE_DEVICES'] = "0" # ou un autre numéro, d'après ce qui est dispo en faisant nvidia-sm

import re

# Camembert version :
from transformers import CamembertTokenizer, CamembertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# multilingual version : 
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

import torch
from torch.utils.data import Dataset, TensorDataset, RandomSampler, SequentialSampler, DataLoader
from torch.nn import BCEWithLogitsLoss, Sigmoid


from sklearn.model_selection import train_test_split
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device',device)

# the model : 
bert_name = "/export/home/cse200093/camembert-large/"
# bert_name = "/export/home/cse200093/bert-base-multilingual-cased/"
lower = True
run_name = 'expe_pheno_classifier_camembert_l_umls_fr_local'

batch_size = 32

max_epochs = 50

# validation_mode = 'deft_only' 
# validation_mode = 'random_with_deft_distribution'
validation_mode = 'random'

threshold_mode = 'single'
#threshold_mode = 'per_class'

assert validation_mode in ('deft_only', 'random', 'random_with_deft_distribution')
assert threshold_mode in ('single', 'per_class')

Max_len = 128 # just to reminder : 13 on 9383 samples are larger than 128.
val_rate = .2
debug = 0


training_file = "/export/home/cse200093/Expe_Pheno/resources/08022022_classifier_local_train_val.csv"
test_file = '/export/home/cse200093/Expe_Pheno/resources/15_02_2022_CRH_VAL_PHENO_pred.csv'



device cuda


In [4]:
# load the training file 
df_mesh = pd.read_csv(training_file)

# df_train = df_mesh[(df_mesh.source == "DEFT-train") | (df_mesh.source == "Mesh-FR")]

# Option 1 : try to get all the DEFT term for the validation test
df_train = df_mesh 

# print('df_val', df_val.shape)


# load the test file, i.e. the sosy and path predicted entities from Pyner output
df_test = pd.read_csv(test_file)

if debug >= 2:
    df_train = df_train.sample(100)
    max_epochs = 5
elif debug == 1:
    df_train = df_train.sample(5000)
    
    max_epochs = 5
    
print('df_train', df_train.shape)
print('df_test', df_test.shape)

df_train (60690, 4)
df_test (18818, 3)


In [5]:
df_gr = df_train.groupby(['term','source'])

In [6]:
df_train.sample(5)

Unnamed: 0.1,Unnamed: 0,term,label,source
4394,4394,Aphasie de conduction,nerveux,Mesh-FR
16301,16301,Canal auriculoventriculaire commun,cardiovasculaires,CUI_syn_FR
43049,43049,rétinopathie diabetique débutante,cardiovasculaires,CRH_scl_sample_44.ann
20123,20123,Hypertélorisme,genetique,CUI_syn_FR
6658,6658,Ectopie du cristallin,genetique,Mesh-FR


In [8]:
def parse_all_thresholds(pred_labels, true_bools, criterion=f1_score):
    all_results = []
    macro_thresholds = np.array(range(1,100))/100
    for th in macro_thresholds:
        pred_bools = [pl>th for pl in pred_labels]
        best_value = criterion(true_bools,pred_bools,average='micro')
        #test_flat_accuracy = accuracy_score(true_bools, pred_bools)
        all_results.append(best_value)
        #flat_acc_results.append(test_flat_accuracy)
        #print(th, 
        #      test_f1_accuracy, 
        #      fbeta_score(true_bools,pred_bools,beta=0.5,labels=[1],average='micro'),
        #      fbeta_score(true_bools,pred_bools,beta=0.6666,labels=[1],average='micro'),
        #      precision_score(true_bools,pred_bools,labels=[1],average='micro'),
        #      recall_score(true_bools,pred_bools,labels=[1],average='micro')
        #     ) 

    best_idx = np.argmax(all_results)
    best_th = macro_thresholds[best_idx] #best macro threshold value

    #micro_thresholds = (np.array(range(10))/100)+best_th #calculating micro threshold values
#
    #f1_results, flat_acc_results = [], []
    #for th in micro_thresholds:
    #    pred_bools = [pl>th for pl in pred_labels]
    #    test_f1_accuracy = f1_score(true_bools,pred_bools,labels=[1],average='micro')
    #    test_flat_accuracy = accuracy_score(true_bools, pred_bools)
    #    f1_results.append(test_f1_accuracy)
    #    flat_acc_results.append(test_flat_accuracy)
#
    #best_f1_idx = np.argmax(f1_results) #best threshold value
    #best_th = micro_thresholds[best_f1_idx]
    return best_th, all_results[best_idx]
 

def get_best_threshold(pred_labels, true_bools, threshold_mode, criterion=f1_score):
    if threshold_mode == 'per_class':
        best_thresholds = []
        best_pred_bools = np.zeros_like(pred_labels)
        for label_index in range(len(label_names)):
            these_pred_labels = pred_labels[:,label_index]
            these_true_bools = true_bools[:,label_index]

            best_th, best_value = parse_all_thresholds(these_pred_labels, these_true_bools, criterion=criterion)

            # Printing and saving classification report
            best_thresholds.append(best_th)

            best_pred_bools[:,label_index] = [pl>best_th for pl in these_pred_labels]


    elif threshold_mode == 'single':
        best_th, best_value = parse_all_thresholds(pred_labels, true_bools, criterion=criterion)
        best_pred_bools = [pl>best_th for pl in pred_labels]
        best_thresholds = [best_th] * len(label_names)
    clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=col_label)
    return best_thresholds, best_value, clf_report_optimized

In [9]:
# Pre-processing of the text :   
import unidecode

def strip(text, lower=False):
    # pattern = r"[^a-zA-z0-9\s,']"
    # text = unidecode.unidecode(text)
    text = re.sub(r'^-','',text, count = 1)
    if lower:
        return text.lower()
    else:
        return text


In [10]:
df_test['term'] = df_test['term'].apply(strip, args=(lower,))
df_test.sample(5)

Unnamed: 0.1,Unnamed: 0,term,source
12432,12432,lipodermatosclérose,CRH_val_sample_130.ann
10818,10818,grossesse,CRH_val_sample_329.ann
15007,15007,lupus systémique à début pédiatrique avec pous...,CRH_val_sample_85.ann
12098,12098,entorse de la cheville,CRH_val_sample_95.ann
17740,17740,odynophagie,NEW_CRH_val_sample_26.ann


In [11]:
def extract_validation_set_with_weights(df, label_names, weights, val_rate):
    # Copy df
    local_df = df.copy()
    # Class distribution (only approximate, because of the multilabel nature of the data)
    class_distrib = weights/weights.sum()
    # How many lines should we draw?
    val_size = int(len(df) * val_rate)
    # How many lines of each class should we draw?
    val_sizes = np.around(val_size * class_distrib).astype(int)
    df_val = None
    # Select and remove the right number of raws per class
    # (leads to an approximate distribution because of the multilabel nature of the data)
    for label_name, number in zip(label_names, val_sizes):
        label_df = local_df[local_df[label_name] > 0]
        sample = label_df.sample(number, random_state=2)
        if df_val is None:
            df_val = sample.copy()
        else:
            df_val = df_val.append(sample)
        local_df = local_df.drop(sample.index)
    return local_df, df_val



In [12]:
# fusion "homme" and "femme" label to "urogen"
df_train.replace('femme','urogen', inplace = True)
df_train.replace('homme','urogen', inplace = True)

# Preprocessing
# 1. Transform the dataFrame so that each label "nerveux", "etatsosy", "chimiques" etc... is a column
#    and all the values are 0 or 1
# 2. Apply text transformation
df_deft_train = df_train[df_train['source'].str.contains('DEFT')]    
df_deft_train = pd.get_dummies(df_deft_train, prefix = None, columns = ['label'])
df_deft_train['term'] = df_deft_train['term'].apply(strip, args=(lower,))        
df_deft_train = df_deft_train.groupby(['term']).max().reset_index()    
label_names = [l for l in df_deft_train.columns if l.startswith('label_')]    
pos_weights = torch.Tensor(df_deft_train[label_names].sum(axis=0).to_numpy()/len(df_deft_train))
pos_weights = pos_weights/pos_weights.mean()

# validate on DEFT only
if validation_mode == 'deft_only':
    df_train_train = df_train[~df_train['source'].str.contains('DEFT')]
    df_train_train = pd.get_dummies(df_train_train, prefix = None, columns = ['label'])
    df_train_train['term'] = df_train_train['term'].apply(strip, args=(lower,))        
    df_train_train = df_train_train.groupby(['term']).max().reset_index()    

    df_train_val = df_deft_train
    print(df_train_train.shape, df_train_val.shape)
    
    train_labels = df_train_train[label_names].to_numpy()
    val_labels = df_train_val[label_names].to_numpy()
    train_texts = list(df_train_train['term'])
    val_texts = list(df_train_val['term'])
    
# validate on a random sample of DEFT+terms
elif validation_mode == 'random':
    df_train = pd.get_dummies(df_train, prefix = None, columns = ['label'])
    df_train = df_train.groupby(['term']).max().reset_index()
    df_train['term'] = df_train['term'].apply(strip, args=(lower,))
    # split the train set into a training and validation dataset :
    train_sentences = list(df_train['term'])
    labels = df_train[label_names].to_numpy()
    
    train_texts, val_texts, train_labels, val_labels = train_test_split(train_sentences, 
                                                                        labels, 
                                                                        test_size=val_rate,
                                                                        random_state=2)
    print(len(train_texts), len(val_texts))
    
# validate on a random sample of DEFT+terms, with approximate DEFT class distribution
elif validation_mode == 'random_with_deft_distribution':
    df_train = pd.get_dummies(df_train, prefix = None, columns = ['label'])
    df_train = df_train.groupby(['term']).max().reset_index()
    df_train['term'] = df_train['term'].apply(strip, args=(lower,))

    df_train_train, df_train_val = extract_validation_set_with_weights(df_train, label_names, pos_weights.numpy(), val_rate)
    train_labels = df_train_train[label_names].to_numpy()
    val_labels = df_train_val[label_names].to_numpy()
    train_texts = list(df_train_train['term'])
    val_texts = list(df_train_val['term'])
else:
    raise ValueError(validation_mode)
    
col_label = [l[6:] for l in label_names]
#print(labels)
print(col_label)

24784 6197
['ORL', 'blessures', 'cardiovasculaires', 'chimiques', 'digestif', 'endocriniennes', 'etatsosy', 'genetique', 'hemopathies', 'immunitaire', 'infections', 'nerveux', 'nutritionnelles', 'oeil', 'osteomusculaires', 'parasitaires', 'peau', 'respiratoire', 'stomatognathique', 'tumeur', 'urogen', 'virales']


In [24]:
#pb = 0
#for _, raw in df_train_val.iterrows():
#    term = raw['term']
#    tr_df = df_train_train[df_train_train['term'] == term]
#    if len(tr_df) > 0:
#        #print(term, 'DEFT')
#        deft_set = set()
#        for l in label_names:
#            if raw[l] > 0:
#                deft_set.add(l)
#        #print(term, "DEFT", deft_set)
#        #print(term, 'OTHERS ', len(tr_df))
#        other_set = set()
#        i = 0
#        for _, r in tr_df.iterrows():
#            for l in label_names:
#                if r[l] > 0:
#                    assert not (i > 0 and l not in other_set), str(other_set) + " " + l
#                    other_set.add(l)
#            i += 1
#        if other_set != deft_set:
#            print(term, deft_set, ' vs ', other_set)
#            pb += 1
#print(pb)
#        #print(term, "OTHER", other_set)
#        #print('---------------')

In [13]:
df_test['term'] = df_test['term'].apply(strip, args=(lower,))

# val_texts = list(df_val['term'])
test_sentences = list(df_test['term'])

In [15]:
# define the tokenizer : 
if 'camembert' in bert_name.lower():
    tokenizer = CamembertTokenizer.from_pretrained(bert_name, do_lower_case = lower)
else:
    tokenizer = BertTokenizer.from_pretrained(bert_name, do_lower_case = lower)


# tokenization of the train, val and test dataset : 
train_tokenizer_texts = tokenizer(train_texts, return_tensors = 'pt', padding=True, truncation=True, max_length = Max_len)
val_tokenizer_texts = tokenizer(val_texts, return_tensors = 'pt', padding=True, truncation=True, max_length = Max_len)
test_tokenizer_texts = tokenizer(test_sentences, return_tensors = 'pt',padding=True, truncation=True, max_length = Max_len)


print('train_size ', train_tokenizer_texts['input_ids'].size())
print('val_size ', val_tokenizer_texts['input_ids'].size())
print('test_size ', test_tokenizer_texts['input_ids'].size())

attention_masks = train_tokenizer_texts['attention_mask']
val_attention_masks = val_tokenizer_texts['attention_mask']
test_attention_masks = test_tokenizer_texts['attention_mask']

# test_token_type_ids = test_tokenizer_texts['token_type_ids']


attention_masks[0] , val_attention_masks[0], test_attention_masks[0]
train_labels.shape, val_labels.shape

train_size  torch.Size([24784, 90])
val_size  torch.Size([6197, 121])
test_size  torch.Size([18818, 57])


((24784, 22), (6197, 22))

In [16]:
#Dataset wrapping tensors.

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_tokenizer_texts, train_labels)
val_dataset = IMDbDataset(val_tokenizer_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

num_labels = len(label_names)


In [None]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
PATIENCE = 5
left_patience = PATIENCE
best_value = 0
best_epoch = 0

now = datetime.now().strftime('%Y%m%dT%H%M%S')
model_name = now + '_' + run_name

print('device', device)

if 'camembert' in bert_name.lower():
    model = CamembertForSequenceClassification.from_pretrained(bert_name, num_labels = num_labels)
else:
    model = BertForSequenceClassification.from_pretrained(bert_name, num_labels=num_labels)

model.to(device)
pos_weights = pos_weights.to(device)

# define the parameters 
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optim = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=True)

for epoch in range(max_epochs):
    ### training step 
    print(f'EPOCH {epoch} / {max_epochs}')
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0  
    for _, batch in enumerate(tqdm(train_loader)):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        assert labels.shape[1:] == (num_labels, ), labels.shape
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        #Calculate the loss between multilabel predicted outputs and actuals
        logits = outputs[0]
        assert logits.shape[1:] == (num_labels, ), logits.shape
        loss_func = BCEWithLogitsLoss(pos_weight=pos_weights) 
        loss = loss_func(logits, labels.type_as(logits)) #convert labels to float for calculation
        # loss = loss_func(outputs.logits, labels.type_as(outputs.logits))
        # loss = outputs[0]
        loss.backward()
        optim.step()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        
    #Print the current training loss 
    print("Train Loss: {}".format(tr_loss/nb_tr_examples))
    
    
    # PREDICT : 
    model.eval()
    # Variables to gather full output
    logit_preds,true_labels,pred_labels_1,tokenized_texts = [],[],[],[]
    
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
            logit_pred = outs[0]
            pred_label = torch.sigmoid(logit_pred)
            logit_pred = logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            labels = labels.to('cpu').numpy()
            tokenized_texts.append(input_ids)
            logit_preds.append(logit_pred)
            true_labels.append(labels)
            pred_labels_1.append(pred_label)
    
    # Flatten outputs
    pred_labels = [item for sublist in pred_labels_1 for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]
    true_bools = [tl==1 for tl in true_labels]
    
    # Calculate Accuracy
    #threshold = 0.5
    #pred_bools = [pl>threshold for pl in pred_labels]
    #val_f1 = f1_score(true_bools,pred_bools,average='micro')*100
    #best_thresholds = [threshold]

    best_thresholds, val_f1, _ = get_best_threshold(pred_labels, true_bools, threshold_mode)
    check_val_value = val_f1

    print('F1 Validation Accuracy: {}      (with threshold {})'.format(val_f1, best_thresholds[0] if len(set(best_thresholds)) == 1 else best_thresholds))
    #print('Flat Validation Accuracy: ', val_flat_accuracy)
    # New best epoch
    if check_val_value > best_value:
        best_value = check_val_value
        left_patience = PATIENCE
        # saving the model 
        model.save_pretrained(model_name+ '_best')
        print(f'Best model so far... reset patience to {left_patience}')
        print(f'Saved model {model_name} after epoch {epoch}')
        best_epoch = epoch
    # Still stuck at 0 at the beginning? Be more patient
    elif best_value == 0 and epoch < 5:
        print('Still the beginning, give it some time...')
    # Not best epoch
    else:
        left_patience -= 1
        print(f'Not the best model, decreasing patience to {left_patience}')
    if left_patience == 0:
        print(f'Out of patience, break after epoch {epoch}')
        break
    
#model.save_pretrained("classifier_camembert_large_UMLS_CUI_SYN_FR_50.pt")
#print(f'Saved model {model_name} after epoch {epoch}')
print(f'Kept model {model_name} from epoch {best_epoch}')




device cuda


Some weights of the model checkpoint at /export/home/cse200093/camembert-large/ were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at /export/home/cse200093/camembert-large/ and

EPOCH 0 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:35<00:00,  1.52s/it]


Train Loss: 0.007918962425978589


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.4634424546595186      (with threshold 0.3)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 0
EPOCH 1 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:40<00:00,  1.52s/it]


Train Loss: 0.004693733084540431


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.5791610284167793      (with threshold 0.33)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 1
EPOCH 2 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:40<00:00,  1.52s/it]


Train Loss: 0.0037316963659194185


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.6754640530033401      (with threshold 0.33)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 2
EPOCH 3 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:40<00:00,  1.52s/it]


Train Loss: 0.0031141571657283605


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.6950826599059842      (with threshold 0.28)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 3
EPOCH 4 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:40<00:00,  1.52s/it]


Train Loss: 0.0026835007617085345


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.7293914544669832      (with threshold 0.33)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 4
EPOCH 5 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:45<00:00,  1.53s/it]


Train Loss: 0.0023120659163908333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.7422988628476856      (with threshold 0.34)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 5
EPOCH 6 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:43<00:00,  1.53s/it]


Train Loss: 0.0020410095448524547


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.7576619273301738      (with threshold 0.4)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 6
EPOCH 7 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:40<00:00,  1.52s/it]


Train Loss: 0.0017890146706677768


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.7753608247422681      (with threshold 0.28)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 7
EPOCH 8 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:42<00:00,  1.53s/it]


Train Loss: 0.0015505035755809533


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.7901695442526054      (with threshold 0.32)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 8
EPOCH 9 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:35<00:00,  1.52s/it]


Train Loss: 0.0013842995962749726


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.7947804213394822      (with threshold 0.39)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 9
EPOCH 10 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:37<00:00,  1.52s/it]


Train Loss: 0.001245468222962126


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8036091177712115      (with threshold 0.4)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 10
EPOCH 11 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:40<00:00,  1.52s/it]


Train Loss: 0.0011190374642553432


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8086298101856655      (with threshold 0.28)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 11
EPOCH 12 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:38<00:00,  1.52s/it]


Train Loss: 0.0009901169982869263


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8117159149818559      (with threshold 0.35)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 12
EPOCH 13 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:42<00:00,  1.53s/it]


Train Loss: 0.0009188268442197508


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8205495573862764      (with threshold 0.29)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 13
EPOCH 14 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:42<00:00,  1.53s/it]


Train Loss: 0.0008194571605350237


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8227254181966545      (with threshold 0.28)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 14
EPOCH 15 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:39<00:00,  1.52s/it]


Train Loss: 0.0007418799964113436


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8234036052551176      (with threshold 0.28)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 15
EPOCH 16 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:38<00:00,  1.52s/it]


Train Loss: 0.0006766601433062697


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8306305372778613      (with threshold 0.37)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 16
EPOCH 17 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:44<00:00,  1.53s/it]


Train Loss: 0.0006307441741378414


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8354153653966271      (with threshold 0.41)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 17
EPOCH 18 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:43<00:00,  1.53s/it]


Train Loss: 0.0005792416808977928


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  # Remove the CWD from sys.path while we load stuff.


F1 Validation Accuracy: 0.833273130708499      (with threshold 0.44)
Not the best model, decreasing patience to 4
EPOCH 19 / 50


100%|██████████| 775/775 [19:42<00:00,  1.53s/it]


Train Loss: 0.0005429209921472351


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8359442771852835      (with threshold 0.31)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 19
EPOCH 20 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:42<00:00,  1.53s/it]


Train Loss: 0.00047815492079972624


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.836577077540933      (with threshold 0.32)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 20
EPOCH 21 / 50


  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 775/775 [19:41<00:00,  1.53s/it]


Train Loss: 0.0004610894356864481


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


F1 Validation Accuracy: 0.8381725888324872      (with threshold 0.23)


  0%|          | 0/775 [00:00<?, ?it/s]

Best model so far... reset patience to 5
Saved model 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local after epoch 21
EPOCH 22 / 50


  # Remove the CWD from sys.path while we load stuff.
  8%|▊         | 61/775 [01:32<18:06,  1.52s/it]

In [46]:
## Calculate Accuracy - maximize F1 accuracy by tuning threshold values. First with 'macro_thresholds' on the order of e^-1 
# then with 'micro_thresholds' on the order of e^-2
import pickle

# model_name = '20210702T120040_classifier_camembert_umls_fr_en_translated'
# model_name = '20220208T153725_expe_pheno_classifier_camembert_l_umls_fr_local'
model_path = model_name + '_best'


print(f'Load model from {model_path}')
if 'camembert' in bert_name.lower():
    model = CamembertForSequenceClassification.from_pretrained(model_path, 
                                                               num_labels = num_labels)
else:    
    model = BertForSequenceClassification.from_pretrained(model_path, 
                                                          num_labels = num_labels)
    
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
    
model.eval()
# Variables to gather full output
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        logit_pred = outs[0]
        pred_label = torch.sigmoid(logit_pred)
        logit_pred = logit_pred.detach().cpu().numpy()
        pred_label = pred_label.to('cpu').numpy()
        labels = labels.to('cpu').numpy()
        tokenized_texts.extend(input_ids)
        logit_preds.extend(logit_pred)
        true_labels.extend(labels)
        pred_labels.extend(pred_label)

# Flatten outputs
#pred_labels = [item for sublist in pred_labels_1 for item in sublist]
#true_labels = [item for sublist in true_labels for item in sublist]

# pred_labels: (N x num_labels)
pred_labels = np.array(pred_labels)

true_bools = [tl==1 for tl in true_labels]
true_bools = np.array(true_bools)
print('done')

Load model from 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local_best


  # Remove the CWD from sys.path while we load stuff.


done


In [47]:
best_thresholds, best_value, clf_report_optimized = get_best_threshold(pred_labels, true_bools, threshold_mode, criterion=precision_score)
thresholds_path = f'./out/thresholds_{model_name}.csv'
print('Best Thresholds: ', best_thresholds) 
np.savetxt(thresholds_path, np.array(best_thresholds), 
              delimiter=",")
print(f'Best thresholds saved to {thresholds_path}')

classif_report_path = f'./results/classification_report_{model_name}.txt'
pickle.dump(clf_report_optimized, open(classif_report_path,'wb'))
print(f'Classification report saved to {classif_report_path}')
labels_path = f'./out/labels_{model_name}.csv'
with open(labels_path, 'w') as f_out:
    f_out.write(','.join(label_names) + '\n')
print(f'Label names saved to {labels_path}')
print(clf_report_optimized)

Best Thresholds:  [0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
Best thresholds saved to ./out/thresholds_20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local.csv
Classification report saved to ./results/classification_report_20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local.txt
Label names saved to ./out/labels_20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local.csv
                   precision    recall  f1-score   support

              ORL       0.00      0.00      0.00       121
        blessures       0.95      0.40      0.57       225
cardiovasculaires       0.94      0.74      0.83       740
        chimiques       0.97      0.82      0.89       163
         digestif       0.95      0.76      0.85       404
   endocriniennes       0.97      0.41      0.58       210
         etatsosy       0.87      0.72      0.79      1545
        genetique       0.96     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
model_name = '20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local'

model_path = model_name + '_best'
# predicting the data 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Create test dataloader
# test_inputs, test_masks, test_labels, test_token_types
test_dataset = TensorDataset(test_tokenizer_texts['input_ids'], test_tokenizer_texts['attention_mask'])
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=16)

final_outputs = []

labels_path = f'./out/labels_{model_name}.csv'
with open(labels_path, 'r') as f_out:
    label_names = f_out.readline().strip().split(',')
print(f'Label names loaded from {labels_path}')

num_labels = len(label_names)
# load the model 
model_path = model_name + '_best'
print(f'Load model from {model_path}')
if 'camembert' in bert_name.lower():
    model = CamembertForSequenceClassification.from_pretrained(model_path, 
                                                               num_labels = num_labels)
else:    
    model = BertForSequenceClassification.from_pretrained(model_name, 
                                                          num_labels = num_labels)

model.to(device)

#Iterate over the test_loader 
for step, batch in enumerate(test_dataloader):
    #Transfer batch to GPUs --> not adapted for EDS computation
    batch = tuple(t.to(device) for t in batch)
    #We dont need to update gradients as we are just predicting
    with torch.no_grad():
        #Bring up the next batch of input_texts and attention_masks 
        b_input_ids, b_input_mask = batch
        #Forward propogate the inputs and get output as logits
        outputs = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)
        #Pass the outputs through a sigmoid function to get the multi-label predictions
        s = Sigmoid()
        out = s(outputs.logits).to('cpu').numpy()    
        #Add the predictions for this batch to the final list
        final_outputs.extend(out)
            
#Merge test df and submission table to have all columns in a table
# df_test = pd.merge(df_test, sample_submission, on = "id")
#Assign the predictions to the labels columns
df_test[label_names] = final_outputs


# Saving the dataframe
result_path = f'./out/final_CRH_VAL_PHENO_pred_{model_name}.csv'
df_test.to_csv(result_path, index = False)
print(f'Results save to {result_path}')

Label names loaded from ./out/labels_20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local.csv
Load model from 20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local_best
Results save to ./out/final_CRH_VAL_PHENO_pred_20220209T092729_expe_pheno_classifier_camembert_l_umls_fr_local.csv


In [18]:
len(df_test)

18818

In [19]:
df_check = df_test.groupby('source')['term'].nunique()

df_check

source
CRH_val_sample_0.ann          25
CRH_val_sample_1.ann          52
CRH_val_sample_10.ann         82
CRH_val_sample_100.ann        43
CRH_val_sample_101.ann        29
                            ... 
NEW_CRH_val_sample_9.ann      38
NEW_CRH_val_sample_90.ann    120
NEW_CRH_val_sample_92.ann     59
NEW_CRH_val_sample_97.ann     48
NEW_CRH_val_sample_99.ann     48
Name: term, Length: 256, dtype: int64