Train a BERT model for sequence classification

In [108]:
import pickle

import numpy as np
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

import random


In [109]:
# Pickled DataFrame to train on. Other candidate frames (swap the path below
# to use one of them):
#   pickles/claim_type_frame_bt_et.p
#   pickles/claim_type_frame_bf_ef.p
#   pickles/claim_type_frame_bf_et.p
#   pickles/match_bert_frame_et_small.p
#   pickles/match_bert_frame_ef_small.p
#   pickles/match_bert_frame_et_big.p
#   pickles/match_bert_frame_ef_big.p
frame_path = "pickles/match_bert_frame_et_micro.p"

# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source.
# Use a context manager so the file handle is closed as soon as the frame is loaded.
with open(frame_path, "rb") as frame_file:
    claim_type_frame = pickle.load(frame_file)


In [110]:
# Display the loaded frame for a quick visual check of its columns and rows.
claim_type_frame


Unnamed: 0,Text_prem,Text_sup,Label,text
10,Not only because I 'm constantly astonished ab...,And yet now I 'll actually have all the more t...,1,Not only because I 'm constantly astonished ab...
6,The doctor was able to sign him off as fit for...,but that would only make the injury worse in t...,1,The doctor was able to sign him off as fit for...
9,In a few decades our society will have a dispr...,Although having children has become more fashi...,1,In a few decades our society will have a dispr...
0,"Having said that , state funds are often waste...",so that this ideal appears unworthy of support...,0,"Having said that , state funds are often waste..."
2,Plus he comes to training way too rarely,although he plays very well without training,1,Plus he comes to training way too rarely altho...
...,...,...,...,...
2,People are getting older on average,"No , the retirement age should be raised to 65...",0,"People are getting older on average No , the r..."
6,and surely you 'll want to be speeding a lot w...,The steering system may still need a bit of tw...,0,and surely you 'll want to be speeding a lot w...
3,Also streets and shops would presumably be,Supermarkets and shopping centres should be al...,0,Also streets and shops would presumably be Sup...
0,"Besides , a higher degree of luxury ( expensiv...",A stop must be put to exorbitant rents by impo...,0,"Besides , a higher degree of luxury ( expensiv..."


In [111]:
# Rename the raw 'Label' column to 'Values' and move the old index into a
# regular column, so that every row is addressed by a fresh positional index.
claim_type_frame = (
    claim_type_frame
    .rename(columns={'Label': 'Values'})
    .reset_index()
)

# Number of rows per raw label value.
claim_type_frame["Values"].value_counts()

1    464
0    464
Name: Values, dtype: int64

In [112]:
# Map every distinct raw label to an integer id. Ids are handed out in the
# order in which unique() returns the labels, so an id does not have to be
# equal to the raw label value it stands for.
possible_labels = claim_type_frame['Values'].unique()
label_dict = {raw_label: label_id for label_id, raw_label in enumerate(possible_labels)}

# Integer training target derived from the raw labels.
claim_type_frame['label'] = claim_type_frame['Values'].replace(label_dict)

In [113]:
# Stratified 85/15 train/validation split. The row index (not the text) is
# split, so the assignment can be written back onto the frame below.
X_train, X_val, y_train, y_val = train_test_split(
    claim_type_frame.index.values,
    claim_type_frame.label.values,
    test_size=0.15,
    random_state=42,
    stratify=claim_type_frame.label.values,
)

# Tag every row with the split it belongs to; 'not_set' remains only on rows
# that were assigned to neither split.
claim_type_frame['data_type'] = 'not_set'
claim_type_frame.loc[X_train, 'data_type'] = 'train'
claim_type_frame.loc[X_val, 'data_type'] = 'val'

# Row counts per (label, split) combination.
claim_type_frame.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,index,Text_prem,Text_sup,Values,text
label,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,train,394,394,394,394,394
0,val,70,70,70,70,70
1,train,394,394,394,394,394
1,val,70,70,70,70,70


In [114]:
# Total number of rows assigned to each split.
claim_type_frame["data_type"].value_counts()

train    788
val      140
Name: data_type, dtype: int64

In [115]:
# Tokenize both splits with the pretrained uncased BERT tokenizer and wrap the
# resulting tensors in TensorDatasets.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def _encode_split(split_name):
    """Tokenize the rows of one split and collect their labels.

    Args:
        split_name: Value of the ``data_type`` column that selects the rows,
            i.e. ``'train'`` or ``'val'``.

    Returns:
        Tuple ``(encoded, labels)``: the tokenizer output for the ``text``
        column of the selected rows (PyTorch tensors, padded to the longest
        sequence of the split) and a tensor built from their ``label`` column.
    """
    # Filter once so texts and labels are guaranteed to come from the same rows.
    split_frame = claim_type_frame[claim_type_frame.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        # Pass a plain list of strings rather than a NumPy array, which is the
        # batch input type the tokenizer API is documented to accept.
        list(split_frame.text),
        add_special_tokens=True,
        padding="longest",
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    labels = torch.tensor(split_frame.label.values)
    return encoded, labels


encoded_data_train, labels_train = _encode_split('train')
encoded_data_val, labels_val = _encode_split('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [116]:
# Seed every random number generator BEFORE the model is created: the
# classification head is not part of the pretrained checkpoint and is
# initialised randomly, so seeding only right before training would leave the
# starting weights different on every run.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Pretrained BERT encoder with a sequence-classification head that has one
# output per entry of label_dict (each sample gets exactly one class).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Data loaders: training batches are drawn in random order, validation batches
# in their fixed dataset order.
batch_size = 3

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

# Optimizer over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8)

epochs = 5

# Learning-rate schedule without warmup; the total step count is one optimizer
# step per training batch per epoch.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [117]:
def f1_score_func(preds, labels):
    """Weighted F1 score of the arg-max predictions.

    Args:
        preds: Array of per-class scores; the predicted class of a sample is
            the arg-max along axis 1.
        labels: Array of true class ids.

    Returns:
        The F1 score computed with ``average='weighted'``.
    """
    predicted_ids = np.argmax(preds, axis=1).flatten()
    true_ids = labels.flatten()
    return f1_score(true_ids, predicted_ids, average='weighted')

def accuracy_per_class(preds, labels):
    """Print the number of correctly predicted samples for every class.

    For each class id that occurs in ``labels`` the raw label (looked up via
    the inverse of the module-level ``label_dict``) and the ratio
    ``correct/total`` are printed.

    Args:
        preds: Array of per-class scores; the predicted class of a sample is
            the arg-max along axis 1.
        labels: Array of true class ids.
    """
    id_to_label = {label_id: raw_label for raw_label, label_id in label_dict.items()}

    predicted_ids = np.argmax(preds, axis=1).flatten()
    true_ids = labels.flatten()

    for class_id in np.unique(true_ids):
        # Restrict both arrays to the samples whose true class is class_id.
        in_class = true_ids == class_id
        class_preds = predicted_ids[in_class]
        n_correct = len(class_preds[class_preds == class_id])
        n_total = len(true_ids[in_class])
        print(f'Class: {id_to_label[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [118]:
# Fix the seed of every random number generator in use (Python, NumPy,
# PyTorch CPU and all CUDA devices) right before training starts.
seed_val = 17
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

def evaluate(dataloader_val):
    """Run the module-level ``model`` over a data loader without gradients.

    The model is switched to evaluation mode and stays in that mode when the
    function returns.

    Args:
        dataloader_val: Iterable of batches; every batch is indexed as
            ``batch[0]`` = input ids, ``batch[1]`` = attention mask and
            ``batch[2]`` = labels.

    Returns:
        Tuple ``(loss_val_avg, predictions, true_vals)``: the summed batch
        losses divided by ``len(dataloader_val)``, the logits of all batches
        and the labels of all batches, both concatenated along axis 0 as
        NumPy arrays.
    """
    model.eval()

    batch_losses = []
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:
        # Only the forward pass needs to be shielded from gradient tracking.
        with torch.no_grad():
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            labels=batch[2])

        # With labels supplied, the first output is the loss and the second the logits.
        batch_losses.append(outputs[0].item())
        logits_per_batch.append(outputs[1].detach().cpu().numpy())
        labels_per_batch.append(batch[2].cpu().numpy())

    loss_val_avg = sum(batch_losses) / len(dataloader_val)

    return (
        loss_val_avg,
        np.concatenate(logits_per_batch, axis=0),
        np.concatenate(labels_per_batch, axis=0),
    )
    
# Fine-tune for `epochs` epochs. After every epoch the weights are saved and
# the average training loss, validation loss and weighted F1 are reported.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear the gradients accumulated by the previous step.
        model.zero_grad()

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output of the model is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Report the batch loss as is: the loss returned by the model is
        # already averaged over the samples of the batch, and len(batch) is
        # the number of tensors in the batch (ids, mask, labels), not the
        # number of samples, so dividing by it would distort the value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    # One checkpoint per epoch, written to the current working directory.
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.6618010049310474
Validation loss: 0.5729419027871274
F1 Score (Weighted): 0.7059275577642298


Epoch 2:   0%|          | 0/263 [00:00<?, ?it/s]

In [None]:
# Rebuild the architecture, then overwrite its weights with a stored
# fine-tuned checkpoint (mapped onto the CPU) and report per-class accuracy on
# the validation set.
# NOTE(review): this checkpoint is read from 'Modelle/' and is named for
# epoch 8, whereas the training loop in this notebook writes
# 'finetuned_BERT_epoch_{epoch}.model' into the working directory -- confirm
# which run produced it and that it used the same label mapping.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

finetuned_weights = torch.load('Modelle/finetuned_BERT_epoch_8_frame_1_b8.model',
                               map_location=torch.device('cpu'))
model.load_state_dict(finetuned_weights)

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Class: 1
Accuracy: 54/70

Class: 0
Accuracy: 31/70

