In [1]:
!pip install Sentencepiece
!pip install transformers

Collecting Sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 19.5 MB/s eta 0:00:01[K     |▌                               | 20 kB 21.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 14.7 MB/s eta 0:00:01[K     |█                               | 40 kB 10.6 MB/s eta 0:00:01[K     |█▍                              | 51 kB 10.4 MB/s eta 0:00:01[K     |█▋                              | 61 kB 9.7 MB/s eta 0:00:01[K     |██                              | 71 kB 8.4 MB/s eta 0:00:01[K     |██▏                             | 81 kB 9.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 8.9 MB/s eta 0:00:01[K     |██▊                             | 102 kB 9.2 MB/s eta 0:00:01[K     |███                             | 112 kB 9.2 MB/s eta 0:00:01[K     |███▎                            | 122 kB 9.2 MB/s eta 0:00:01[K     |███▌     

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BigBirdTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BigBirdForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import nltk

In [3]:
from google.colab import drive
drive._mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
os.chdir('drive/MyDrive/machine_learning')

In [5]:
# # load dataset

data_train = pd.read_csv('dataset_binary_train.csv')[0:128]
data_test = pd.read_csv('dataset_binary_test.csv')[0:128]

X_train, y_train = data_train.data.tolist(), data_train.label.tolist()
X_test, y_test = data_test.data.tolist(), data_test.label.tolist()

print('Train dataset length: {}'.format(len(X_train)))
print('Test dataset length: {}'.format(len(X_test)))

Train dataset length: 128
Test dataset length: 128


In [6]:
class textCNN(nn.Module):
    def __init__(self, inplane=1, input_dim=768, num_conv=3, conv_size=[2,3,4], dropout_prob=0, dim_output=2):
        super(textCNN, self).__init__()
        
        D_words = input_dim # dimension of word embedding
        self.convs = nn.ModuleList([nn.Conv2d(inplane,num_conv,(K,input_dim)) for K in conv_size]) ## list of convolutional layers
        self.dropout = nn.Dropout(dropout_prob) 
        self.fc = nn.Linear(len(conv_size)*num_conv, dim_output) 
        
    def forward(self,x):
        #x.size = (batch_size, sequence_length, word_embedding)
        
        x = x.unsqueeze(1) #(N,C,W,D) (C=1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # len(conv_size)*(N,num_conv,W)
        x = [F.max_pool1d(line,line.size(2)).squeeze(2) for line in x]  # len(conv_size)*(N,num_conv)
        
        x = torch.cat(x,1) #(N,num_conv*len(conv_size))
        x = self.dropout(x)
        logit = self.fc(x)
        return logit

In [7]:
class LSTM_attention(nn.Module):
    def __init__(self, input_dim=768, hidden_size=256, num_layers=1, dim_output=2, bi_directional=True):
        super(LSTM_attention, self).__init__()
        
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bi_directional, bias=True)
        self.fc = nn.Linear((int(bi_directional)+1) * hidden_size, dim_output)

    def attention_layer(self,lstm_output, final_state):
        # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
        # final_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        batch_size = len(lstm_output)
        hidden = torch.cat((final_state[0], final_state[1]), dim=1).unsqueeze(2)
        # hidden : [batch_size, n_hidden * num_directions(=2), n_layer(=1)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        # attn_weights : [batch_size, n_step]
        soft_attn_weights = F.softmax(attn_weights,1)

        # context: [batch_size, n_hidden * num_directions(=2)]
        context = torch.bmm(lstm_output.transpose(1,2),soft_attn_weights.unsqueeze(2)).squeeze(2)

        return context, soft_attn_weights

    def forward(self, inputs):
        output, (final_hidden_state, final_cell_state) = self.lstm(inputs.permute(1, 0, 2))
        atten_output, attention = self.attention_layer(output.permute(1, 0, 2), final_hidden_state)
        output = self.fc(atten_output)
        
        return output

In [8]:
class Transformer:
    def __init__(self, model_name, num_labels=2, **kwargs):
        super(Transformer, self).__init__()
    
        if model_name == 'BERT':
            self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = num_labels, output_attentions = False, output_hidden_states = True)
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        elif model_name == 'GPT2':
            self.model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels = num_labels, output_attentions = False, output_hidden_states = True)
            self.model.config.pad_token_id = self.model.config.eos_token_id
            self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
            self.tokenizer.pad_token = self.tokenizer.eos_token 
        elif model_name == 'BIGBIRD':
            self.model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base', num_labels = num_labels, output_attentions = False, output_hidden_states = True)
            self.tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base', do_lower_case=True)

    def preprocess_data(self, X_train, X_test, y_train, y_test):

        for mode in ['train', 'test']:

            sample_ids = []
            attention_masks = []

            samples = X_train if mode == 'train' else X_test
            labels = y_train if mode == 'train' else y_test
            length = len(samples)

            for i, sent in enumerate(samples):
                encoded_dict = self.tokenizer.encode_plus(sent, add_special_tokens = True, max_length = 100, truncation = True, \
                                                  padding = 'max_length', return_attention_mask = True, return_tensors = 'pt')

                # Add the encoded sample and mask 
                sample_ids.append(encoded_dict['input_ids'])
                attention_masks.append(encoded_dict['attention_mask'])
                print('\r----- Processing {}/{} {} samples'.format(i+1, length, mode), flush=True, end='')

            # Convert to pytorch tensors.
            sample_ids = torch.cat(sample_ids, dim=0)
            attention_masks = torch.cat(attention_masks, dim=0)
            labels = torch.tensor(labels)

            if mode == 'train': train_dataset = TensorDataset(sample_ids, attention_masks, labels)
            else: test_dataset = TensorDataset(sample_ids, attention_masks, labels)
        print('\n')
        
        return train_dataset, test_dataset

In [9]:
def train_val(train_dataset, test_dataset, transformer_name, transformer, classifier_name=None, classifier=None, lr_transformer=3e-5, lr_classifier=1e-3, batch_size=64, max_epoch=5):
    # create dataloader for tensor dataset
    train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
    val_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size)
    
    # define device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # use cuda for transformer
    transformer = transformer.model.to(device)
    
    # define models
    if classifier is not None:
        classifier = classifier.to(device)
        optimizer = torch.optim.Adam([{"params": classifier.parameters(), 'lr': lr_classifier}])
        for p in transformer.parameters(): # freeze the layers of transformer
            p.requires_grad = False
    else:
        optimizer = torch.optim.Adam(transformer.parameters(), lr = lr_transformer) # the learning rate is suggested by the authors

        for p in transformer.parameters():
            p.requires_grad = True

    
    # Hyper-parameters
    max_epoch = 5
    n_batch = int(len(train_dataset)/batch_size)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(train_dataloader) * max_epoch)
    criterion = F.cross_entropy
    
    # clean memory in GPU
    torch.cuda.empty_cache()
    
    # a list to record the state of training
    training_stats = []

    opt_val_acc = 0 # initialize the optimal test accuracy to be 0
            
    print('Training start!')
    for e in range(max_epoch):
        
        # train model
        # model.train()
        if classifier is not None:
            classifier.train()
        else:
            transformer.train()
        
        epoch_loss = 0
        train_acc = 0
        
        for b, (x_id, x_mask, y) in enumerate(train_dataloader):
            x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)
            
            optimizer.zero_grad()
            
            if classifier is not None:
                with torch.no_grad():
                    word_embedding = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)['hidden_states'][-1]   
                logits = classifier(word_embedding)
                loss = criterion(logits, y)
            else:
                outputs = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
                loss, logits = outputs['loss'], outputs['logits']


            epoch_loss += loss
            train_acc += (logits.max(1)[1] == y).float().mean().item()
            
            loss.backward()
            
            #clip gradient
            if classifier is None:
                torch.nn.utils.clip_grad_norm_(transformer.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            print("\rEpoch: {:d} batch: {:d} / {} loss: {:.4f} | {:.2%}".format(e + 1, b, n_batch, loss, b*1.0/n_batch), end='', flush=True)
        print("\n----- Epoch {} ------\nTraining loss: {}".format(e+1, epoch_loss / len(train_dataloader)))
        print("Training accuracy: {}".format(train_acc / len(train_dataloader)))

        
        # evaluate model
        if classifier is not None:
            classifier.eval()
        transformer.eval()
        
        eval_acc = 0
        eval_loss = 0
        nb_eval_steps = 0
        
        for b, (x_id, x_mask, y) in enumerate(val_dataloader):
            x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)
            
            with torch.no_grad():
                if classifier is not None:
                    word_embedding = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)['hidden_states'][-1]   
                    logits = classifier(word_embedding)
                    loss = criterion(logits, y)
                else:
                    outputs = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
                    loss, logits = outputs['loss'], outputs['logits']
            
            eval_loss += loss
            eval_acc += (logits.max(1)[1] == y).float().mean().item()

        print("Validation loss: {}".format(eval_loss / len(val_dataloader)))
        print("Validation accuracy: {}".format(eval_acc / len(val_dataloader)))
        print("\n")
        
        training_stats.append(
            {
                'epoch': e+1,
                'train_loss': epoch_loss / len(train_dataloader),
                'train_acc': train_acc / len(train_dataloader),
                'val_loss': eval_loss / len(val_dataloader),
                'val_acc': eval_acc / len(val_dataloader),
            }
        )
        
        # save models
        # only save the model if the test accuracy is improved
        if  eval_acc / len(val_dataloader) >= opt_val_acc:
            opt_val_acc = eval_acc / len(val_dataloader)
            if classifier is not None:
                torch.save(classifier, '{}-{}.pkl'.format(transformer_name, classifier_name))
            else:
                torch.save(transformer, '{}.pkl'.format(transformer_name))
        
        # save states of training
        np.save('{}-{}-train_stats_Epoch{}.npy'.format(transformer_name, classifier_name, e+1), training_stats) 

    print('Training complete!')

**Fine-tune BERT**

In [None]:
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samplesTraining start!
Epoch: 1 batch: 1 / 2 loss: 0.7114 | 50.00%
----- Epoch 1 ------
Training loss: 0.6891252994537354
Training accuracy: 0.578125
Validation loss: 0.738847017288208
Validation accuracy: 0.4609375




**Fine-tune GPT2**

In [None]:
transformer = Transformer('GPT2')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='GPT2', transformer=transformer)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

----- Processing 128/128 test samplesTraining start!
Epoch: 1 batch: 1 / 2 loss: 1.0671 | 50.00%
----- Epoch 1 ------
Training loss: 1.1728076934814453
Training accuracy: 0.578125
Validation loss: 1.2852586507797241
Validation accuracy: 0.4609375


Epoch: 2 batch: 1 / 2 loss: 0.7968 | 50.00%
----- Epoch 2 ------
Training loss: 0.906671404838562
Training accuracy: 0.5703125
Validation loss: 1.088007926940918
Validation accuracy: 0.4609375


Epoch: 3 batch: 1 / 2 loss: 0.6855 | 50.00%
----- Epoch 3 ------
Training loss: 0.7980347871780396
Training accuracy: 0.59375
Validation loss: 0.9689717292785645
Validation accuracy: 0.453125


Epoch: 4 batch: 1 / 2 loss: 0.7227 | 50.00%
----- Epoch 4 ------
Training loss: 0.7135763168334961
Training accuracy: 0.59375
Validation loss: 0.9084379076957703
Validation accuracy: 0.453125


Epoch: 5 batch: 1 / 2 loss: 0.6443 | 50.00%
----- Epoch 5 ------
Training loss: 0.6510866284370422
Training accuracy: 0.625
Validation loss: 0.8848785161972046
Validati

**Fine-tune BIGBIRD**

In [None]:
transformer = Transformer('BIGBIRD')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BIGBIRD', transformer=transformer)

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

----- Processing 128/128 test samples

Attention type 'block_sparse' is not possible if sequence_length: 100 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Training start!
Epoch: 1 batch: 1 / 2 loss: 0.7024 | 50.00%
----- Epoch 1 ------
Training loss: 0.6993421316146851
Training accuracy: 0.4921875
Validation loss: 0.7071637511253357
Validation accuracy: 0.5234375


Epoch: 2 batch: 1 / 2 loss: 0.6656 | 50.00%
----- Epoch 2 ------
Training loss: 0.6735682487487793
Training accuracy: 0.609375
Validation loss: 0.7331173419952393
Validation accuracy: 0.4609375


Epoch: 3 batch: 1 / 2 loss: 0.6634 | 50.00%
----- Epoch 3 ------
Training loss: 0.679786205291748
Training accuracy: 0.5703125
Validation loss: 0.7401235103607178
Validation accuracy: 0.4609375


Epoch: 4 batch: 1 / 2 loss: 0.6529 | 50.00%
----- Epoch 4 ------
Training loss: 0.6797059178352356
Training accuracy: 0.59375
Validation loss: 0.7393447160720825
Validation accuracy: 0.4609375


Epoch: 5 batch: 1 / 2 loss: 0.6484 | 50.00%
----- Epoch 5 ------
Training loss: 0.6630358695983887
Training accuracy: 0.5859375
Validation loss: 0.739662230014801
Validation accuracy: 0.4609375


Trai

**BERT + BiLSTM**

In [None]:
lstm = LSTM_attention()
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer, classifier_name='BiLSTM', classifier=lstm)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

----- Processing 128/128 test samplesTraining start!
Epoch: 1 batch: 1 / 2 loss: 0.6629 | 50.00%
----- Epoch 1 ------
Training loss: 0.6804267168045044
Training accuracy: 0.5703125
Validation loss: 0.7386375665664673
Validation accuracy: 0.4609375


Epoch: 2 batch: 1 / 2 loss: 0.5724 | 50.00%
----- Epoch 2 ------
Training loss: 0.608741044998169
Training accuracy: 0.65625
Validation loss: 0.6431618928909302
Validation accuracy: 0.6640625


Epoch: 3 batch: 1 / 2 loss: 0.5315 | 50.00%
----- Epoch 3 ------
Training loss: 0.5375389456748962
Training accuracy: 0.84375
Validation loss: 0.6325643062591553
Validation accuracy: 0.625


Epoch: 4 batch: 1 / 2 loss: 0.4714 | 50.00%
----- Epoch 4 ------
Training loss: 0.4598848819732666
Training accuracy: 0.875
Validation loss: 0.6059979200363159
Validation accuracy: 0.703125


Epoch: 5 batch: 1 / 2 loss: 0.3688 | 50.00%
----- Epoch 5 ------
Training loss: 0.3946110010147095
Training accuracy: 0.859375
Validation loss: 0.5971330404281616
Validation

**BERT + TextCNN**

In [None]:
textcnn = textCNN()
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer, classifier_name='TextCNN', classifier=textcnn)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samplesTraining start!
Epoch: 1 batch: 1 / 2 loss: 0.6764 | 50.00%
----- Epoch 1 ------
Training loss: 0.6851967573165894
Training accuracy: 0.5234375
Validation loss: 0.7435269355773926
Validation accuracy: 0.4609375


Epoch: 2 batch: 1 / 2 loss: 0.6012 | 50.00%
----- Epoch 2 ------
Training loss: 0.6101951599121094
Training accuracy: 0.6015625
Validation loss: 0.7028031349182129
Validation accuracy: 0.484375


Epoch: 3 batch: 1 / 2 loss: 0.5541 | 50.00%
----- Epoch 3 ------
Training loss: 0.564961314201355
Training accuracy: 0.796875
Validation loss: 0.6792197227478027
Validation accuracy: 0.5390625


Epoch: 4 batch: 1 / 2 loss: 0.5475 | 50.00%
----- Epoch 4 ------
Training loss: 0.5324174165725708
Training accuracy: 0.90625
Validation loss: 0.679407000541687
Validation accuracy: 0.5390625


Epoch: 5 batch: 1 / 2 loss: 0.5015 | 50.00%
----- Epoch 5 ------
Training loss: 0.5132081508636475
Training accuracy: 0.90625
Validation loss: 0.6791370511054993
Val

**Fine-tune BERT with large dataset**

In [13]:
# # load large dataset (~5 million sentences)

# data_large_train = pd.read_csv('dataset_binary_train_large.csv')[0:128]
# data_large_test = pd.read_csv('dataset_binary_test_large.csv')[0:128]

data_large_train = pd.read_csv('dataset_binary_train.csv')[0:128]
data_large_test = pd.read_csv('dataset_binary_test.csv')[0:128]

X_large_train, y_large_train = data_large_train.data.tolist(), data_large_train.label.tolist()
X_large_test, y_large_test = data_large_test.data.tolist(), data_large_test.label.tolist()

print('Train dataset length: {}'.format(len(X_large_train)))
print('Test dataset length: {}'.format(len(X_large_test)))

Train dataset length: 128
Test dataset length: 128


In [14]:
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 0.7261 | 50.00%
----- Epoch 1 ------
Training loss: 0.7873327732086182
Training accuracy: 0.453125
Validation loss: 0.6916887760162354
Validation accuracy: 0.5


Epoch: 2 batch: 1 / 2 loss: 0.6288 | 50.00%
----- Epoch 2 ------
Training loss: 0.6415930986404419
Training accuracy: 0.65625
Validation loss: 0.72811359167099
Validation accuracy: 0.46875


Epoch: 3 batch: 1 / 2 loss: 0.6299 | 50.00%
----- Epoch 3 ------
Training loss: 0.5988447666168213
Training accuracy: 0.6484375
Validation loss: 0.7456023097038269
Validation accuracy: 0.46875


Epoch: 4 batch: 1 / 2 loss: 0.6101 | 50.00%
----- Epoch 4 ------
Training loss: 0.5905438661575317
Training accuracy: 0.6328125
Validation loss: 0.7308262586593628
Validation accuracy: 0.484375


Epoch: 5 batch: 1 / 2 loss: 0.5157 | 50.00%
----- Epoch 5 ------
Training loss: 0.543961226940155
Training accuracy: 0.65625
Validation loss: 0.7192648649215698
Validation a

**Fine-tune BERT with multi-label data**

In [16]:
# # load multi-label dataset

data_multi_train = pd.read_csv('dataset_multi_num_train.csv')[0:128]
data_multi_test = pd.read_csv('dataset_multi_num_test.csv')[0:128]

X_multi_train, y_multi_train = data_multi_train.data.tolist(), data_multi_train.label.tolist()
X_multi_test, y_multi_test = data_multi_test.data.tolist(), data_multi_test.label.tolist()

print('Train dataset length: {}'.format(len(X_multi_train)))
print('Test dataset length: {}'.format(len(X_multi_test)))

Train dataset length: 128
Test dataset length: 128


In [17]:
transformer = Transformer('BERT', num_labels=5)

train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

----- Processing 128/128 test samples

Training start!
Epoch: 1 batch: 1 / 2 loss: 1.5852 | 50.00%
----- Epoch 1 ------
Training loss: 1.6750962734222412
Training accuracy: 0.203125
Validation loss: 1.631754755973816
Validation accuracy: 0.4375


Epoch: 2 batch: 1 / 2 loss: 1.4894 | 50.00%
----- Epoch 2 ------
Training loss: 1.4906203746795654
Training accuracy: 0.515625
Validation loss: 1.555062174797058
Validation accuracy: 0.453125


Epoch: 3 batch: 1 / 2 loss: 1.4547 | 50.00%
----- Epoch 3 ------
Training loss: 1.3965202569961548
Training accuracy: 0.578125
Validation loss: 1.4769034385681152
Validation accuracy: 0.453125


Epoch: 4 batch: 1 / 2 loss: 1.2172 | 50.00%
----- Epoch 4 ------
Training loss: 1.2891900539398193
Training accuracy: 0.5859375
Validation loss: 1.4319487810134888
Validation accuracy: 0.453125


Epoch: 5 batch: 1 / 2 loss: 1.2884 | 50.00%
----- Epoch 5 ------
Training loss: 1.2508494853973389
Training accuracy: 0.5859375
Validation loss: 1.4134553670883179
Vali