In [None]:
!pip install Sentencepiece
!pip install transformers

Collecting Sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 2.5 MB/s eta 0:00:01[K     |▌                               | 20 kB 4.2 MB/s eta 0:00:01[K     |▉                               | 30 kB 4.6 MB/s eta 0:00:01[K     |█                               | 40 kB 5.4 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.9 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.5 MB/s eta 0:00:01[K     |██                              | 71 kB 5.7 MB/s eta 0:00:01[K     |██▏                             | 81 kB 6.5 MB/s eta 0:00:01[K     |██▍                             | 92 kB 6.5 MB/s eta 0:00:01[K     |██▊                             | 102 kB 6.1 MB/s eta 0:00:01[K     |███                             | 112 kB 6.1 MB/s eta 0:00:01[K     |███▎                            | 122 kB 6.1 MB/s eta 0:00:01[K     |███▌          

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BigBirdTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BigBirdForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import nltk

from models import *

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset files can be read.
# NOTE(review): `_mount` is an underscore-prefixed (non-public) name of
# `google.colab.drive`; confirm it is used deliberately rather than the
# public `drive.mount`.
drive._mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Absolute path (under the '/content/drive' mount point) so this cell is
# idempotent: the previous relative path 'drive/MyDrive/machine_learning'
# only resolved while the working directory was still /content and raised
# FileNotFoundError when the cell was run a second time.
os.chdir('/content/drive/MyDrive/machine_learning')

In [None]:
# Load the train/test splits stored in the `dataset_binary_*` CSV files.
data_train = pd.read_csv('dataset_binary_train.csv')
data_test = pd.read_csv('dataset_binary_test.csv')

# Pull the `data` and `label` columns out of each split as plain Python lists.
X_train = data_train.data.tolist()
y_train = data_train.label.tolist()
X_test = data_test.data.tolist()
y_test = data_test.label.tolist()

# Report the size of each split.
print('Train dataset length: {}'.format(len(X_train)))
print('Test dataset length: {}'.format(len(X_test)))

Train dataset length: 1765172
Test dataset length: 217923


In [None]:
def train_val(train_dataset, test_dataset, transformer_name, transformer, classifier_name=None, classifier=None, lr_transformer=3e-5, lr_classifier=1e-3, batch_size=64, max_epoch=5):
    """Train a model for ``max_epoch`` epochs, validating after every epoch.

    Two modes, selected by ``classifier``:

    * ``classifier is None``: all parameters of ``transformer.model`` are
      optimized with Adam at ``lr_transformer``; loss and logits are read
      from the model output (keys ``'loss'`` and ``'logits'``), and gradients
      are clipped to a norm of 1.0.
    * ``classifier`` given: the transformer is frozen and kept in eval mode;
      only ``classifier`` is optimized (Adam at ``lr_classifier``). It is fed
      the last entry of the transformer output's ``'hidden_states'`` and is
      trained with cross-entropy on its logits.

    After every epoch the metrics are printed, the trained model is saved with
    ``torch.save`` (``'<transformer_name>.pkl'`` or
    ``'<transformer_name>-<classifier_name>.pkl'``, overwritten each epoch)
    and the statistics of all epochs so far are written with ``np.save`` to
    ``'<transformer_name>-<classifier_name>-train_stats_Epoch<k>.npy'``.
    All files are written relative to the current working directory.

    Args:
        train_dataset: dataset yielding ``(x_id, x_mask, y)`` tensor triples.
        test_dataset: dataset of the same form, used for validation.
        transformer_name: label used only to build the output file names.
        transformer: wrapper object exposing the network as ``.model``; the
            network is called as ``model(x_id, token_type_ids=None,
            attention_mask=x_mask, labels=y)`` and must return a mapping.
        classifier_name: label of the classifier, used in file names.
        classifier: optional ``nn.Module`` head; see the modes above.
        lr_transformer: learning rate when fine-tuning the transformer.
        lr_classifier: learning rate when training only the classifier.
        batch_size: batch size of both data loaders.
        max_epoch: number of training epochs.

    Returns:
        None. Results are printed and written to disk.
    """
    # create dataloaders for the tensor datasets
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    val_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)
    # Use the loaders' own lengths: they include the final partial batch and
    # are never 0 for a non-empty dataset (int(len(dataset) / batch_size) is).
    n_train_batches = len(train_dataloader)
    n_val_batches = len(val_dataloader)

    # define device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # unwrap the underlying network and move it to the device
    transformer = transformer.model.to(device)
    train_head_only = classifier is not None

    if train_head_only:
        classifier = classifier.to(device)
        for p in transformer.parameters():  # freeze the layers of transformer
            p.requires_grad = False
        # A frozen feature extractor should behave the same in every epoch;
        # previously it was only switched to eval mode after the first
        # validation pass.
        transformer.eval()
        optimizer = torch.optim.Adam([{"params": classifier.parameters(), 'lr': lr_classifier}])
    else:
        for p in transformer.parameters():
            p.requires_grad = True
        # default learning rate: "suggested by the authors" per the original comment
        optimizer = torch.optim.Adam(transformer.parameters(), lr=lr_transformer)

    # Linear decay over the whole run. `max_epoch` is the caller's value; it
    # used to be overwritten with a hard-coded 5 at this point.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=n_train_batches * max_epoch)
    criterion = F.cross_entropy

    # clean memory in GPU
    torch.cuda.empty_cache()

    # a list to record the state of training
    training_stats = []

    def forward_batch(x_id, x_mask, y):
        """Run one batch through the active model(s) and return (loss, logits)."""
        if train_head_only:
            # The transformer is frozen, so no graph is needed for it.
            # NOTE(review): this relies on the model output containing
            # 'hidden_states' (e.g. output_hidden_states enabled) - confirm
            # in the Transformer wrapper.
            with torch.no_grad():
                word_embedding = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)['hidden_states'][-1]
            logits = classifier(word_embedding)
            return criterion(logits, y), logits
        outputs = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
        return outputs['loss'], outputs['logits']

    print('Training start!')
    for e in range(max_epoch):

        # ---- train ----
        if train_head_only:
            classifier.train()
        else:
            transformer.train()

        epoch_loss = 0.0
        train_acc = 0.0

        for b, (x_id, x_mask, y) in enumerate(train_dataloader):
            x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)

            optimizer.zero_grad()
            loss, logits = forward_batch(x_id, x_mask, y)
            loss.backward()

            # clip gradient (only when the transformer itself is trained)
            if not train_head_only:
                torch.nn.utils.clip_grad_norm_(transformer.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Accumulate plain floats: summing the loss tensors keeps autograd
            # history alive across the epoch and leaks tensors into the
            # saved statistics.
            batch_loss = loss.item()
            epoch_loss += batch_loss
            train_acc += (logits.max(1)[1] == y).float().mean().item()

            print("\rEpoch: {:d} batch: {:d} / {} loss: {:.4f} | {:.2%}".format(e + 1, b + 1, n_train_batches, batch_loss, (b + 1) / n_train_batches), end='', flush=True)

        train_loss = epoch_loss / n_train_batches
        train_acc = train_acc / n_train_batches
        print("\n----- Epoch {} ------\nTraining loss: {}".format(e + 1, train_loss))
        print("Training accuracy: {}".format(train_acc))

        # ---- evaluate ----
        if train_head_only:
            classifier.eval()
        transformer.eval()

        eval_acc = 0.0
        eval_loss = 0.0

        with torch.no_grad():
            for x_id, x_mask, y in val_dataloader:
                x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)
                loss, logits = forward_batch(x_id, x_mask, y)
                eval_loss += loss.item()
                eval_acc += (logits.max(1)[1] == y).float().mean().item()

        val_loss = eval_loss / n_val_batches
        val_acc = eval_acc / n_val_batches
        print("Validation loss: {}".format(val_loss))
        print("Validation accuracy: {}".format(val_acc))
        print("\n")

        training_stats.append(
            {
                'epoch': e + 1,
                'train_loss': train_loss,
                'train_acc': train_acc,
                'val_loss': val_loss,
                'val_acc': val_acc,
            }
        )

        # save models (the file is overwritten every epoch)
        if train_head_only:
            torch.save(classifier, '{}-{}.pkl'.format(transformer_name, classifier_name))
        else:
            torch.save(transformer, '{}.pkl'.format(transformer_name))

        # save states of training (cumulative: all epochs so far)
        np.save('{}-{}-train_stats_Epoch{}.npy'.format(transformer_name, classifier_name, e + 1), training_stats)

    print('Training complete!')

**Fine-tune BERT**

In [None]:
# `Transformer` is not defined in this notebook; presumably it is provided by
# `from models import *` - verify there.
transformer = Transformer('BERT')

# No `classifier` is passed, so train_val optimizes the transformer's own
# parameters and saves the model as 'BERT.pkl'.
train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

----- Processing 1641119/1765172 train samples

**Fine-tune GPT2**

In [None]:
# `Transformer` is not defined in this notebook; presumably it is provided by
# `from models import *` - verify there.
transformer = Transformer('GPT2')

# No `classifier` is passed, so train_val optimizes the transformer's own
# parameters and saves the model as 'GPT2.pkl'.
train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='GPT2', transformer=transformer)

**Fine-tune BIGBIRD**

In [None]:
# `Transformer` is not defined in this notebook; presumably it is provided by
# `from models import *` - verify there.
transformer = Transformer('BIGBIRD')

# No `classifier` is passed, so train_val optimizes the transformer's own
# parameters and saves the model as 'BIGBIRD.pkl'.
train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BIGBIRD', transformer=transformer)

**BERT + BiLSTM**

In [None]:
# `LSTM_attention` and `Transformer` are not defined in this notebook;
# presumably they are provided by `from models import *` - verify there.
lstm = LSTM_attention()
transformer = Transformer('BERT')

# Passing `classifier` makes train_val freeze the transformer and train only
# the classifier, which receives the last entry of the transformer output's
# 'hidden_states'. The classifier is saved as 'BERT-BiLSTM.pkl'.
train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer, classifier_name='BiLSTM', classifier=lstm)

**BERT + TextCNN**

In [None]:
# `textCNN` and `Transformer` are not defined in this notebook; presumably
# they are provided by `from models import *` - verify there.
textcnn = textCNN()
transformer = Transformer('BERT')

# Passing `classifier` makes train_val freeze the transformer and train only
# the classifier, which receives the last entry of the transformer output's
# 'hidden_states'. The classifier is saved as 'BERT-TextCNN.pkl'.
train_dataset, test_dataset = transformer.preprocess_data(X_train, X_test, y_train, y_test)
train_val(train_dataset, test_dataset, transformer_name='BERT', transformer=transformer, classifier_name='TextCNN', classifier=textcnn)

**Fine-tune BERT with large dataset**

In [None]:
# Load the large train/test splits (~5 million sentences) from CSV.
data_large_train = pd.read_csv('dataset_binary_train_large.csv')
data_large_test = pd.read_csv('dataset_binary_test_large.csv')

# Pull the `data` and `label` columns out of each split as plain Python lists.
X_large_train = data_large_train.data.tolist()
y_large_train = data_large_train.label.tolist()
X_large_test = data_large_test.data.tolist()
y_large_test = data_large_test.label.tolist()

# Report the size of each split.
print('Train dataset length: {}'.format(len(X_large_train)))
print('Test dataset length: {}'.format(len(X_large_test)))

In [None]:
# `Transformer` is not defined in this notebook; presumably it is provided by
# `from models import *` - verify there.
transformer = Transformer('BERT')

train_dataset, test_dataset = transformer.preprocess_data(X_large_train, X_large_test, y_large_train, y_large_test)
# Use a run-specific name: train_val builds its checkpoint and stats file
# names from `transformer_name`, and 'BERT' is already used by the earlier
# binary fine-tuning cell, whose 'BERT.pkl' and
# 'BERT-None-train_stats_Epoch*.npy' files would otherwise be overwritten.
train_val(train_dataset, test_dataset, transformer_name='BERT_large_dataset', transformer=transformer)

**Fine-tune BERT with multi-label data**

In [None]:
# Load the multi-label train/test splits from CSV.
data_multi_train = pd.read_csv('dataset_multi_num_train.csv')
data_multi_test = pd.read_csv('dataset_multi_num_test.csv')

# Pull the `data` and `label` columns out of each split as plain Python lists.
X_multi_train = data_multi_train.data.tolist()
y_multi_train = data_multi_train.label.tolist()
X_multi_test = data_multi_test.data.tolist()
y_multi_test = data_multi_test.label.tolist()

# Report the size of each split.
print('Train dataset length: {}'.format(len(X_multi_train)))
print('Test dataset length: {}'.format(len(X_multi_test)))

In [None]:
# `Transformer` is not defined in this notebook; presumably it is provided by
# `from models import *` - verify there.
transformer = Transformer('BERT', num_labels=5)

train_dataset, test_dataset = transformer.preprocess_data(X_multi_train, X_multi_test, y_multi_train, y_multi_test)
# Use a run-specific name: train_val builds its checkpoint and stats file
# names from `transformer_name`, and 'BERT' is already used by the earlier
# binary fine-tuning cell, whose 'BERT.pkl' and
# 'BERT-None-train_stats_Epoch*.npy' files would otherwise be overwritten.
train_val(train_dataset, test_dataset, transformer_name='BERT_multi_label', transformer=transformer)