# IMPORT, CONFIG

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import os
os.chdir('/content/drive/MyDrive/BERT_WOS')

In [3]:
!pip install transformers
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 12.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 62.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 56.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo
  Downloading torchinfo-1.7.1-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully i

In [4]:
from transformers import BertForSequenceClassification, AutoTokenizer, logging

import os
import random
import time
import re
from tqdm import tqdm
import pickle
import gensim

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
from torchinfo import summary

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report
from collections import defaultdict
from textwrap import wrap

import warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

In [5]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 8, 6

In [6]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.random.manual_seed(RANDOM_SEED)
torch.cuda.random.manual_seed_all(RANDOM_SEED)
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [7]:
DEVICE

device(type='cuda')

In [8]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

In [9]:
!nvidia-smi

Tue Nov 15 15:19:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# LOAD DATA

In [10]:
target_dir = "./output_bert"

In [11]:
X = list()
with open('./WOS/WOS46985/X.txt','r') as X_file:
    lines = X_file.readlines()
    for line in lines:
        X.append(line.strip())

In [12]:
Y = list()
with open('./WOS/WOS46985/Y.txt','r') as Y_file:
    lines = Y_file.readlines()
    for line in lines:
        Y.append(line.strip())

In [13]:
YL1 = list()
with open('./WOS/WOS46985/YL1.txt','r') as YL1_file:
    lines = YL1_file.readlines()
    for line in lines:
        YL1.append(line.strip())

In [14]:
YL2 = list()
with open('./WOS/WOS46985/YL2.txt','r') as YL2_file:
    lines = YL2_file.readlines()
    for line in lines:
        YL2.append(line.strip())

In [15]:
source = pd.DataFrame({'texts':pd.Series(X, dtype='object').apply(lambda x: re.sub('[^A-Za-z0-9 ]+', '', x)),
 'Y':pd.Series(Y, dtype='float'),
 'YL1':pd.Series(YL1, dtype='float'),
 'YL2':pd.Series(YL2, dtype='float')})
source = source.astype({'Y': int, 'YL1': int, 'YL2':int})

In [16]:
source

Unnamed: 0,texts,Y,YL1,YL2
0,2 1dimensional nonlinear optical waves throug...,12,0,12
1,betaamyloid A beta and tau pathology become in...,74,5,2
2,Decreasing of energy consumption and environme...,68,4,7
3,Hybrid electric vehicles are assumed to play a...,26,1,10
4,L34Dihydroxyphenylalanine LDOPA remains the pr...,115,5,43
...,...,...,...,...
46980,Zusammenfassung Hintergrund Karate erfreut sic...,122,5,50
46981,ZWave is an implementation of home automation ...,15,0,15
46982,Zwitterionic peptides were anchored to a condu...,110,5,38
46983,ZY3 has been acquiring high quality imagery si...,10,0,10


In [17]:
source.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46985 entries, 0 to 46984
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   texts   46985 non-null  object
 1   Y       46985 non-null  int64 
 2   YL1     46985 non-null  int64 
 3   YL2     46985 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.4+ MB


In [18]:
del X, Y, YL1, YL2 #,tokens

In [19]:
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

In [20]:
target_column = 'YL1'
data = source[['texts',target_column]] # source source[source['YL1']==0]

In [21]:
train_df, test_df = train_test_split(data, test_size=0.33, random_state=RANDOM_SEED,stratify=data[target_column].to_list())

In [22]:
len(train_df), len(test_df)

(31479, 15506)

In [23]:
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

# MODEL

In [24]:
class BertModel(nn.Module):
    def __init__(self, requires_grad = True):
        super(BertModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME,num_labels = len(data.groupby(target_column).count().index))
        self.tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True)
        self.requires_grad = requires_grad
        self.device = torch.device(DEVICE)
        for param in self.bert.parameters():
            param.requires_grad = requires_grad  # Each parameter requires gradient

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, labels):
        loss, logits = self.bert(input_ids = batch_seqs, attention_mask = batch_seq_masks, 
                              token_type_ids=batch_seq_segments, labels = labels)[:2]
        probabilities = nn.functional.softmax(logits, dim=-1)
        return loss, logits, probabilities

In [25]:
bertmodel = BertModel(requires_grad = True)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [26]:
summary(bertmodel)

Layer (type:depth-idx)                                       Param #
BertModel                                                    --
├─BertForSequenceClassification: 1-1                         --
│    └─BertModel: 2-1                                        --
│    │    └─BertEmbeddings: 3-1                              23,837,184
│    │    └─BertEncoder: 3-2                                 85,054,464
│    │    └─BertPooler: 3-3                                  590,592
│    └─Dropout: 2-2                                          --
│    └─Linear: 2-3                                           5,383
Total params: 109,487,623
Trainable params: 109,487,623
Non-trainable params: 0

In [27]:
bertmodel.bert.base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [28]:
bertmodel.bert.classifier

Linear(in_features=768, out_features=7, bias=True)

In [29]:
model = bertmodel.to(DEVICE)

# PREPROCESSING

In [30]:
tokenizer = bertmodel.tokenizer

In [31]:
tokenizer.sep_token, tokenizer.sep_token_id

('[SEP]', 102)

In [32]:
tokenizer.cls_token, tokenizer.cls_token_id

('[CLS]', 101)

In [33]:
tokenizer.pad_token, tokenizer.pad_token_id

('[PAD]', 0)

# DATASET

In [34]:
class DataPrecessForSentence(Dataset):
    """
    Encoding sentences
    """
    def __init__(self, bert_tokenizer, df, max_seq_len = 50):
        super(DataPrecessForSentence, self).__init__()
        self.bert_tokenizer = bert_tokenizer
        self.max_seq_len = max_seq_len
        self.input_ids, self.attention_mask, self.token_type_ids, self.labels = self.get_input(df)
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.token_type_ids[idx], self.labels[idx]
        
    # Convert dataframe to tensor
    def get_input(self, df):
        sentences = df['texts'].values
        labels = df[target_column].values
        
        # tokenizer
        tokens_seq = list(map(self.bert_tokenizer.tokenize, sentences)) # list of shape [sentence_len, token_len]
        
        # Get fixed-length sequence and its mask
        result = list(map(self.trunate_and_pad, tokens_seq))
        
        input_ids = [i[0] for i in result]
        attention_mask = [i[1] for i in result]
        token_type_ids = [i[2] for i in result]
        
        return (
               torch.Tensor(input_ids).type(torch.long), 
               torch.Tensor(attention_mask).type(torch.long),
               torch.Tensor(token_type_ids).type(torch.long), 
               torch.Tensor(labels).type(torch.long)
               )
    
    
    def trunate_and_pad(self, tokens_seq):
        
        # Concat '[CLS]' at the beginning
        tokens_seq = ['[CLS]'] + tokens_seq     
        # Truncate sequences of which the lengths exceed the max_seq_len
        if len(tokens_seq) > self.max_seq_len:
            tokens_seq = tokens_seq[0 : self.max_seq_len]           
        # Generate padding
        padding = [0] * (self.max_seq_len - len(tokens_seq))       
        # Convert tokens_seq to token_ids
        input_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens_seq)
        input_ids += padding   
        # Create attention_mask
        attention_mask = [1] * len(tokens_seq) + padding     
        # Create token_type_ids
        token_type_ids = [0] * (self.max_seq_len)
        
        assert len(input_ids) == self.max_seq_len
        assert len(attention_mask) == self.max_seq_len
        assert len(token_type_ids) == self.max_seq_len
        
        return input_ids, attention_mask, token_type_ids

In [35]:
# sentences = (train_df['texts'].values, dev_df['texts'].values, test_df['texts'].values)
# sp = (list(map(tokenizer.tokenize, sentences[0])), list(map(tokenizer.tokenize, sentences[1])), list(map(tokenizer.tokenize, sentences[2])))
# len_sp = (list(map(len,sp[0])),list(map(len,sp[1])),list(map(len,sp[2])))
# max_sp = max(len_sp[0]), max(len_sp[1]), max(len_sp[2])
# if max(max_sp) > 512:
#     MAX_LEN = 512
# else:
#     MAX_LEM = max(max_sp)
# del sentences, sp, len_sp, max_sp
MAX_LEN = 512
MAX_LEN

512

In [36]:
BATCH_SIZE = 12

In [37]:
train_data = DataPrecessForSentence(tokenizer, train_df, max_seq_len = MAX_LEN)
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)

In [40]:
dev_data = DataPrecessForSentence(tokenizer,test_df, max_seq_len = MAX_LEN)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=BATCH_SIZE)

In [42]:
test_data = DataPrecessForSentence(tokenizer,test_df, max_seq_len = MAX_LEN)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

# TRAIN

In [43]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
LR = 2e-05
optimizer_grouped_parameters = [
    {
    'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':0.01
    },
    {
    'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':0.0
    }
]
optimizer = optim.AdamW(optimizer_grouped_parameters, lr=LR)

In [44]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.85, patience=0)

In [45]:
def Metric(y_true, y_pred):
    """
    compute and show the classification result
    """
    accuracy = accuracy_score(y_true, y_pred)
    macro_precision = precision_score(y_true, y_pred, average='macro')
    macro_recall = recall_score(y_true, y_pred, average='macro')
    weighted_f1 = f1_score(y_true, y_pred, average='macro')
    target_names = [f'class_{i}' for i in range(all_prob[0].shape[0])]
    report = classification_report(y_true, y_pred, target_names=target_names, digits=3)

    print('Accuracy: {:.1%}\nPrecision: {:.1%}\nRecall: {:.1%}\nF1: {:.1%}'.format(accuracy, macro_precision,
                                           macro_recall, weighted_f1))
    print("classification_report:\n")
    print(report)
  
  
def correct_predictions(output_probabilities, targets):
    """
    Compute the number of predictions that match some target classes in the
    output of a model.
    Args:
        output_probabilities: A tensor of probabilities for different output
            classes.
        targets: The indices of the actual target classes.
    Returns:
        The number of correct predictions in 'output_probabilities'.
    """
    _, out_classes = output_probabilities.max(dim=1)
    correct = (out_classes == targets).sum()
    return correct.item()


def train(model, dataloader, optimizer, epoch_number, max_gradient_norm):
    """
    Train a model for one epoch on some input data with a given optimizer and
    criterion.
    Args:
        model: A torch module that must be trained on some input data.
        dataloader: A DataLoader object to iterate over the training data.
        optimizer: A torch optimizer to use for training on the input model.
        epoch_number: The number of the epoch for which training is performed.
        max_gradient_norm: Max. norm for gradient norm clipping.
    Returns:
        epoch_time: The total time necessary to train the epoch.
        epoch_loss: The training loss computed for the epoch.
        epoch_accuracy: The accuracy computed for the epoch.
    """
    # Switch the model to train mode.
    model.train()
    device = model.device
    epoch_start = time.time()
    batch_time_avg = 0.0
    running_loss = 0.0
    correct_preds = 0
    tqdm_batch_iterator = tqdm(dataloader)
    for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate(tqdm_batch_iterator):
        batch_start = time.time()
        seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        loss, logits, probabilities = model(seqs, masks, segments, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        batch_time_avg += time.time() - batch_start
        running_loss += loss.item()
        correct_preds += correct_predictions(probabilities, labels)
        description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}"\
                      .format(batch_time_avg/(batch_index+1), running_loss/(batch_index+1))
        tqdm_batch_iterator.set_description(description)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / len(dataloader.dataset)
    return epoch_time, epoch_loss, epoch_accuracy


def validate(model, dataloader):
    """
    Compute the loss and accuracy of a model on some validation dataset.
    Args:
        model: A torch module for which the loss and accuracy must be
            computed.
        dataloader: A DataLoader object to iterate over the validation data.
    Returns:
        epoch_time: The total time to compute the loss and accuracy on the
            entire validation set.
        epoch_loss: The loss computed on the entire validation set.
        epoch_accuracy: The accuracy computed on the entire validation set.
        roc_auc_score(all_labels, all_prob): The auc computed on the entire validation set.
        all_prob: The probability of classification as label 1 on the entire validation set.
    """
    # Switch to evaluate mode.
    model.eval()
    device = model.device
    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            # Move input and output data to the GPU if one is used.
            seqs = batch_seqs.to(device)
            masks = batch_seq_masks.to(device)
            segments = batch_seq_segments.to(device)
            labels = batch_labels.to(device)
            loss, logits, probabilities = model(seqs, masks, segments, labels)
            running_loss += loss.item()
            running_accuracy += correct_predictions(probabilities, labels)
            all_prob.extend(probabilities.cpu().numpy())
            all_labels.extend(batch_labels)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = running_accuracy / (len(dataloader.dataset))
    return epoch_time, epoch_loss, epoch_accuracy, roc_auc_score(all_labels, all_prob,multi_class='ovr'), all_prob
    #return epoch_time, epoch_loss, epoch_accuracy, all_prob

def test(model, dataloader):
    """
    Test the accuracy of a model on some labelled test dataset.
    Args:
        model: The torch module on which testing must be performed.
        dataloader: A DataLoader object to iterate over some dataset.
    Returns:
        batch_time: The average time to predict the classes of a batch.
        total_time: The total time to process the whole dataset.
        accuracy: The accuracy of the model on the input data.
        all_prob: The probability of classification as label 1 on the entire validation set.
    """
    # Switch the model to eval mode.
    model.eval()
    device = model.device
    time_start = time.time()
    batch_time = 0.0
    accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            batch_start = time.time()
            # Move input and output data to the GPU if one is used.
            seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
            _, _, probabilities = model(seqs, masks, segments, labels)
            accuracy += correct_predictions(probabilities, labels)
            batch_time += time.time() - batch_start
            all_prob.extend(probabilities.cpu().numpy())
            all_labels.extend(batch_labels)
    batch_time /= len(dataloader)
    total_time = time.time() - time_start
    accuracy /= (len(dataloader.dataset))

    return batch_time, total_time, accuracy, all_prob

In [46]:
best_score = 0.0
epochs= 7
start_epoch = 1
patience = 1
max_grad_norm = 10.0
if_save_model = True
checkpoint = None

# Data for loss curves plot
epochs_count = []
train_losses = []
train_accuracies = []
valid_losses = []
valid_accuracies = []
valid_aucs = []

if checkpoint:
    checkpoint = torch.load(checkpoint)
    start_epoch = checkpoint["epoch"] + 1
    best_score = checkpoint["best_score"]
    print("\t* Training will continue on existing model from epoch {}...".format(start_epoch))
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    epochs_count = checkpoint["epochs_count"]
    train_losses = checkpoint["train_losses"]
    train_accuracy = checkpoint["train_accuracy"]
    valid_losses = checkpoint["valid_losses"]
    valid_accuracy = checkpoint["valid_accuracy"]
    valid_auc = checkpoint["valid_auc"]

 # Compute loss and accuracy before starting (or resuming) training.
_, valid_loss, valid_accuracy, auc, _ = validate(model, dev_loader)
print("\n* Validation loss before training: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}".format(valid_loss, (valid_accuracy*100), auc))

# -------------------- Training epochs -----------------------------------#

print("\n", 20 * "=", "Training bert model on device: {}".format(DEVICE), 20 * "=")
patience_counter = 0
for epoch in range(start_epoch, epochs + 1):
    epochs_count.append(epoch)

    print("* Training epoch {}:".format(epoch))
    epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer, epoch, max_grad_norm)
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)  
    print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".format(epoch_time, epoch_loss, (epoch_accuracy*100)))

    print("* Validation for epoch {}:".format(epoch))
    epoch_time, epoch_loss, epoch_accuracy , epoch_auc, _, = validate(model, dev_loader)
    valid_losses.append(epoch_loss)
    valid_accuracies.append(epoch_accuracy)
    valid_aucs.append(epoch_auc)
    print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%, auc: {:.4f}\n"
          .format(epoch_time, epoch_loss, (epoch_accuracy*100), epoch_auc))

    # Update the optimizer's learning rate with the scheduler.
    scheduler.step(epoch_accuracy)
    ## scheduler.step()

    # Early stopping on validation accuracy.
    if epoch_accuracy < best_score:
        patience_counter += 1
    else:
        best_score = epoch_accuracy
        patience_counter = 0
        if (if_save_model):
            torch.save({"epoch": epoch, 
                       "model": model.state_dict(),
                       "optimizer": optimizer.state_dict(),
                       "best_score": best_score,
                       "epochs_count": epochs_count,
                       "train_losses": train_losses,
                       "train_accuracy": train_accuracies,
                       "valid_losses": valid_losses,
                       "valid_accuracy": valid_accuracies,
                       "valid_auc": valid_aucs
                       },
                       os.path.join(target_dir, "best.pth.tar"))
            print("save model succesfully!\n")

        # run model on test set and save the prediction result to csv
        print("* Test for epoch {}:".format(epoch))
        _, _, test_accuracy, _, all_prob = validate(model, test_loader)
        print("Test accuracy: {:.4f}%\n".format(test_accuracy))
        columns_names = [f'prob_{i}' for i in range(all_prob[0].shape[0])]
        test_prediction = pd.DataFrame(all_prob,columns=columns_names)
        test_prediction['prediction'] = test_prediction.apply(lambda x: columns_names.index(x.idxmax()) , axis=1)
        test_prediction = test_prediction[[*columns_names, 'prediction']]
        test_prediction.to_csv(os.path.join(target_dir,"test_prediction.csv"), index=False,sep=';')

    if patience_counter >= patience:
        print("-> Early stopping: patience limit reached, stopping...")
        break


* Validation loss before training: 1.9543, accuracy: 13.1046%, auc: 0.5341

* Training epoch 1:


Avg. batch proc. time: 1.1419s, loss: 0.5269: 100%|██████████| 2624/2624 [50:17<00:00,  1.15s/it]


-> Training time: 3017.7690s, loss = 0.5269, accuracy: 82.3787%
* Validation for epoch 1:
-> Valid. time: 540.3349s, loss: 0.3239, accuracy: 89.2945%, auc: 0.9890

save model succesfully!

* Test for epoch 1:
Test accuracy: 0.8929%

* Training epoch 2:


Avg. batch proc. time: 1.1402s, loss: 0.2708: 100%|██████████| 2624/2624 [50:13<00:00,  1.15s/it]


-> Training time: 3013.1779s, loss = 0.2708, accuracy: 91.2132%
* Validation for epoch 2:
-> Valid. time: 541.5907s, loss: 0.3338, accuracy: 89.1397%, auc: 0.9905

-> Early stopping: patience limit reached, stopping...


In [47]:
test_result = pd.read_csv(os.path.join(target_dir, 'test_prediction.csv'),sep=';')

In [48]:
Metric(test_df[target_column], test_result.prediction) 

Accuracy: 89.3%
Precision: 89.8%
Recall: 88.9%
F1: 89.3%
classification_report:

              precision    recall  f1-score   support

     class_0      0.943     0.913     0.928      2150
     class_1      0.889     0.953     0.920      1809
     class_2      0.873     0.815     0.843      2357
     class_3      0.900     0.871     0.886      1088
     class_4      0.897     0.903     0.900      1398
     class_5      0.875     0.920     0.897      4827
     class_6      0.907     0.846     0.875      1877

    accuracy                          0.893     15506
   macro avg      0.898     0.889     0.893     15506
weighted avg      0.893     0.893     0.893     15506



# PREDICTION

In [49]:
def model_load_test(test_df, target_dir, test_prediction_dir, test_prediction_name, max_seq_len=50, batch_size=32):
    """
    Parameters
    ----------
    test_df : pandas dataframe of test set.
    target_dir : the path of pretrained model.
    test_prediction_dir : the path that you want to save the prediction result to.
    test_prediction_name : the file name of the prediction result.
    max_seq_len: the max truncated length.
    batch_size : the default is 32.
    
    """
    
    bertmodel = BertModel(requires_grad = False)
    tokenizer = bertmodel.tokenizer
    device = torch.device("cuda")
    
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(os.path.join(target_dir, "best.pth.tar"))
    else:
        checkpoint = torch.load(os.path.join(target_dir, "best.pth.tar"), map_location=device)
        
    print("\t* Loading test data...")    
    test_data = DataPrecessForSentence(tokenizer,test_df, max_seq_len = max_seq_len) 
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

    # Retrieving model parameters from checkpoint.
    print("\t* Building model...")
    model = bertmodel.to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing BERT model on device: {} ".format(device), 20 * "=")
    
    batch_time, total_time, accuracy, all_prob = test(model, test_loader)
    print("\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n".format(batch_time, total_time, (accuracy*100)))
    
    columns_names = [f'prob_{i}' for i in range(all_prob[0].shape[0])]
    test_prediction = pd.DataFrame(all_prob,columns=columns_names)
    test_prediction['prediction'] = test_prediction.apply(lambda x: columns_names.index(x.idxmax()) , axis=1)
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir, test_prediction_name), index=False)

In [50]:
platform = 'Windows'
model_load_test(test_df, target_dir, target_dir, "test_prediction2.csv", max_seq_len=MAX_LEN, batch_size=BATCH_SIZE)

	* Loading test data...
	* Building model...

-> Average batch processing time: 0.4187s, total test time: 542.0285s, accuracy: 89.2945%

