# IMPORT, CONFIG

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from transformers import BertForSequenceClassification, AutoTokenizer,logging

import os
import random
import time
from tqdm import tqdm
import pickle
import gensim

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
from torchinfo import summary

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, classification_report
from collections import defaultdict
from textwrap import wrap

import warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

In [4]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 8, 6

In [5]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.random.manual_seed(RANDOM_SEED)
torch.cuda.random.manual_seed_all(RANDOM_SEED)
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [6]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

# LOAD DATA

In [7]:
os.chdir('/content/drive/MyDrive/Professional11')

In [8]:
target_dir = "./output_bert_Ohsumed"

In [9]:
train_data = pd.read_csv('./train.csv')

In [10]:
train_data

Unnamed: 0,id,review_text,thumbs_up,score
0,0,It provides latest informations. Easy to use. ...,0,5
1,1,Good service.,0,4
2,2,Looks great and simple to use.,0,4
3,3,"Clear stats, everthing you need to know at a g...",0,4
4,4,Very good app,0,5
...,...,...,...,...
127628,127628,5yr/ longer timeframe charts please.,1,2
127629,127629,My morning app.,0,5
127630,127630,Changes in the portfolio isn't working,2,2
127631,127631,"Literal scum company, screwing over the little...",22,1


In [45]:
test_data_source = pd.read_csv('./test.csv')

In [46]:
test_data_source

Unnamed: 0,id,review_text,thumbs_up
0,127633,I didnt expect it to be real. But just played ...,0
1,127634,"Ver convinent app, good information and news.",2
2,127635,Plots of info and stock proces for free,1
3,127636,"Easy to use, and very helpful for maintaining ...",52
4,127637,"Easy to use, lots of information.",1
...,...,...,...
31904,159537,"I lke Mint a lot, but for me it does not provi...",1
31905,159538,Why do you ask for my phone number?,0
31906,159539,"Great for keeping up with stocks, I use it daily.",0
31907,159540,gives relevant market news. easy to follow.,2


In [13]:
target_column = 'score'

In [14]:
train_df, non_train_df = train_test_split(train_data, test_size=0.2, random_state=RANDOM_SEED,stratify=train_data[target_column].to_list())

In [15]:
dev_df, test_df = train_test_split(non_train_df, test_size=0.5, random_state=RANDOM_SEED,stratify=non_train_df[target_column].to_list())

In [16]:
len(train_df), len(dev_df), len(test_df)

(102106, 12763, 12764)

In [17]:
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

# Model

In [18]:
class BertModel(nn.Module):
    def __init__(self, requires_grad = False):
        super(BertModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME,num_labels = len(train_data.groupby('score').count().index))
        self.tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True)
        self.requires_grad = requires_grad
        self.device = torch.device(DEVICE)
        for param in self.bert.parameters():
            param.requires_grad = requires_grad  # Each parameter requires gradient

    def forward(self, batch_seqs, batch_seq_masks, batch_seq_segments, labels):
        loss, logits = self.bert(input_ids = batch_seqs, attention_mask = batch_seq_masks, 
                              token_type_ids=batch_seq_segments, labels = labels)[:2]
        probabilities = nn.functional.softmax(logits, dim=-1)
        return loss, logits, probabilities

In [19]:
bertmodel = BertModel(requires_grad = False)

In [20]:
summary(bertmodel)

Layer (type:depth-idx)                                       Param #
BertModel                                                    --
├─BertForSequenceClassification: 1-1                         --
│    └─BertModel: 2-1                                        --
│    │    └─BertEmbeddings: 3-1                              (23,837,184)
│    │    └─BertEncoder: 3-2                                 (85,054,464)
│    │    └─BertPooler: 3-3                                  (590,592)
│    └─Dropout: 2-2                                          --
│    └─Linear: 2-3                                           (4,614)
Total params: 109,486,854
Trainable params: 0
Non-trainable params: 109,486,854

In [21]:
for name, param in bertmodel.bert.named_parameters():
    if name == 'classifier.weight' or name == 'classifier.bias':
        param.requires_grad = True

In [22]:
bertmodel.bert.base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [23]:
bertmodel.bert.classifier

Linear(in_features=768, out_features=6, bias=True)

In [24]:
model = bertmodel.to(DEVICE)

# PREPROCESSING

In [25]:
tokenizer = bertmodel.tokenizer

In [26]:
tokenizer.sep_token, tokenizer.sep_token_id

('[SEP]', 102)

In [27]:
tokenizer.cls_token, tokenizer.cls_token_id

('[CLS]', 101)

In [28]:
tokenizer.pad_token, tokenizer.pad_token_id

('[PAD]', 0)

# DATASET

In [29]:
class DataPrecessForSentence(Dataset):
    """
    Encoding sentences
    """
    def __init__(self, bert_tokenizer, df, max_seq_len = 50):
        super(DataPrecessForSentence, self).__init__()
        self.bert_tokenizer = bert_tokenizer
        self.max_seq_len = max_seq_len
        self.input_ids, self.attention_mask, self.token_type_ids, self.labels = self.get_input(df)
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.token_type_ids[idx], self.labels[idx]
        
    # Convert dataframe to tensor
    def get_input(self, df):
        sentences = df['review_text'].values
        labels = df[target_column].values
        
        # tokenizer
        tokens_seq = list(map(self.bert_tokenizer.tokenize, sentences)) # list of shape [sentence_len, token_len]
        
        # Get fixed-length sequence and its mask
        result = list(map(self.trunate_and_pad, tokens_seq))
        
        input_ids = [i[0] for i in result]
        attention_mask = [i[1] for i in result]
        token_type_ids = [i[2] for i in result]
        
        return (
               torch.Tensor(input_ids).type(torch.long), 
               torch.Tensor(attention_mask).type(torch.long),
               torch.Tensor(token_type_ids).type(torch.long), 
               torch.Tensor(labels).type(torch.long)
               )
    
    
    def trunate_and_pad(self, tokens_seq):
        
        # Concat '[CLS]' at the beginning
        tokens_seq = ['[CLS]'] + tokens_seq     
        # Truncate sequences of which the lengths exceed the max_seq_len
        if len(tokens_seq) > self.max_seq_len:
            tokens_seq = tokens_seq[0 : self.max_seq_len]           
        # Generate padding
        padding = [0] * (self.max_seq_len - len(tokens_seq))       
        # Convert tokens_seq to token_ids
        input_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens_seq)
        input_ids += padding   
        # Create attention_mask
        attention_mask = [1] * len(tokens_seq) + padding     
        # Create token_type_ids
        token_type_ids = [0] * (self.max_seq_len)
        
        assert len(input_ids) == self.max_seq_len
        assert len(attention_mask) == self.max_seq_len
        assert len(token_type_ids) == self.max_seq_len
        
        return input_ids, attention_mask, token_type_ids

In [30]:
# sentences = (train_df['review_text'].values, dev_df['review_text'].values, test_df['review_text'].values)
# sp = (list(map(tokenizer.tokenize, sentences[0])), list(map(tokenizer.tokenize, sentences[1])), list(map(tokenizer.tokenize, sentences[2])))
# len_sp = (list(map(len,sp[0])),list(map(len,sp[1])),list(map(len,sp[2])))
# max_sp = max(len_sp[0]), max(len_sp[1]), max(len_sp[2])
# if max(max_sp) > 512:
#     MAX_LEN = 512
# else:
#     MAX_LEM = max(max_sp)
# del sentences, sp, len_sp, max_sp
MAX_LEN = 512
MAX_LEN

512

In [31]:
BATCH_SIZE = 150

In [32]:
train_data = DataPrecessForSentence(tokenizer, train_df, max_seq_len = MAX_LEN)
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)

In [33]:
dev_data = DataPrecessForSentence(tokenizer,dev_df, max_seq_len = MAX_LEN)
dev_loader = DataLoader(dev_data, shuffle=True, batch_size=BATCH_SIZE)

In [34]:
test_data = DataPrecessForSentence(tokenizer,test_df, max_seq_len = MAX_LEN)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

# TRAIN

In [35]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
LR = 2e-05
optimizer_grouped_parameters = [
    {
    'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':0.01
    },
    {
    'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':0.0
    }
]
optimizer = optim.AdamW(optimizer_grouped_parameters, lr=LR)

In [36]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.85, patience=0)

In [55]:
def Metric(y_true, y_pred):
    """
    compute and show the classification result
    """
    accuracy = accuracy_score(y_true, y_pred)
    macro_precision = precision_score(y_true, y_pred, average='macro')
    macro_recall = recall_score(y_true, y_pred, average='macro')
    weighted_f1 = f1_score(y_true, y_pred, average='macro')
    target_names = [f'class_{i}' for i in range(all_prob[0].shape[0])]
    report = classification_report(y_true, y_pred, target_names=target_names, digits=3)

    print('Accuracy: {:.1%}\nPrecision: {:.1%}\nRecall: {:.1%}\nF1: {:.1%}'.format(accuracy, macro_precision,
                                           macro_recall, weighted_f1))
    print("classification_report:\n")
    print(report)
  
  
def correct_predictions(output_probabilities, targets):
    """
    Compute the number of predictions that match some target classes in the
    output of a model.
    Args:
        output_probabilities: A tensor of probabilities for different output
            classes.
        targets: The indices of the actual target classes.
    Returns:
        The number of correct predictions in 'output_probabilities'.
    """
    _, out_classes = output_probabilities.max(dim=1)
    correct = (out_classes == targets).sum()
    return correct.item()


def train(model, dataloader, optimizer, epoch_number, max_gradient_norm):
    """
    Train a model for one epoch on some input data with a given optimizer and
    criterion.
    Args:
        model: A torch module that must be trained on some input data.
        dataloader: A DataLoader object to iterate over the training data.
        optimizer: A torch optimizer to use for training on the input model.
        epoch_number: The number of the epoch for which training is performed.
        max_gradient_norm: Max. norm for gradient norm clipping.
    Returns:
        epoch_time: The total time necessary to train the epoch.
        epoch_loss: The training loss computed for the epoch.
        epoch_accuracy: The accuracy computed for the epoch.
    """
    # Switch the model to train mode.
    model.train()
    device = model.device
    epoch_start = time.time()
    batch_time_avg = 0.0
    running_loss = 0.0
    correct_preds = 0
    tqdm_batch_iterator = tqdm(dataloader)
    for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate(tqdm_batch_iterator):
        batch_start = time.time()
        seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        loss, logits, probabilities = model(seqs, masks, segments, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step()
        batch_time_avg += time.time() - batch_start
        running_loss += loss.item()
        correct_preds += correct_predictions(probabilities, labels)
        description = "Avg. batch proc. time: {:.4f}s, loss: {:.4f}"\
                      .format(batch_time_avg/(batch_index+1), running_loss/(batch_index+1))
        tqdm_batch_iterator.set_description(description)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = correct_preds / len(dataloader.dataset)
    return epoch_time, epoch_loss, epoch_accuracy


def validate(model, dataloader):
    """
    Compute the loss and accuracy of a model on some validation dataset.
    Args:
        model: A torch module for which the loss and accuracy must be
            computed.
        dataloader: A DataLoader object to iterate over the validation data.
    Returns:
        epoch_time: The total time to compute the loss and accuracy on the
            entire validation set.
        epoch_loss: The loss computed on the entire validation set.
        epoch_accuracy: The accuracy computed on the entire validation set.
        roc_auc_score(all_labels, all_prob): The auc computed on the entire validation set.
        all_prob: The probability of classification as label 1 on the entire validation set.
    """
    # Switch to evaluate mode.
    model.eval()
    device = model.device
    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            # Move input and output data to the GPU if one is used.
            seqs = batch_seqs.to(device)
            masks = batch_seq_masks.to(device)
            segments = batch_seq_segments.to(device)
            labels = batch_labels.to(device)
            loss, logits, probabilities = model(seqs, masks, segments, labels)
            running_loss += loss.item()
            running_accuracy += correct_predictions(probabilities, labels)
            all_prob.extend(probabilities.cpu().numpy())
            all_labels.extend(batch_labels)
    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / len(dataloader)
    epoch_accuracy = running_accuracy / (len(dataloader.dataset))
    return epoch_time, epoch_loss, epoch_accuracy, all_prob

def test(model, dataloader):
    """
    Test the accuracy of a model on some labelled test dataset.
    Args:
        model: The torch module on which testing must be performed.
        dataloader: A DataLoader object to iterate over some dataset.
    Returns:
        batch_time: The average time to predict the classes of a batch.
        total_time: The total time to process the whole dataset.
        accuracy: The accuracy of the model on the input data.
        all_prob: The probability of classification as label 1 on the entire validation set.
    """
    # Switch the model to eval mode.
    model.eval()
    device = model.device
    time_start = time.time()
    batch_time = 0.0
    accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            batch_start = time.time()
            # Move input and output data to the GPU if one is used.
            seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
            _, _, probabilities = model(seqs, masks, segments, labels)
            accuracy += correct_predictions(probabilities, labels)
            batch_time += time.time() - batch_start
            all_prob.extend(probabilities.cpu().numpy())
            all_labels.extend(batch_labels)
    batch_time /= len(dataloader)
    total_time = time.time() - time_start
    accuracy /= (len(dataloader.dataset))

    return batch_time, total_time, accuracy, all_prob

In [38]:
best_score = 0.0
epochs=10
start_epoch = 1
patience = 1
max_grad_norm = 10.0
if_save_model = True
checkpoint = None

# Data for loss curves plot
epochs_count = []
train_losses = []
train_accuracies = []
valid_losses = []
valid_accuracies = []

if checkpoint:
    checkpoint = torch.load(checkpoint)
    start_epoch = checkpoint["epoch"] + 1
    best_score = checkpoint["best_score"]
    print("\t* Training will continue on existing model from epoch {}...".format(start_epoch))
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    epochs_count = checkpoint["epochs_count"]
    train_losses = checkpoint["train_losses"]
    train_accuracy = checkpoint["train_accuracy"]
    valid_losses = checkpoint["valid_losses"]
    valid_accuracy = checkpoint["valid_accuracy"]

 # Compute loss and accuracy before starting (or resuming) training.
_, valid_loss, valid_accuracy,  _, = validate(model, dev_loader)
print("\n* Validation loss before training: {:.4f}, accuracy: {:.4f}%".format(valid_loss, (valid_accuracy*100)))

# -------------------- Training epochs -----------------------------------#

print("\n", 20 * "=", "Training bert model on device: {}".format(DEVICE), 20 * "=")
patience_counter = 0
for epoch in range(start_epoch, epochs + 1):
    epochs_count.append(epoch)

    print("* Training epoch {}:".format(epoch))
    epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader, optimizer, epoch, max_grad_norm)
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)  
    print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".format(epoch_time, epoch_loss, (epoch_accuracy*100)))

    print("* Validation for epoch {}:".format(epoch))
    epoch_time, epoch_loss, epoch_accuracy, _, = validate(model, dev_loader)
    valid_losses.append(epoch_loss)
    valid_accuracies.append(epoch_accuracy)
    print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n"
          .format(epoch_time, epoch_loss, (epoch_accuracy*100)))

    # Update the optimizer's learning rate with the scheduler.
    scheduler.step(epoch_accuracy)
    ## scheduler.step()

    # Early stopping on validation accuracy.
    if epoch_accuracy < best_score:
        patience_counter += 1
    else:
        best_score = epoch_accuracy
        patience_counter = 0
        if (if_save_model):
            torch.save({"epoch": epoch, 
                       "model": model.state_dict(),
                       "optimizer": optimizer.state_dict(),
                       "best_score": best_score,
                       "epochs_count": epochs_count,
                       "train_losses": train_losses,
                       "train_accuracy": train_accuracies,
                       "valid_losses": valid_losses,
                       "valid_accuracy": valid_accuracies,
                       },
                       os.path.join(target_dir, "best.pth.tar"))
            print("save model succesfully!\n")

        # run model on test set and save the prediction result to csv
        print("* Test for epoch {}:".format(epoch))
        _, _, test_accuracy, all_prob = validate(model, test_loader)
        print("Test accuracy: {:.4f}%\n".format(test_accuracy))
        columns_names = [f'prob_{i}' for i in range(all_prob[0].shape[0])]
        test_prediction = pd.DataFrame(all_prob,columns=columns_names)
        test_prediction['prediction'] = test_prediction.apply(lambda x: columns_names.index(x.idxmax()) , axis=1)
        test_prediction = test_prediction[[*columns_names, 'prediction']]
        test_prediction.to_csv(os.path.join(target_dir,"test_prediction.csv"), index=False,sep=';')

    if patience_counter >= patience:
        print("-> Early stopping: patience limit reached, stopping...")
        break


* Validation loss before training: 1.6113, accuracy: 52.3701%

* Training epoch 1:


Avg. batch proc. time: 0.0172s, loss: 1.3439: 100%|██████████| 681/681 [58:30<00:00,  5.16s/it]


-> Training time: 3510.9296s, loss = 1.3439, accuracy: 53.5532%
* Validation for epoch 1:
-> Valid. time: 417.6336s, loss: 1.2399, accuracy: 54.5013%

save model succesfully!

* Test for epoch 1:
Test accuracy: 0.5451%

* Training epoch 2:


Avg. batch proc. time: 0.0168s, loss: 1.2447: 100%|██████████| 681/681 [58:27<00:00,  5.15s/it]


-> Training time: 3507.6975s, loss = 1.2447, accuracy: 54.4131%
* Validation for epoch 2:
-> Valid. time: 416.6478s, loss: 1.2060, accuracy: 54.6502%

save model succesfully!

* Test for epoch 2:
Test accuracy: 0.5468%

* Training epoch 3:


Avg. batch proc. time: 0.0172s, loss: 1.2149: 100%|██████████| 681/681 [58:29<00:00,  5.15s/it]


-> Training time: 3509.4676s, loss = 1.2149, accuracy: 54.7108%
* Validation for epoch 3:
-> Valid. time: 417.8606s, loss: 1.1844, accuracy: 55.2691%

save model succesfully!

* Test for epoch 3:
Test accuracy: 0.5529%

* Training epoch 4:


Avg. batch proc. time: 0.0168s, loss: 1.1996:  32%|███▏      | 221/681 [19:04<39:43,  5.18s/it]


KeyboardInterrupt: ignored

In [40]:
test_result = pd.read_csv(os.path.join(target_dir, 'test_prediction.csv'),sep=';')

In [41]:
Metric(test_df[target_column], test_result.prediction) 

Accuracy: 55.3%
Precision: 16.1%
Recall: 17.8%
F1: 13.9%
classification_report:

              precision    recall  f1-score   support

     class_0      0.000     0.000     0.000         5
     class_1      0.407     0.072     0.123      1760
     class_2      0.000     0.000     0.000       715
     class_3      0.000     0.000     0.000       885
     class_4      0.000     0.000     0.000      2439
     class_5      0.557     0.996     0.714      6960

    accuracy                          0.553     12764
   macro avg      0.161     0.178     0.139     12764
weighted avg      0.360     0.553     0.406     12764



In [None]:
test_data

In [72]:
class DataPrecessForSentenceTest(Dataset):
    """
    Encoding sentences
    """
    def __init__(self, bert_tokenizer, df, max_seq_len = 50):
        super(DataPrecessForSentenceTest, self).__init__()
        self.bert_tokenizer = bert_tokenizer
        self.max_seq_len = max_seq_len
        self.input_ids, self.attention_mask, self.token_type_ids, self.labels = self.get_input(df)
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.token_type_ids[idx], self.labels[idx]
        
    # Convert dataframe to tensor
    def get_input(self, df):
        sentences = df['review_text'].values
        labels = np.zeros(len(sentences))
        
        # tokenizer
        tokens_seq = list(map(self.bert_tokenizer.tokenize, sentences)) # list of shape [sentence_len, token_len]
        
        # Get fixed-length sequence and its mask
        result = list(map(self.trunate_and_pad, tokens_seq))
        
        input_ids = [i[0] for i in result]
        attention_mask = [i[1] for i in result]
        token_type_ids = [i[2] for i in result]
        
        return (
               torch.Tensor(input_ids).type(torch.long), 
               torch.Tensor(attention_mask).type(torch.long),
               torch.Tensor(token_type_ids).type(torch.long), 
               torch.Tensor(labels).type(torch.long)
               )
    
    
    def trunate_and_pad(self, tokens_seq):
        
        # Concat '[CLS]' at the beginning
        tokens_seq = ['[CLS]'] + tokens_seq     
        # Truncate sequences of which the lengths exceed the max_seq_len
        if len(tokens_seq) > self.max_seq_len:
            tokens_seq = tokens_seq[0 : self.max_seq_len]           
        # Generate padding
        padding = [0] * (self.max_seq_len - len(tokens_seq))       
        # Convert tokens_seq to token_ids
        input_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens_seq)
        input_ids += padding   
        # Create attention_mask
        attention_mask = [1] * len(tokens_seq) + padding     
        # Create token_type_ids
        token_type_ids = [0] * (self.max_seq_len)
        
        assert len(input_ids) == self.max_seq_len
        assert len(attention_mask) == self.max_seq_len
        assert len(token_type_ids) == self.max_seq_len
        
        return input_ids, attention_mask, token_type_ids

In [93]:
test_data = DataPrecessForSentenceTest(tokenizer,test_data_source, max_seq_len = MAX_LEN)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

In [94]:
def test1(model, dataloader):
    """
    Test the accuracy of a model on some labelled test dataset.
    Args:
        model: The torch module on which testing must be performed.
        dataloader: A DataLoader object to iterate over some dataset.
    Returns:
        batch_time: The average time to predict the classes of a batch.
        total_time: The total time to process the whole dataset.
        accuracy: The accuracy of the model on the input data.
        all_prob: The probability of classification as label 1 on the entire validation set.
    """
    # Switch the model to eval mode.
    model.eval()
    device = model.device
    time_start = time.time()
    batch_time = 0.0
    accuracy = 0.0
    all_prob = []
    all_labels = []
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in dataloader:
            batch_start = time.time()
            # Move input and output data to the GPU if one is used.
            seqs, masks, segments, labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
            _, _, probabilities = model(seqs, masks, segments, labels)
            accuracy += correct_predictions(probabilities, labels)
            batch_time += time.time() - batch_start
            all_prob.extend(probabilities.cpu().numpy())
            all_labels.extend(batch_labels)
    batch_time /= len(dataloader)
    total_time = time.time() - time_start
    accuracy /= (len(dataloader.dataset))

    return batch_time, total_time, accuracy, all_prob

In [95]:
_, _, test_accuracy, all_prob = test1(model, test_loader)

In [1]:
test_prediction = pd.DataFrame(all_prob,columns=columns_names)

NameError: ignored

In [2]:
my_prediction = pd.DataFrame({'score':test_prediction.apply(lambda x: columns_names.index(x.idxmax()) , axis=1)})

NameError: ignored

In [3]:
my_prediction.to_csv('MySubmission.csv',index=False)

NameError: ignored