In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import time
# Initialize the tokenizer that matches the 'bert-base-uncased' checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Read the training and development splits from CSV files in the working directory.
# Later cells read the 'premise', 'hypothesis' and 'label' columns from both frames.
train_data = pd.read_csv('./train.csv')
dev_data = pd.read_csv('./dev.csv')


In [2]:
# Create a dataset by pytorch
class NLIDataset(Dataset):
    """Map-style dataset of (premise, hypothesis, label) triples for a BERT classifier.

    Every item is tokenized lazily on access: the premise and hypothesis are
    encoded together as one sentence pair, truncated and padded to ``max_len``
    tokens, and returned as 1-D tensors.

    Args:
        premises: Iterable of premise texts (list, pandas Series, ...).
        hypotheses: Iterable of hypothesis texts, aligned with ``premises``.
        labels: Iterable of integer class labels, aligned with ``premises``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded pair is truncated/padded to.
    """

    def __init__(self, premises, hypotheses, labels, tokenizer, max_len):
        # Materialise the columns as plain lists so that __getitem__ indexes by
        # POSITION. Indexing a pandas Series with series[i] is label-based, so a
        # Series whose index is not 0..n-1 (filtered, shuffled or split frame)
        # would raise KeyError or silently return the wrong row.
        self.premises = list(premises)
        self.hypotheses = list(hypotheses)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of premise/hypothesis pairs."""
        return len(self.premises)

    def __getitem__(self, item):
        """Encode the pair at position ``item``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (flattened to 1-D)
            and ``labels`` (0-d ``torch.long`` tensor).
        """
        # str() guards against non-string cells (e.g. NaN read from the CSV).
        premise = str(self.premises[item])
        hypothesis = str(self.hypotheses[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            return_overflowing_tokens=False
        )

        # return_tensors='pt' yields a leading batch dimension of 1; flatten it
        # away so the DataLoader can stack items into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [3]:
# preprocess data
# Every premise/hypothesis pair is truncated or padded to this many tokens.
max_len = 128

# Wrap the raw dataframe columns in tokenizing datasets, one per split.
train_dataset = NLIDataset(
    premises=train_data['premise'],
    hypotheses=train_data['hypothesis'],
    labels=train_data['label'],
    tokenizer=tokenizer,
    max_len=max_len,
)
dev_dataset = NLIDataset(
    premises=dev_data['premise'],
    hypotheses=dev_data['hypothesis'],
    labels=dev_data['label'],
    tokenizer=tokenizer,
    max_len=max_len,
)

# Only the training split is shuffled; the dev split keeps file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=16, shuffle=False)


In [4]:
# load bert model
# Two output classes; the classification head is freshly initialised on top of
# the pretrained encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
# Module.to() moves the parameters in place and returns the same module.
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# NOTE(review): recent transformers releases deprecate their own AdamW in favour
# of torch.optim.AdamW - confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Using data loader and optimizer to train the model and using attention mask to pad the blank
def train_model(model, data_loader, optimizer):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a scalar ``.loss``.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer bound to ``model``'s parameters.

    Returns:
        float: Mean loss over the batches seen, or ``nan`` when the loader
        yields no batches (instead of raising ``ZeroDivisionError``).
    """
    # Send batches to wherever the model actually lives rather than assuming it
    # from CUDA availability; fall back to the old rule for parameter-less models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()
    # Drop any gradients left over from before this call so they cannot leak
    # into the first optimizer step.
    optimizer.zero_grad()

    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # get loss for each batch
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1
        # update the weights, then reset gradients for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Training loss: {avg_loss}")
    return avg_loss

# train the model and calculate the duration
# Wall-clock timing around a single call to train_model (one pass over train_loader).
start=time.time()
train_model(model, train_loader, optimizer)
end=time.time()
# Elapsed seconds rounded to milliseconds; the variable name keeps its original spelling.
excecution_time=round(end-start, 3)
print(f'The training time is {excecution_time}s.')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens 

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Training loss: 0.5107185827997334
The training time is 456.164s.


In [5]:
# Serialize the whole fine-tuned model object to ./bert_model.pth.
# NOTE(review): saving a full module pickles the object, so loading it later needs
# the same class definitions to be importable; saving model.state_dict() (or using
# model.save_pretrained) is usually more portable - confirm what downstream loaders expect.
torch.save(model, './bert_model.pth')

In [6]:
# Using dev to evaluate model
def evaluate_model(model, data_loader):
    """Score ``model`` on ``data_loader`` and report binary classification metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with one row per example.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        dict: ``{'accuracy', 'precision', 'recall', 'f1'}`` - the same values
        that are printed, so callers can log or compare them.
    """
    # Send batches to wherever the model actually lives rather than assuming it
    # from CUDA availability; fall back to the old rule for parameter-less models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # get output by input index of token to model
            outputs = model(input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit in each row
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    # calculate the metrics used to evaluate the model
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

# Score the model trained above on the development split (dev_loader is not shuffled).
evaluate_model(model, dev_loader)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Accuracy: 0.8208401365592994, Precision: 0.8368436665677841, Recall: 0.8110983323749281, F1 Score: 0.8237698934150971
