In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset is read from
# this mount in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset from the zipped CSV stored on the mounted Google Drive.
# Alternative kept for reference: read an uncompressed copy from the working directory.
# df = pd.read_csv('bib_data_union_v3.csv')
df = pd.read_csv('/content/drive/MyDrive/bib_data_union_v3.csv.zip',compression='zip')

In [None]:
# Hugging Face checkpoint name; the same value is used to load both the
# tokenizer and the encoder in later cells. The commented lines are
# alternative checkpoints kept for experimentation.
model_name = "distilbert-base-uncased"
# model_name = "prajjwal1/bert-mini"
# model_name = "microsoft/deberta-base"


In [None]:
from sklearn.preprocessing import LabelEncoder
def encode(df):
    """Integer-encode the ``style_name`` column of ``df`` in place.

    The column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that same column.

    @param    df (pd.DataFrame): frame that contains a ``style_name`` column.
                  The frame is modified in place.
    @return   the same ``df`` object, with ``style_name`` overwritten.

    Encoding is best-effort: if fitting/transforming a column fails, the
    error is printed (including its cause) and the column is left unchanged.
    A missing ``style_name`` column still raises ``KeyError``, because the
    column selection happens before the guarded section.
    """
    columnsToEncode = list(df[['style_name']])
    le = LabelEncoder()
    for feature in columnsToEncode:
        try:
            df[feature] = le.fit_transform(df[feature])
        except Exception as exc:
            # Catch only ordinary errors: a bare ``except`` would also swallow
            # KeyboardInterrupt/SystemExit. Report the cause so that a failed
            # encoding is not mistaken for a successful one further down.
            print('Error encoding ' + feature + ': ' + repr(exc))
    return df


In [None]:
from sklearn.model_selection import train_test_split

# Label-encode the target column. encode() modifies `df` in place and returns
# the same object, so `data` and `df` refer to the same frame.
data = encode(df)

# 80/20 train/validation split. The index is reset so that each frame is
# numbered 0..n-1 (the old index is kept as an extra `index` column).
train_data, val_data = train_test_split(data, test_size=0.2, random_state=2020)
train_data = train_data.reset_index()
val_data = val_data.reset_index()

# Evaluation frame: the large (98.3%) part of a second, independent split.
# Only the first part is kept. Previously the small remainder was assigned to
# `val_data`, silently replacing the validation set above with a frame whose
# index had not been reset.
# NOTE(review): this split is drawn from all of `data` with a different seed,
# so `eval_data` overlaps `train_data`; metrics computed on it are not
# held-out metrics. Confirm whether that is intended.
eval_data = train_test_split(data, test_size=0.017, random_state=0)[0]
eval_data = eval_data.reset_index()

In [None]:
# Extra vocabulary entries registered with the tokenizer.
# NOTE(review): presumably these are placeholder tokens that occur in the
# `tokenized_record` column - confirm against the preprocessing step.
new_tokens = ["upword", "capword", "othword", "caplet", "smallet", "year", "num", "sp"]
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Vocabulary size before and after adding the new tokens.
print(len(tokenizer))
tokenizer.add_tokens(new_tokens)
print(len(tokenizer))

30522
30528


In [None]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset that tokenizes one record per item.

    Each item is built from one row of ``data``: the ``tokenized_record``
    column is encoded (truncated/padded to ``max_length``) and the
    ``style_name`` column is used as the label.

    @param    data (pd.DataFrame): frame with ``tokenized_record`` and
                  ``style_name`` columns.
    @param    tokenizer: object exposing ``encode_plus``; when ``None`` the
                  notebook-level ``tokenizer`` is looked up at item time.
    @param    max_length (int): length every sequence is truncated/padded to.
    """
    def __init__(self, data, tokenizer=None, max_length=150):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``."""
        # Fall back to the notebook-level tokenizer when none was supplied.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer

        # Positional access: ``idx`` comes from a sampler and counts rows
        # 0..len-1, so it must not be interpreted as an index *label*
        # (which breaks on frames whose index was not reset).
        record = self.data["tokenized_record"].iloc[idx]
        label = self.data["style_name"].iloc[idx]

        encoded_sent = tok.encode_plus(
            text=record,
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=self.max_length,     # Max length to truncate/pad
            return_attention_mask=True,     # Return attention mask
            truncation=True,
            padding="max_length"
            )

        return {
            "input_ids": torch.tensor(encoded_sent.get('input_ids')),
            "attention_mask": torch.tensor(encoded_sent.get('attention_mask')),
            "labels": torch.tensor(label),
        }

    def create_dataloader(self, batch_size=40, num_workers=1, shuffle=False):
        """Wrap this dataset in a ``DataLoader`` with the given batching options."""
        return DataLoader(
            self,
            batch_size=batch_size,
            num_workers=num_workers,
            shuffle=shuffle
        )

In [None]:
# Batch loaders over the train/validation frames: 600 records per batch,
# one worker process each; only the training loader shuffles.
train_dataloader = MyDataset(train_data).create_dataloader(batch_size=600, shuffle=True, num_workers=1)
val_dataloader = MyDataset(val_data).create_dataloader(batch_size=600, num_workers=1)


In [None]:
# Smaller validation loader over the first 20000 validation rows; train()
# reads this module-level name for its mid-epoch evaluation.
val_dataloader2 = MyDataset(val_data[:20000]).create_dataloader(batch_size=600, num_workers=1)

In [None]:

import torch
import torch.nn as nn
from transformers import PreTrainedModel
import transformers

# Create the BertClassfier class
class DistilBertClassifier(PreTrainedModel):
    """Pretrained encoder followed by a small feed-forward classification head.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)`` and
    its embedding matrix is resized to ``len(tokenizer)``; both ``model_name``
    and ``tokenizer`` are notebook-level globals.
    """
    def __init__(self, bert_config, freeze_bert=False, num_classes=91,
                 classifier_hidden_size=300, classifier_dropout=0.05):
        """
        @param    bert_config: configuration object forwarded to
                      ``PreTrainedModel.__init__``. The encoder itself is loaded
                      from the ``model_name`` checkpoint, not built from this
                      object.
        @param    freeze_bert (bool): set `True` to disable gradients for the
                      encoder, `False` to fine-tune it
        @param    num_classes (int): size of the output layer
        @param    classifier_hidden_size (int): width of the hidden layer of
                      the classification head
        @param    classifier_dropout (float): dropout probability inside the
                      classification head
        """
        super().__init__(bert_config)

        # Instantiate the pretrained encoder and make room for the tokens that
        # were added to the tokenizer.
        self.bert = AutoModel.from_pretrained(model_name)
        self.bert.resize_token_embeddings(len(tokenizer))

        # Feed-forward head: GELU -> Linear -> GELU -> Dropout -> Linear.
        # Layer order (and therefore state_dict keys) is kept unchanged so
        # that existing checkpoints still load.
        self.classifier = nn.Sequential(
            nn.GELU(),
            nn.Linear(self.bert.config.hidden_size, classifier_hidden_size),
            nn.GELU(),
            nn.Dropout(classifier_dropout),
            nn.Linear(classifier_hidden_size, num_classes)
        )

        # Optionally freeze the encoder so that only the head is trained.
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to the encoder and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_classes)
        """
        # Feed input to the encoder
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # First element of the encoder output, taken at sequence position 0
        # (the `[CLS]` token when special tokens are added by the tokenizer).
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed the pooled vector to the classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [None]:
import torch

# Choose the compute device once; later cells read the module-level `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: NVIDIA A100-SXM4-40GB


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoConfig

def initialize_model(epochs=4, lr=2e-5):
    """Initialize the classifier, the optimizer and the learning rate scheduler.

    Reads the notebook-level globals ``model_name``, ``device`` and
    ``train_dataloader``.

    @param    epochs (int): number of epochs the scheduler is sized for.
    @param    lr (float): AdamW learning rate, also used as the scheduler's
                  ``max_lr``.
    @return   (bert_classifier, optimizer, scheduler)
    """
    # Configuration handed to PreTrainedModel.__init__.
    # NOTE(review): DistilBertClassifier loads its encoder with
    # AutoModel.from_pretrained(model_name), so these overrides (hidden_size,
    # num_hidden_layers, ...) do not appear to change the encoder that is
    # actually trained - confirm whether they are still needed.
    bert_config = AutoConfig.from_pretrained(model_name)
    bert_config.update(
        {"hidden_dropout_prob": 0.1, "layer_norm_eps": 1e-7, "max_position_embeddings": 270, "hidden_size": 800, "intermediate_size": 500, "num_hidden_layers": 3,
         "initializer_range": 2}
    )

    bert_classifier = DistilBertClassifier(bert_config, freeze_bert=False)

    # Move the model to the selected device
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(
        bert_classifier.parameters(),
        lr=lr,
        eps=1e-8,
    )

    # One-cycle schedule sized for one scheduler step per training batch
    # (len(train_dataloader) * epochs steps in total). pct_start=0.0 gives the
    # warm-up phase no share of the cycle.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        pct_start=0.0,
        max_lr=lr,
        steps_per_epoch=len(train_dataloader),
        epochs=epochs,
        final_div_factor=1e4,
    )

    return bert_classifier, optimizer, scheduler


In [None]:
import random
import time
from sklearn.metrics import f1_score

# Cross-entropy loss over the class logits; train() and evaluate() read this
# module-level name.
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=32):
    """Seed every random number generator this notebook relies on.

    The same value is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's CPU generator and all CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False,accum_steps=3):
    """Train the classifier with optional gradient accumulation.

    Reads the notebook-level globals ``optimizer``, ``scheduler``, ``loss_fn``,
    ``device`` and ``val_dataloader2`` (the loader used for the mid-epoch
    evaluation every 500 steps).

    @param    model: module called as ``model(input_ids, attention_mask)`` and
                  returning logits.
    @param    train_dataloader: iterable of batches with ``input_ids``,
                  ``attention_mask`` and ``labels`` entries; must support ``len``.
    @param    val_dataloader: loader used for the end-of-epoch evaluation.
    @param    epochs (int): number of passes over ``train_dataloader``.
    @param    evaluation (bool): evaluate and checkpoint after every epoch.
    @param    accum_steps (int): number of batches whose gradients are
                  accumulated before one optimizer/scheduler step.
    @raises   ValueError: if ``evaluation`` is set without a ``val_dataloader``.
    """
    # Fail before training starts rather than after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # torch.save does not create missing directories; make sure the checkpoint
    # folder exists so a long run is not lost at the first save.
    checkpoint_dir = "drive/MyDrive/bibtex_checkpoints"
    os.makedirs(checkpoint_dir, exist_ok=True)

    num_batches = len(train_dataloader)

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # Start every epoch from clean gradients (e.g. after an interrupted run)
        model.zero_grad()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            batch_counts +=1
            # Load batch to the selected device
            b_input_ids, b_attn_mask, b_labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the (unscaled) loss values for logging
            loss = loss_fn(logits, b_labels)

            batch_loss += loss.item()
            total_loss += loss.item()

            # Scale so that the accumulated gradient is an average over the window
            loss = loss / accum_steps

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update parameters and the learning rate at the end of every
            # accumulation window. The last batch of the epoch also triggers an
            # update so that an incomplete window is applied instead of leaking
            # its gradients into the next epoch (or being dropped entirely).
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % accum_steps == 0 or is_last_batch:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                # Zero out any previously calculated gradients
                model.zero_grad()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

            # Mid-epoch evaluation and checkpoint every 500 steps. This uses the
            # notebook-level `val_dataloader2`, not the `val_dataloader` argument.
            # NOTE(review): the file name depends only on the step, so later
            # epochs overwrite the checkpoints of earlier ones.
            if (step % 500 == 0 and step != 0):
                val_loss, val_accuracy = evaluate(model, val_dataloader2)

                time_elapsed = time.time() - t0_epoch

                print(f"{epoch_i + 1:^7} | {'-':^7} | ------------ | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
                print("-"*70)
                # evaluate() leaves the model in eval mode; switch back
                model.train()
                torch.save(model.state_dict(), f"{checkpoint_dir}/{step // 500}.pt")

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / num_batches

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's
            # performance on the validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
            torch.save(model.state_dict(), f"{checkpoint_dir}/eval_checkpoint.pt")
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """Measure loss, accuracy and macro F1 of ``model`` on ``val_dataloader``.

    Reads the notebook-level globals ``device`` and ``loss_fn``. The model is
    switched to evaluation mode and left in that mode.

    @param    model: module called as ``model(input_ids, attention_mask)`` and
                  returning logits.
    @param    val_dataloader: iterable of batches with ``input_ids``,
                  ``attention_mask`` and ``labels`` entries.
    @return   (val_loss, val_accuracy): mean loss per sample and accuracy in
                  percent, both computed over all samples.
    @raises   ValueError: if the loader yields no samples.

    The macro F1 score is printed, not returned.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    # Running totals over samples. Averaging per-batch means instead would give
    # a short final batch the same weight as a full one.
    loss_sum = 0.0
    correct = 0
    n_samples = 0

    truth = []
    predicted = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to the selected device
        b_input_ids, b_attn_mask, b_labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)

        # Compute logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # loss_fn returns the batch mean, so weight it by the batch size
        batch_size = b_labels.size(0)
        loss_sum += loss.item() * batch_size
        correct += (preds == b_labels).sum().item()
        n_samples += batch_size

        # Plain Python ints, not 0-dim tensors, for the metric computation
        truth.extend(b_labels.cpu().tolist())
        predicted.extend(preds.cpu().tolist())

    if n_samples == 0:
        raise ValueError("val_dataloader yielded no samples")

    # Compute the average accuracy and loss over the validation set.
    val_loss = loss_sum / n_samples
    val_accuracy = 100.0 * correct / n_samples

    print("F1 macro:", f1_score(truth, predicted, average="macro"))

    return val_loss, val_accuracy

In [None]:
# SMALL DATASET - ATTEMPT 1
# 10 epochs, seed 69, evaluation after every epoch, no gradient accumulation
# (accum_steps=1). train() reads `optimizer` and `scheduler` as globals, so
# they must be assigned under exactly these names.
set_seed(69)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=10)
train(bert_classifier, train_dataloader, val_dataloader, epochs=10, evaluation=True, accum_steps=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------




   1    |   20    |   4.488133   |     -      |     -     |   34.24  
   1    |   40    |   4.387455   |     -      |     -     |   31.11  
   1    |   60    |   4.228094   |     -      |     -     |   31.12  
   1    |   80    |   4.060920   |     -      |     -     |   31.12  
   1    |   100   |   3.909374   |     -      |     -     |   31.12  
   1    |   120   |   3.759154   |     -      |     -     |   31.11  
   1    |   136   |   3.613258   |     -      |     -     |   23.64  
----------------------------------------------------------------------
F1 macro: 0.26044481575597794
   1    |    -    |   4.080021   |  3.492145  |   33.07   |  233.40  
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   3.482789   |     -      |     -     |   33.15  
   2    |   40    |   3.331238   |     -      |     -  

In [None]:
# SMALL DATASET - ATTEMPT 2
# 20 epochs, seed 42, evaluation after every epoch, no gradient accumulation.
# Rebinds `bert_classifier`, `optimizer` and `scheduler` to a fresh model.
set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=20)
train(bert_classifier, train_dataloader, val_dataloader, epochs=20, evaluation=True, accum_steps=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------




   1    |   20    |   4.496846   |     -      |     -     |   34.28  
   1    |   40    |   4.399951   |     -      |     -     |   31.10  
   1    |   60    |   4.227060   |     -      |     -     |   31.10  
   1    |   80    |   4.056752   |     -      |     -     |   31.13  
   1    |   100   |   3.892649   |     -      |     -     |   31.13  
   1    |   120   |   3.719579   |     -      |     -     |   31.12  
   1    |   136   |   3.588303   |     -      |     -     |   23.63  
----------------------------------------------------------------------
F1 macro: 0.3269372811127633
   1    |    -    |   4.071288   |  3.443251  |   40.70   |  233.16  
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   3.437327   |     -      |     -     |   33.13  
   2    |   40    |   3.272891   |     -      |     -   

In [None]:
# Save the weights of whatever model `bert_classifier` currently refers to;
# the path is relative to the working directory.
torch.save(bert_classifier.state_dict(), "drive/MyDrive/checkpoint_last_20epoch.pt")

In [None]:
# 2-epoch run with seed 42, evaluation after every epoch, no gradient
# accumulation. Rebinds `bert_classifier`, `optimizer` and `scheduler`.
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
train(bert_classifier, train_dataloader, val_dataloader, epochs=2, evaluation=True, accum_steps=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------




   1    |   20    |   4.485361   |     -      |     -     |   34.67  
   1    |   40    |   4.384121   |     -      |     -     |   31.13  
   1    |   60    |   4.241343   |     -      |     -     |   31.14  
   1    |   80    |   4.082496   |     -      |     -     |   31.15  
   1    |   100   |   3.935807   |     -      |     -     |   31.15  
   1    |   120   |   3.776402   |     -      |     -     |   31.15  
   1    |   140   |   3.624148   |     -      |     -     |   31.13  
   1    |   160   |   3.462833   |     -      |     -     |   31.13  
   1    |   180   |   3.306221   |     -      |     -     |   31.14  
   1    |   200   |   3.167774   |     -      |     -     |   31.17  
   1    |   220   |   3.021670   |     -      |     -     |   31.13  
   1    |   240   |   2.873523   |     -      |     -     |   31.14  
   1    |   260   |   2.723048   |     -      |     -     |   31.14  
   1    |   280   |   2.571031   |     -      |     -     |   31.14  
   1    |   300   | 

In [None]:
# Save the weights of whatever model `bert_classifier` currently refers to;
# the path is relative to the working directory.
torch.save(bert_classifier.state_dict(), "drive/MyDrive/checkpoint_last_2epoch.pt")

In [None]:
# Rebuild the training loader (same settings as before) and create the loader
# over `eval_data` used for the final predictions (2000 records per batch,
# not shuffled, so predictions stay aligned with the rows of `eval_data`).
train_dataloader = MyDataset(train_data).create_dataloader(batch_size=600, shuffle=True, num_workers=1)
eval_dataloader = MyDataset(eval_data).create_dataloader(batch_size=2000, num_workers=1)
import torch.nn.functional as F
from tqdm import tqdm

def bert_predict(model, test_dataloader):
    """Run the trained model over ``test_dataloader`` and return class probabilities.

    Reads the notebook-level global ``device``. The model is switched to
    evaluation mode and left in that mode.

    @param    model: module called as ``model(input_ids, attention_mask)`` and
                  returning logits.
    @param    test_dataloader: iterable of batches with ``input_ids`` and
                  ``attention_mask`` entries (a ``labels`` entry is ignored).
    @return   probs (np.ndarray): softmax over dim 1 of the logits, with the
                  batches concatenated along dim 0 in loader order.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    all_probs = []

    # For each batch in our test set...
    for batch in tqdm(test_dataloader):
        # Only the model inputs are moved to the device; labels are not needed
        # for prediction.
        b_input_ids = batch["input_ids"].to(device)
        b_attn_mask = batch["attention_mask"].to(device)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Softmax is row-wise, so it can be applied per batch. Moving each
        # result to the CPU right away keeps device memory bounded by one
        # batch instead of holding the whole result set on the device.
        all_probs.append(F.softmax(logits, dim=1).cpu())

    # Concatenate probabilities from each batch
    probs = torch.cat(all_probs, dim=0).numpy()

    return probs



In [None]:
# Rebuild the model, load the 20-epoch checkpoint and predict on eval_data.
# The device falls back to the CPU when no GPU is present, and map_location
# makes torch.load place the saved tensors on that device; a checkpoint saved
# from a GPU run cannot otherwise be loaded on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# initialize_model also returns an optimizer and a scheduler; they are not
# needed for inference.
bert_classifier, o, s = initialize_model(epochs=20)
bert_classifier.load_state_dict(torch.load("drive/MyDrive/checkpoint_last_20epoch.pt", map_location=device))
bert_classifier.to(device)
probs = bert_predict(bert_classifier, eval_dataloader)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2953/2953 [1:22:12<00:00,  1.67s/it]


In [None]:
# Predicted class per record: index of the largest probability in each row.
preds = torch.argmax(torch.Tensor(probs), dim=1).flatten()

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, balanced_accuracy_score, zero_one_loss
# Encoded labels of the evaluation frame, then overall accuracy.
y_val = eval_data['style_name']
accuracy_score(y_val, preds)

tensor([48,  5, 72,  ..., 89, 72, 19])


0.7407493268366002

In [None]:
# Macro-averaged F1: unweighted mean of the per-class F1 scores.
f1_score(y_val, preds.cpu(), average='macro')

0.69004770370113

In [None]:
# Macro-averaged precision: unweighted mean of the per-class precision.
precision_score(y_val, preds.cpu(), average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


0.7257305957413948

In [None]:
# Macro-averaged recall: unweighted mean of the per-class recall.
recall_score(y_val, preds.cpu(), average='macro')

0.7067242121336447