In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Code mixed from https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f and https://www.tensorflow.org/text/tutorials/classify_text_with_bert#define_your_model 

# Install Dependencies

In [2]:
!pip install transformers

from transformers import BertTokenizer
import torch
from torch import nn
from transformers import BertModel

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [3]:
# Get independent texts
my_file = open('/content/drive/MyDrive/oxml2023mlcases-esg-classifier/oxml_esg_texts.txt', "r")
data = my_file.read()

texts = data.split("unique_linebreak \n")
my_file.close()

# Get independent labels and encoding
df = pd.read_csv('/content/drive/MyDrive/oxml2023mlcases-esg-classifier/data/labels.csv')

labels = {
    'other': 0, 
    'environmental': 1, 
    'social': 2, 
    'governance': 3
}

# Add text column
df['text'] = texts[:-1]

In [4]:
df.head()

Unnamed: 0,id,class,text
0,report_519.pdf.53,governance,2021 ESG Impact Report ...
1,report_1537.pdf.24,social,24 2021 SUSTAINABILITY REPORT TABLE OF CONTENT...
2,report_567.pdf.33,governance,Sustainability Governance Clean Harbors’ commi...
3,report_1830.pdf.220,other,220 Report of the réviseur d’entreprises agréé...
4,report_1253.pdf.46,governance,Pfizer 2021 ESG Report 46 Governance Governanc...


# Build Dataset and Tokenize Text

In [5]:
np.random.seed(100)

In [6]:
# Probably change the model to bert-large-cased?
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

We can play around with encoding the whole text or just paragraphs at a time (aggregating/maxxing out over the per-paragraph predictions). 

In [7]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[l] for l in df['class']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y # -> a batch of tokenized texts and the corresponding labels

In [8]:
#Split the data into train and test
df_train, df_val = train_test_split(df, test_size=0.2)

print(len(df_train), len(df_val))

1564 392


# Build A BERT Classification Model
This is just layering a linear classifier on top of BERT, so we can grab its embedded class token and pass that through the classifier. 

In [9]:
# Alternative classifier idea
from transformers import BertForSequenceClassification, AdamW, BertConfig

class BertClassifier(nn.Module):
  def __init__(self, dropout=0.5):

    super(BertClassifier, self).__init__()

    self.bert = BertForSequenceClassification.from_pretrained('bert-large-cased') # -> again, we just need to change this to large model
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, 4) # -> input is a pooled 768-dim class embedding vector from transformer, and output is 4 classes
    self.relu = nn.ReLU()

  def forward(self, input_id, mask):
    # _ contains embedding vectors for all tokens in a sequence, and out contains the embedding vector of the class token for that sequence
    _, out = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False) 
    
    # the embedding vector for the class token gets passed through these layers for classification
    out = self.dropout(out)
    out = self.linear(out)
    out = self.relu(out)

    return out

In [25]:
# Original classifier idea 
class BertClassifier(nn.Module):
  def __init__(self, dropout=0.5):

    super(BertClassifier, self).__init__()

    self.bert = BertModel.from_pretrained('bert-base-cased') # -> again, we just need to change this to large model
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, 4) # -> input is a pooled 768-dim class embedding vector from transformer, and output is 4 classes
    self.relu = nn.ReLU()

  def forward(self, input_id, mask):
    # _ contains embedding vectors for all tokens in a sequence, and out contains the embedding vector of the class token for that sequence
    _, out = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False) 
    
    # the embedding vector for the class token gets passed through these layers for classification
    out = self.dropout(out)
    out = self.linear(out)
    out = self.relu(out)

    return out

# Train the Classification Model

In [22]:
from torch.optim import Adam
from tqdm import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR

In [27]:
def train(model, train_data, val_data, learning_rate, epochs):
  
  # set up datasets
  train, val = Dataset(train_data), Dataset(val_data)

  # load the datasets
  train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
  val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

  # try for a GPU
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  # define loss and optimizer
  loss = nn.CrossEntropyLoss()
  optimizer = Adam(model.parameters(), lr=learning_rate)
  #scheduler = CosineAnnealingLR(optimizer,T_max = 32, eta_min = 1e-6) 
  
  if use_cuda:
    model = model.cuda()
    loss = loss.cuda()

  for epoch in range(epochs): 

    train_acc = 0
    train_loss = 0

    for train_input, train_label in tqdm(train_dataloader):
      
      # pass this stuff to the GPU
      train_label = train_label.to(device)
      mask = train_input['attention_mask'].to(device)
      input_id = train_input['input_ids'].squeeze(1).to(device)

      # feed data to model
      output = model(input_id, mask)

      # calculate loss
      batch_loss = loss(output, train_label.long())
      train_loss += batch_loss.item()

      # calculate accuracy -> likeliest label correct?
      acc = (output.argmax(dim=1) == train_label).sum().item()
      train_acc += acc

      model.zero_grad()
      batch_loss.backward()
      optimizer.step()

    # lr scheduler
    #prior_lr = optimizer.param_groups[0]["lr"]
    #scheduler.step()
    #later_lr = optimizer.param_groups[0]["lr"]
    #print("Epoch %d: SGD lr %.4f -> %.4f" % (epoch, prior_lr, later_lr))

    val_acc = 0
    val_loss = 0

    # proper backprop for validation mode
    with torch.no_grad():

      for val_input, val_label in val_dataloader:

        val_label = val_label.to(device)
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = loss(output, val_label.long())
        val_loss += batch_loss.item()

        acc = (output.argmax(dim=1) == val_label).sum().item()
        val_acc += acc

    print(
    f'Epochs: {epoch + 1} | Train Loss: {train_loss / len(train_data): .3f} \
    | Train Accuracy: {train_acc / len(train_data): .3f} \
    | Val Loss: {val_loss / len(val_data): .3f} \
    | Val Accuracy: {val_acc / len(val_data): .3f}')

In [28]:
model = BertClassifier()

epochs = 5
batch_size = 2
learning_rate = 1e-6
              
train(model, df_train, df_val, learning_rate, epochs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 782/782 [00:58<00:00, 13.47it/s]


Epochs: 1 | Train Loss:  0.612     | Train Accuracy:  0.464     | Val Loss:  0.550     | Val Accuracy:  0.548


100%|██████████| 782/782 [00:58<00:00, 13.47it/s]


Epochs: 2 | Train Loss:  0.443     | Train Accuracy:  0.651     | Val Loss:  0.397     | Val Accuracy:  0.719


100%|██████████| 782/782 [00:58<00:00, 13.44it/s]


Epochs: 3 | Train Loss:  0.298     | Train Accuracy:  0.829     | Val Loss:  0.261     | Val Accuracy:  0.865


100%|██████████| 782/782 [00:58<00:00, 13.46it/s]


Epochs: 4 | Train Loss:  0.197     | Train Accuracy:  0.916     | Val Loss:  0.207     | Val Accuracy:  0.890


100%|██████████| 782/782 [00:58<00:00, 13.43it/s]


Epochs: 5 | Train Loss:  0.140     | Train Accuracy:  0.948     | Val Loss:  0.161     | Val Accuracy:  0.941


# Evaluate on Test Data

In [None]:
def evaluate(model, test_data):

  test = Dataset(test_data)

  test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  if use_cuda: 
    model = model.cuda()

  test_acc = 0

  with torch.no_grad():
    
    for test_input, test_label in test_dataloader:
      test_label = test_label.to(device)
      mask = test_input['attention_mask'].to(device)
      input_id = test_input['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)

      acc = (output.argmax(dim=1) == test_label).sum().item()
      total_acc_test += acc

  print(f'Test Accuracy: {test_acc / len(test_data): .3f}')

In [None]:
# Run eval function 
evaluate(model, df_test)