In [32]:
# from google.colab import drive
# drive.mount('/content/drive')

Code mixed from https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f and https://www.tensorflow.org/text/tutorials/classify_text_with_bert#define_your_model

# Install Dependencies

In [33]:
#!pip install transformers

from transformers import BertTokenizer
import torch
from torch import nn
from transformers import BertModel

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [34]:
np.random.seed(123)

In [35]:
# Turn texts to strings of tokens
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

# Build A Dataset: OxML Data and Environmental Data

We can play around with encoding the whole text or just paragraphs at a time (aggregating/maxxing out over the per-paragraph predictions). The code for getting the env_lis_2 data is in the Sweep version of this notebook.

In [36]:
# Get OxML texts
my_file = open('oxml_esg_texts.txt', "r")
data = my_file.read()
texts = data.split("unique_linebreak \n")
my_file.close()

# Get ESG labels
df = pd.read_csv('oxml2023mlcases-esg-classifier/data/labels.csv')
print("length of OG train")
print(len(df))

labels = {
    'governance': 0,
    'social': 1,
    'environmental': 2,
    'other': 3
}

# Add text column to label dataframe
df['text'] = texts[:-1]

''' -------------------------------------------------------------------- '''



''' -------------------------------------------------------------------- '''

# Get extra environmental texts
my_file = open('Data/HF_Chatbot_Environmental_6.15.23.txt', "r")
data = my_file.read()
env_texts = data.split("\n")
my_file.close()

# Create environmental labels for extra texts
env_labels = ['environmental'] * len(env_texts)

# Add extra environmental texts to working dataframe
for i, j in zip(env_texts, env_labels):
    df = df.append({'text': i, 'class': j}, ignore_index=True)

''' -------------------------------------------------------------------- '''

# Get extra social texts
my_file = open('Data/HF_Chatbot_Social_6.14.23.txt', "r")
data = my_file.read()
social_texts = data.split("\n")
my_file.close()

# Create social labels for extra texts
social_labels = ['social'] * len(social_texts)

# Add extra social texts to working dataframe
for i, j in zip(social_texts, social_labels):
    df = df.append({'text': i, 'class': j}, ignore_index=True)

''' -------------------------------------------------------------------- '''

# Get extra other texts
my_file = open('Data/HF_Chatbot_Other.txt', "r")
data = my_file.read()
other_texts = data.split("\n")
my_file.close()

# Create social labels for extra texts
other_labels = ['other'] * len(other_texts)

# Add extra social texts to working dataframe
for i, j in zip(other_texts, other_labels):
    df = df.append({'text': i, 'class': j}, ignore_index=True)

''' -------------------------------------------------------------------- '''

# Get extra governance texts
my_file = open('Data/HF_Chatbot_Governance.txt', "r")
data = my_file.read()
gov_texts = data.split("\n")
my_file.close()

# Create social labels for extra texts
gov_labels = ['governance'] * len(gov_texts)

# Add extra social texts to working dataframe
for i, j in zip(gov_texts, gov_labels):
    df = df.append({'text': i, 'class': j}, ignore_index=True)
print("new length")
print(len(df))

length of OG train
1956
new length
2472


In [37]:
#training cleanup

In [38]:
df['text'] = df['text'].replace(r'\n','', regex=True) 

In [39]:
df[2000:2020]

Unnamed: 0,id,class,text
2000,,environmental,"On our quest for excellence, we engage experie..."
2001,,environmental,"Anticipating future regulations, we continuous..."
2002,,environmental,Investing in emerging technologies enables us ...
2003,,environmental,Sharing transparent information regarding expe...
2004,,environmental,Dedicated R&D programs drive incremental impro...
2005,,environmental,Collaborative initiatives like exchanging best...
2006,,environmental,Regular independent verification of emission p...
2007,,environmental,
2008,,environmental,Mining plays a crucial role in the world econo...
2009,,environmental,"However, mining activities have significant im..."


In [40]:
print(len(df))

2472


In [41]:
#drop empties
df['text'].replace('', np.nan, inplace=True)
df.dropna(subset=['text'], inplace=True)

In [42]:
print(len(df))

2415


In [43]:
print("The dataframe has {} columns".format(len(df)))

The dataframe has 2415 columns


In [44]:
df.to_csv('training_clean.csv')

In [None]:
#import new training data


In [45]:
# could change max length back to 512

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[l] for l in df['class']]
        self.texts = [tokenizer(text,
                               padding='max_length', max_length = 64, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y # -> a batch of tokenized texts and the corresponding labels

In [46]:
#remove stop words from content
from nltk.corpus import stopwords

final_stopwords_list = stopwords.words('english') + stopwords.words('french')
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (final_stopwords_list)]))


In [47]:
df.head(1)

Unnamed: 0,id,class,text
0,report_519.pdf.53,governance,"2021 ESG Impact Report Data privacy, security,..."


In [48]:
#rename columns 
# df = df[['id','class','content_without_stopwords']]
# df = df.rename(columns={'content_without_stopwords': 'text'})


In [49]:
#Split the data into train and test
df_train, df_val = train_test_split(df, test_size=0.25, shuffle=True)

print(len(df_train), len(df_val))

1811 604


In [25]:
# from sklearn import preprocessing
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split

In [26]:
# # Use TF-IDF to vectorize each document
# vectorizer = TfidfVectorizer(max_df=0.8,
#   max_features=200000,
#   min_df=0.2,
#   stop_words=final_stopwords_list,
#   use_idf=True,
#   #tokenizer=tokenize_and_stem,
#   ngram_range=(1,3))
# df_train = vectorizer.fit_transform([x for x in df_train])
# df_val = vectorizer.transform([x for x in df_val])

# Build A BERT Classification Model
This is just layering a linear classifier on top of BERT, so we can grab its embedded class token and pass that through the classifier.

In [50]:
# Original classifier idea
class BertClassifier(nn.Module):
  def __init__(self, dropout=0.5):

    super(BertClassifier, self).__init__()

    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, 4) # -> input is a pooled 768-dim class embedding vector from transformer, and output is 4 classes
    self.relu = nn.ReLU()

  def forward(self, input_id, mask):
    # _ contains embedding vectors for all tokens in a sequence, and out contains the embedding vector of the class token for that sequence
    _, out = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)

    # the embedding vector for the class token gets passed through these layers for classification
    out = self.dropout(out)
    out = self.linear(out)
    out = self.relu(out)

    return out

# Train the Classification Model

In [51]:
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import f1_score

In [52]:
def get_f1(labels, preds):
  avg = []

  for i, j in zip(labels, preds):
    f1 = f1_score(i, j, labels=np.unique(j), average='macro')
    avg.append(f1)

  total_f1 = sum(avg)/len(avg)

  return total_f1

In [53]:
def train(model, train_data, val_data, learning_rate, epochs):

  # set up datasets
  train, val = Dataset(train_data), Dataset(val_data)

  # load the datasets
  train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
  val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

  # try for a GPU
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  # define loss and optimizer
  loss = nn.CrossEntropyLoss()
  optimizer = Adam(model.parameters(), lr=learning_rate)

  avg_train_f1 = []
  avg_val_f1 = []

  if use_cuda:
    model = model.cuda()
    loss = loss.cuda()

  for epoch in range(epochs):

    train_outputs = []
    train_labels = []
    val_outputs = []
    val_labels = []

    train_acc = 0
    train_loss = 0

    for train_input, train_label in tqdm(train_dataloader):

      # pass this stuff to the GPU
      train_label = train_label.to(device)
      train_labels.append(train_label.cpu().numpy())
      mask = train_input['attention_mask'].to(device)
      input_id = train_input['input_ids'].squeeze(1).to(device)

      # feed data to model
      output = model(input_id, mask)
      train_outputs.append(output.argmax(dim=1).cpu().numpy())

      # calculate loss
      batch_loss = loss(output, train_label.long())
      train_loss += batch_loss.item()

      # calculate accuracy -> likeliest label correct?
      acc = (output.argmax(dim=1) == train_label).sum().item()
      train_acc += acc

      model.zero_grad()
      batch_loss.backward()
      optimizer.step()

    val_acc = 0
    val_loss = 0

    # proper backprop for validation mode
    with torch.no_grad():

      for val_input, val_label in val_dataloader:

        val_label = val_label.to(device)
        val_labels.append(val_label.cpu().numpy())
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)
        val_outputs.append(output.argmax(dim=1).cpu().numpy())

        batch_loss = loss(output, val_label.long())
        val_loss += batch_loss.item()

        acc = (output.argmax(dim=1) == val_label).sum().item()
        val_acc += acc

    avg_train_f1.append(get_f1(train_labels, train_outputs))
    avg_val_f1.append(get_f1(val_labels, val_outputs))

  print(
  f'Epochs: {epoch + 1} | Train Loss: {train_loss / len(train_data): .3f} \
  | Train Accuracy: {train_acc / len(train_data): .3f} \
  | Train F1: {avg_train_f1[-1]: .3f} \
  | Val Loss: {val_loss / len(val_data): .3f} \
  | Val Accuracy: {val_acc / len(val_data): .3f} \
  | Val F1: {avg_val_f1[-1]: .3f}')

In [54]:
model = BertClassifier()

epochs = 10
batch_size = 4
learning_rate = 1e-5

train(model, df_train, df_val, learning_rate, epochs)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████| 453/453 [09:15<00:00,  1.23s/it]
100%|█████████████████████████████████████████| 453/453 [08:34<00:00,  1.14s/it]
100%|████████████████████████

Epochs: 10 | Train Loss:  0.020   | Train Accuracy:  0.972   | Train F1:  0.970   | Val Loss:  0.123   | Val Accuracy:  0.874   | Val F1:  0.880


In [55]:
##Save a model
PATH = 'esg_transformer_BL3.pth'
torch.save(model, PATH)

#Save a model's parameters
PATH = 'esg_transformer_BL_params3.pth'
torch.save(model.state_dict(), PATH)

# Inference

In [56]:
#!pip install PyMuPDF

from pathlib import Path
import re
import fitz
import pandas as pd
from PIL import Image
423Bml
import torch

#device = torch.device("cuda"if torch.cuda.is_available() else"cpu")



In [57]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [58]:
# directories & files
import os
from pathlib import Path 

DIR_DATA = Path("oxml2023mlcases-esg-classifier/data/")
REPORTS_DIR = "reports/"
LABELS_FILE = "labels.csv"

# columns
C_ID, C_CLASS = "id", "class"

In [59]:
submission = pd.read_csv("oxml2023mlcases-esg-classifier/sample_submission.csv")

In [60]:
def create_filepath(filename):
    return DIR_DATA / REPORTS_DIR / filename

def read_page(filename, page_number):
    filepath = create_filepath(filename)
    doc = fitz.open(filepath)
    page_index = page_number - 1
    page = doc.load_page(page_index)
    return page.get_text()


def visualize_pdf_image(filename, page_number):
    doc = fitz.open(create_filepath(filename))
    page_index = page_number - 1
    pix = doc[page_index].get_pixmap()
    img_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img_page

def get_labels():
    return pd.read_csv(DIR_DATA / LABELS_FILE)

def build_doc_id(filename, page_number):
    return filename + "." + str(page_number)

In [61]:
# visualize a page and it's class

filename = "report_1132.pdf"
page_number = 9

labels = get_labels()
doc_id = build_doc_id(filename, page_number)
label = labels.loc[labels[C_ID] == doc_id, C_CLASS].values[0]
print(f"{C_CLASS}: {label}")
#visualize_pdf_image(filename, page_number)

class: environmental


In [62]:
#Load a previously saved model

PATH = 'esg_transformer_BL3.pth'
model = torch.load(PATH)

model.eval() # -> sets the model to evaluation mode (turns off batch norm and dropout layers, which are only used for training)

#Load previously saved model parameters
PATH = 'esg_transformer_BL_params3.pth'
#-> need to define a model first
#model = BigramLanguageModel(*args, **kwargs) 
model.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [63]:
import re
import fitz
for i in range(len(submission)):
#     print(i)
    path = submission.iloc[i][C_ID]
#     print(path)
    matches = re.match(r'^(.+)\.(\d+)$', path)
    filename = matches.group(1)
    page_number = int(matches.group(2))

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    content = read_page(filename, page_number)
    input_text = tokenizer(content, padding='max_length', max_length = 64, truncation=True, return_tensors="pt")

    if use_cuda:
      model = model.cuda()

    with torch.no_grad():

      mask = input_text['attention_mask'].to(device)
      input_id = input_text['input_ids'].squeeze(1).to(device)

      output = model(input_id, mask)

      pred = output.argmax(dim=1)

      submission.iloc[i]['class'] = int(pred.detach())

In [64]:
label_mapping = {0: 'governance', 1: 'social', 2: 'environmental', 3: 'other'}
submission['class'] = submission['class'].map(label_mapping)

In [65]:
submission.head(20)

Unnamed: 0,id,class
0,report_1352.pdf.49,environmental
1,report_1835.pdf.143,other
2,report_1352.pdf.46,environmental
3,report_1179.pdf.47,governance
4,report_607.pdf.12,environmental
5,report_1179.pdf.53,environmental
6,report_607.pdf.7,social
7,report_1012.pdf.70,environmental
8,report_1179.pdf.38,social
9,report_576.pdf.11,social


In [59]:
visualize_pdf_image('report_1835.pdf', 7)

In [66]:
# Converting submission to CSV

# submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.to_csv('BERT_new_training_submission.csv', index=False)

In [69]:
#xgboost
df.head(1)

Unnamed: 0,id,class,text
0,report_519.pdf.53,governance,"2021 ESG Impact Report Data privacy, security,..."


In [71]:
# Use TF-IDF to vectorize each document
vectorizer = TfidfVectorizer(max_df=0.8,
  max_features=200000,
  min_df=0.2,
  stop_words=final_stopwords_list,
  use_idf=True,
  #tokenizer=tokenize_and_stem,
  ngram_range=(1,3))
X_train_features = vectorizer.fit_transform([x for x in X_train])
X_test_features = vectorizer.transform([x for x in X_test])

In [74]:
# Encode the label classes
label_encoder = preprocessing.LabelEncoder()
df['label'] = label_encoder.fit_transform(df['class'])
print(label_encoder.classes_)

['environmental' 'governance' 'other' 'social']


In [75]:
# Split Train/Test set with 80:20
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=.2, shuffle=True)

In [95]:
# fit model no training data
#model = XGBClassifier()
import xgboost as xgb
xgb_cl3 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=4,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'multi:softmax',
    num_class=4,
    nthread=4,
    scale_pos_wieght=1,
    seed=123)

xgb_cl3.fit(X_train_features, y_train)
print(xgb_cl3)


Parameters: { "scale_pos_wieght" } are not used.

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, nthread=4, num_class=4,
              num_parallel_tree=None, ...)


In [96]:
# make predictions for test data
y_pred = xgb_cl3.predict(X_test_features)
predictions = [round(value) for value in y_pred]

In [97]:
from sklearn.metrics import accuracy_score, f1_score

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'accuracy: {acc}')
print(f'f1 score: {f1}')

accuracy: 0.2898550724637681
f1 score: 0.18972235466154025


In [81]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 27.95%


In [82]:
test_df = pd.read_csv('oxml2023mlcases-esg-classifier/sample_submission.csv')

In [84]:
def get_pdf(path):
    return pdfium.PdfDocument(path)

def get_content(page):
    textpage = page.get_textpage()
    return textpage.get_text_range()

def render(page):
    bitmap = page.render(
        scale = 1,    # 72dpi resolution
        rotation = 0, # no additional rotation
    )
    return bitmap.to_pil()

def extract_content_from_id(file_id: str) -> str :    
    # extract filename and page
    items = file_id.split('.')
    filename = '.'.join(items[:2])
    page_num = int(items[-1])-1
    
    # load pdf, select page, and extract its content
    filepath = os.path.join(report_path, filename)
    pdf = get_pdf(filepath)
    page = pdf[page_num]
    content = get_content(page)

    content = " ".join(content.lower().split())
    return content

In [85]:
# Read PDF file based on file_id
test_contents = test_df['id'].map(lambda file_id: extract_content_from_id(file_id))