# Setup

## Load and install packages

In [1]:
# !pip install PyMuPDF
# !pip install -U nltk
# !pip install ocrmypdf -U
# !brew install tesseract
# !pip install pytesseract
# !pip install pngquant
# !pip install pyarrow 
# !pip install fastparquet
# !brew install cmake
# !brew install pkg-config
# !pip install transformers
# !pip install datasets
# !pip install einops


In [2]:
# %%capture
# !pip install bertopic

In [3]:
from tqdm.auto import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd 
from icecream import ic
import fitz 
from PIL import Image
import re
import pngquant
from unidecode import unidecode 
from datasets import Dataset
import torch

from transformers import AutoTokenizer, AutoModel
from transformers import pipeline
import datasets
from datasets import Dataset, DatasetDict

from transformers import logging

logging.set_verbosity_error()

# from bertopic import BERTopic

%matplotlib inline

## PDF Helper Functions

In [4]:
def get_pdf(path):
    """Open the PDF located at ``path`` with PyMuPDF (``fitz``).

    Returns whatever ``fitz.open`` returns for that path. The document is
    handed back open; this function does not close it.
    """
    return fitz.open(path)

def render(page):
    """Rasterise ``page`` and return the result of the bitmap's ``to_pil()``.

    ``page`` must expose ``render(scale=..., rotation=...)`` whose result has
    a ``to_pil()`` method.
    NOTE(review): that interface looks like pypdfium2 rather than a PyMuPDF
    page -- confirm which library's page objects are expected here.
    """
    render_options = {
        'scale': 1,     # 72dpi resolution
        'rotation': 0,  # no additional rotation
    }
    return page.render(**render_options).to_pil()

# cleanup content using regex:
def clean_characters(text, drop_short_words=False):
    '''Transliterate ``text`` to ASCII and blank out unwanted characters.

    Every run of characters outside ``a-zA-Z0-9.£$€'`` and newline is
    replaced by a single space.

    Args:
        text: raw text extracted from a PDF page.
        drop_short_words: when True, additionally drop one-character words and
            collapse all whitespace (including newlines) to single spaces.
            The original code computed this filtered variant but discarded it
            and returned the unfiltered text; the default keeps that returned
            value so existing callers and cached data are unaffected.

    Returns:
        The cleaned string.
    '''
    text = unidecode(text) # convert to ascii
    text = re.sub(r"[^a-zA-Z0-9.£$€'\n]+", ' ', text) # strategy 1: keep numbers
    # text = re.sub(r"[^a-zA-Z:.£$€'\n]+", ' ', text) ## strategy 2: remove numbers and %
    if drop_short_words:
        return ' '.join(w for w in text.split() if len(w) > 1)
    return text

def extract_content_from_id(file_id: str) -> str:
    '''Return the cleaned text of the report page identified by ``file_id``.

    Args:
        file_id: ``"<report file name>.<1-based page number>"``; the report is
            looked up inside the notebook-level ``report_path`` folder.

    Returns:
        The page text after ``clean_characters``.

    Raises:
        ValueError: if ``file_id`` has no ``.<page number>`` suffix or the
            suffix is not an integer.
    '''
    # split on the LAST dot only, so report names containing extra dots survive
    filename, page_label = file_id.rsplit('.', 1)
    page_num = int(page_label) - 1  # ids are 1-based, PyMuPDF pages are 0-based

    # load pdf, select page, and extract its content
    filepath = os.path.join(report_path, filename) #path to the report
    doc = get_pdf(filepath) # load the pdf
    try:
        page = doc.load_page(page_num) # select the page from the pdf
        content = page.get_text("text", sort = True, flags=fitz.TEXT_INHIBIT_SPACES) # extract the text from the page
    finally:
        doc.close()  # release the file handle even if extraction fails
    return clean_characters(content) # replace unrecognized characters

def render_content_from_id(file_id: str, dpi: int = 150) -> "Image.Image":
    '''Render the report page identified by ``file_id`` to a PIL image.

    Args:
        file_id: ``"<report file name>.<1-based page number>"``; the report is
            looked up inside the notebook-level ``report_path`` folder.
        dpi: target resolution; the zoom factor is ``dpi / 72``. Defaults to
            the previously hard-coded 150.

    Returns:
        An RGB ``PIL.Image.Image`` of the page (the function was previously
        annotated as returning ``str``, which it never did).

    Raises:
        ValueError: if ``file_id`` has no ``.<page number>`` suffix or the
            suffix is not an integer.
    '''
    # split on the LAST dot only, so report names containing extra dots survive
    filename, page_label = file_id.rsplit('.', 1)
    page_num = int(page_label) - 1  # ids are 1-based, PyMuPDF pages are 0-based

    filepath = os.path.join(report_path, filename) #path to the report
    doc = get_pdf(filepath) # load the report pdf
    try:
        mat = fitz.Matrix(dpi / 72, dpi / 72)  # sets zoom factor
        pix = doc[page_num].get_pixmap(matrix=mat)
        # build the image before closing the document
        img_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        doc.close()  # release the file handle even if rendering fails
    return img_page

## Set path for folder

- choose one of the following blocks: online if run directly on Kaggle, local if data downloaded

In [6]:
# local data path using pathlib

# Root folder of the local copy of the Kaggle data. Set the OXML_BASEPATH
# environment variable to point somewhere else without editing the notebook;
# the fallback is the original author's location.
basepath = os.environ.get(
    'OXML_BASEPATH',
    '/Users/mbp14/Desktop/GoogleDrive/OxML_2023/code/Kaggle',
)

def get_path(folder_name: list, file_name=None):
    '''Build a path below ``basepath``.

    Args:
        folder_name: sequence of nested folder names; a single string or
            path-like is treated as one folder (it used to be iterated
            character by character).
        file_name: optional file name appended after the folders.

    Returns:
        ``pathlib.Path`` of ``basepath / folder... [/ file_name]``.
    '''
    if isinstance(folder_name, (str, os.PathLike)):
        folder_name = [folder_name]
    path = Path(basepath).joinpath(*folder_name)
    if file_name:
        path = path / file_name
    return path

# report path
report_path = get_path(["data", "reports"])

# label path
label_path = get_path(["data"], "labels.csv")

In [7]:

# show the resolved locations so a wrong base path is spotted early
print(f"Report path:  {report_path}")
print(f"Label path:  {label_path}")

Report path:  /Users/mbp14/Desktop/GoogleDrive/OxML_2023/code/Kaggle/data/reports
Label path:  /Users/mbp14/Desktop/GoogleDrive/OxML_2023/code/Kaggle/data/labels.csv


# Load data

In [8]:
# # read label file to get id and class
# df = pd.read_csv(label_path)

# # check duplicates id
# print("Number of duplicated id: ", df.duplicated(subset=['id']).sum())

# # slice duplicated id, all entries with duplicated id
# df_duplicate = df[df.duplicated(subset=['id'], keep=False)].sort_values(by=['id'])
# df_duplicate # duplicates have conflicting labels, due to nature of the pages

# # drop the duplicates (keep the first entry)
# df = df.drop_duplicates(subset=['id'], keep='first').sort_values(by=['id'])

In [9]:

# # extract content from each page
# contents = []

# for idx, row in tqdm(df.iterrows(), total=len(df)):
#     # get file id
#     file_id = row['id']
#     # extract content from the page identified by file_id
#     content = extract_content_from_id(file_id)
#     contents.append(content)
    
# df['content'] = contents

In [10]:

# # create a new column with the report name
# df['report_name'] = df['id'].apply(lambda x: x.split('.')[0])

# # create a new column with the page number
# df['page_num'] = df['id'].apply(lambda x: x.split('.')[-1])

# # reorder columns
# df = df[['id','report_name', 'page_num', 'class', 'content']]


In [11]:
# # dictionary mapping class label to numeric
# map_label = {'other': 0, 'environmental': 1, 'social': 2, 'governance': 3}

# # use the dictionary to replace the class label
# df['target'] = df['class'].map(map_label)

# # save df to parquet
# df.to_parquet(get_path(["process"], "data_v2.parquet"))

In [12]:
# reload df from parquet
data_file = get_path(["process"], "data_v2.parquet")
if not data_file.exists():
    # fail with an actionable message instead of a bare pandas/pyarrow error
    raise FileNotFoundError(
        f"{data_file} not found: run the (commented-out) preprocessing cells "
        "above once to build it from labels.csv and the report PDFs, or fix basepath."
    )
df = pd.read_parquet(data_file)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/mbp14/Desktop/GoogleDrive/OxML_2023/code/Kaggle/process/data_v2.parquet'

# Feature extraction test

- test tokenizer
- test embedding

In [None]:
# make a sample of the df to test the model
# - capped at len(df) so small frames do not raise
# - fixed random_state so dataset[0] / dataset[3] below are the same rows on every run
df_sample = df.sample(n=min(100, len(df)), random_state=42)

## Unit test

In [None]:
# Hugging Face Hub checkpoint ids of the candidate encoders
(
    finbert_esg,
    sentence_roberta,
    fin_roberta,
    bart,
) = (
    'yiyanghkust/finbert-esg',
    'sentence-transformers/all-distilroberta-v1',
    'soleimanian/financial-roberta-large-sentiment',
    'facebook/bart-large',
)

In [None]:
# Convert the sampled frame to a Hugging Face Dataset.
# preserve_index=False keeps the shuffled pandas index from being stored as
# an extra '__index_level_0__' column.
dataset = Dataset.from_pandas(df_sample, preserve_index=False)
print(dataset)

In [None]:
# Load the tokenizer and the bare encoder (AutoModel) for the FinBERT-ESG
# checkpoint; both names are reused by the following cells.
tokenizer = AutoTokenizer.from_pretrained(finbert_esg)
model = AutoModel.from_pretrained(finbert_esg)

# Take the first sampled page as the test input and show it with its label
test_content = dataset[0]['content']
print(f'content: {test_content} *** label: {dataset[0]["class"]}')

# Tokenize the page: truncate / pad to exactly 512 tokens, return PyTorch tensors
tokens = tokenizer(test_content, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

# Names of the tensors returned by the tokenizer (keys, not sizes)
tokens.keys()

In [None]:
# encode the tokenized sentence with the model
# inference only: no_grad skips building the autograd graph, matching the
# other extraction cells in this notebook
with torch.no_grad():
    outputs = model(**tokens)

# content of the output
print(outputs.last_hidden_state.shape)

# check output keys
print(outputs.keys())

# last hidden state (displayed as the cell result)
outputs.last_hidden_state

# # extract the logits as array
# logits = outputs.logits.detach().numpy()

# # check value of logits: only valid for classification models
# logits

In [None]:
# # extractor pipeline
# extract = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

# # test the pipeline
# feature = extract(test_content, truncation=True, padding = 'max_length', return_tensors='pt', max_length=512)

# # Warnings: some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
# # because no fine-tuning yet

# # check size and shape of the feature
# print(feature.shape)
# feature

# extracted feature depends on the type of model (classifier), not just the task
# some models return logits

### Test example

In [None]:
%%script echo skip

from transformers import AutoTokenizer, AutoModel
import torch

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# model name
model_name = bart

#Sentences we want sentence embeddings for
sentences = test_content

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True )

#Tokenize sentences
encoded_input = tokenizer(sentences, padding="max_length", truncation=True, max_length=512, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input, output_hidden_states=True)


# #Perform pooling. In this case, mean pooling
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
%%script echo skip

# check model output
print(model_output.keys())

# check shape of the model output
hidden = model_output[0]

print('size of hidden layers:', len(hidden))

# convert hidden tuple to array and check shape: tuple (nubmer_layers, batch_size, sequence_length, hidden_size)
hidden_array = torch.stack(hidden)
print(hidden_array.shape)

# extract embeddings from hidden layer: last hidden state = last layer from hidden
emb_last_hidden_state = model_output['last_hidden_state']
emb_last_layer = model_output['hidden_states'][-1] # not always work

# check whether the last hidden state is the same as the last layer
emb_last_layer == emb_last_hidden_state

# check shape of the embeddings
print(emb_last_hidden_state.shape)

## Test model for embedding

- select a list of models
- extract embedding given one example
- check how different the embeddings are

In [None]:
# Text of the fourth sampled page (index 3); used as the common input when
# comparing embeddings across models below
example_content = dataset[3]['content']

In [None]:
# checkpoint ids of all candidate models
models = [finbert_esg, sentence_roberta, fin_roberta, bart]

# short display names, in the same order as `models`
# ('bart' was missing, so zip() silently dropped the fourth model)
model_names = ['finbert_esg', 'sentence_roberta', 'fin_roberta', 'bart']
assert len(model_names) == len(models), "every model needs exactly one name"

# zip to make a dictionary: display name -> checkpoint id
model_dict = dict(zip(model_names, models))

# loop through the models, print name and value
# (loop variable is not called `model`, so the loaded model object is not overwritten)
for name, checkpoint in model_dict.items():
    print(name, checkpoint)

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(last_hidden, attention_mask):
    '''Masked mean of token embeddings along the sequence axis.

    input: hidden state tensor (batch_size, sequence_length, hidden_size) and
           attention mask (batch_size, sequence_length)
    output: tensor (batch_size, hidden_size) holding, per example, the average
            of the token vectors whose mask entry is non-zero
    Positions with mask 0 (padding) contribute nothing to the average.
    '''
    # broadcast the mask to the embedding shape so it can weight every feature
    weights = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
    masked_total = (last_hidden * weights).sum(dim=1)
    # lower bound keeps the division finite when a row is fully masked
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

In [None]:
%%script echo skip

# Step-by-step version of the extraction workflow, run on `example_content`.
# NOTE(review): `model_name` is only assigned in another skipped cell above
# and in the batch-test section below -- on a fresh kernel it is undefined
# here; confirm before un-skipping this cell.

# load the encoder for the chosen checkpoint
model = AutoModel.from_pretrained(model_name)
# load the matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenize the text, truncated / padded to exactly 512 tokens
encoded_input = tokenizer(example_content, padding="max_length", truncation=True, max_length=512, return_tensors='pt')
# forward pass without gradient tracking
with torch.no_grad():
    model_output = model(**encoded_input, output_hidden_states=True)

# extract embeddings: last hidden state, a tensor of shape (batch_size, sequence_length, hidden_size)
last_hidden = model_output['last_hidden_state']
attention_mask = encoded_input['attention_mask']

# mean-pool over tokens; the attention mask excludes padding from the average
text_embeddings = mean_pooling(last_hidden,attention_mask)

In [None]:
# build function for feature extraction workflow
def test_feature_extraction(model_name, content):
    '''Embed ``content`` with the checkpoint ``model_name``.

    input: model name (checkpoint id) and the text to embed
    output: mean-pooled last hidden state, as returned by ``mean_pooling``

    The model and tokenizer are loaded on every call.
    '''
    encoder = AutoModel.from_pretrained(model_name)
    text_tokenizer = AutoTokenizer.from_pretrained(model_name)

    # truncate / pad to exactly 512 tokens and return PyTorch tensors
    batch = text_tokenizer(content, padding="max_length", truncation=True, max_length=512, return_tensors='pt')

    # forward pass without gradient tracking
    with torch.no_grad():
        result = encoder(**batch, output_hidden_states=True)

    # pool the last hidden state, letting the attention mask discard padding
    return mean_pooling(result['last_hidden_state'], batch['attention_mask'])

In [None]:
%%script echo skip

# make empty dictionary for storing extracted features
extracted_features = {}

# loop through models in model_dict and extract features
for name, model in model_dict.items():
    # extract feature
    feature = test_feature_extraction(model, example_content)
    # add feature to dictionary
    extracted_features[name] = feature
        

In [None]:
%%script echo skip

# keys of the dictionary filled by the extraction loop
print(extracted_features.keys())

# print the tensor shape stored under each key, to compare the embedding
# sizes of the models
for name, feature in extracted_features.items():
    print(name, feature.shape)
    

In [None]:
# # loop through the extracted features and convert each feature to a numpy array
# for name, feature in extracted_features.items():
#     extracted_features[name] = feature.numpy().flatten()
#     print(name, type(feature))
#     print(name, feature.shape)


In [None]:

# # check the dataframe
# print(extracted_features['finbert_esg'].shape)

# # flatten() makes the whole array look like a list


## Batch test

Strategy 1: iterate df
- each row: get content, tokenize, embed
- save in dictionary

Strategy 2: ds and batch tokenizer - better for fine-tuning
- convert all df to ds
- tokenize in batch with ds
- use embedding with encoded_ds
- convert embedding back to df


In [None]:
# build function for feature extraction workflow

# (tokenizer, model) pairs already loaded in this kernel, keyed by checkpoint id
_ENCODER_CACHE = {}

def _load_encoder(model_name):
    '''Return ``(tokenizer, model)`` for ``model_name``, loading each checkpoint only once.'''
    if model_name not in _ENCODER_CACHE:
        _ENCODER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _ENCODER_CACHE[model_name]

def feature_extraction(model_name, content):
    '''Embed ``content`` with the checkpoint ``model_name``.

    input: model name (checkpoint id) and the text to embed
    output: mean-pooled last hidden state, as returned by ``mean_pooling``

    The checkpoint used to be re-loaded on every call, i.e. once per row in
    the batch loop; it is now cached after the first call.
    '''
    tokenizer, model = _load_encoder(model_name)
    #Tokenize text: truncate / pad to exactly 512 tokens
    encoded_input = tokenizer(content, padding="max_length", truncation=True, max_length=512, return_tensors='pt')
    #Compute model output (only the last hidden state is used, so the
    #per-layer hidden states are not requested)
    with torch.no_grad():
        model_output = model(**encoded_input)
    # extract embeddings: last hidden state, a tensor of shape (batch_size, sequence_length, hidden_size)
    last_hidden = model_output['last_hidden_state']
    attention_mask = encoded_input['attention_mask']
    #Perform mean pooling, using attention mask to take into account padding
    return mean_pooling(last_hidden, attention_mask)

In [None]:
# build iterator function to return content and id for every row
def content_iterator(df):
    '''Lazily walk ``df`` row by row.

    input: dataframe with 'content' and 'id' columns
    output: generator of (content, id) tuples, one per row, in frame order
    '''
    yield from ((record['content'], record['id']) for _, record in df.iterrows())
        
# test the iterator function: print the first (content, id) pair only
# (`file_id` instead of `id`, so the builtin id() is not shadowed for the rest of the notebook)
for content, file_id in content_iterator(df_sample):
    print(f'file id: {file_id} \n*** content ***\n {content}')
    break

In [None]:
# make empty dictionary for storing extracted features: file id -> 1-D numpy embedding
extracted_features = {}

# set model name
model_name = sentence_roberta

# use iterator function and extract function to extract feature for each row
# - `file_id` instead of `id`, so the builtin id() is not shadowed
# - tqdm shows progress, since each row needs a full forward pass
for content, file_id in tqdm(content_iterator(df_sample), total=len(df_sample)):
    feature = feature_extraction(model_name, content) # extract feature
    feature = feature.numpy().flatten() # convert feature tensor to numpy array
    extracted_features[file_id] = feature # store feature under the page id


In [None]:

# extracted_features maps each file id to one flattened embedding array;
# its length is therefore the number of embedded pages
print(len(extracted_features))

# Build a dataframe from the dictionary: the constructor puts each file id in
# a column, so transpose to get one row per file id and one column per
# embedding dimension
df_features = pd.DataFrame(extracted_features).T

In [None]:
# Labelled copy of the feature table: df_features itself is left untouched.
# The class labels come from df_sample re-indexed by 'id', so they are
# aligned to the feature rows through the shared file-id index.
class_by_id = df_sample.set_index('id')['class']
df_features_label = df_features.assign(**{'class': class_by_id})


# Results

## Extract feature workflow

`Sentence_transformer`: easier and straightforward
- sentence_transformer is compatible with any model
- low level control: tokenizer, model output
- can extract hidden state from several layers, not just the last layer
    - extract last hidden state easily
    - extract several last hidden states, then average, before pooling to one array

Feature-extraction `pipeline`:
- extracted feature depends on the type of model (classifier), not just the task
    - some models return logits
- not much control over call of tokenizer
- better suited to inference than to feature extraction

## Models

768-d embedding:
- FinBert-ESG
- sentence-roberta

1024-d embedding:
- financial Roberta
- BART

Model outputs differ depending on the model architecture:
- in encoder-decoder: 'encoder_hidden_state'
- vanilla attention: 'hidden_state'

The encoded_input returned by the tokenizer also differs between models:
- some don't have 'token_type_ids'

! Not compatible with our extraction workflow:
- Flan-T5
