# Setup

## Load and install packages

In [None]:
%%capture

# !pip install PyMuPDF
# !pip install icecream
# !pip install nltk
# !pip install ocrmypdf
# !pip install pytesseract
# !pip install pngquant
# !pip install fastparquet
# !pip install datasets
# !pip install fastparquet
# !pip install evaluate
# !pip install pycaret
# !pip install Umap
# !pip install git+https://github.com/huggingface/transformers.git
# !pip install git+https://github.com/huggingface/accelerate.git
# !pip install -U sentence-transformers


In [None]:
from tqdm.auto import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd 
import fitz 
from PIL import Image
import re
import pngquant
from unidecode import unidecode
import torch
import evaluate
from icecream import ic

%matplotlib inline

## PDF Helper Functions

In [None]:
def get_pdf(path):
    '''Open the PDF located at `path` with PyMuPDF (`fitz`) and return the document.'''
    return fitz.open(path)

def render(page):
    '''Render a page to a PIL image at 72dpi without additional rotation.

    Pages of documents opened by `get_pdf` are PyMuPDF pages, which expose
    `get_pixmap` rather than `render`; those are rasterised through a pixmap.
    Any other page object is handled as before through `render(...).to_pil()`
    (presumably the pypdfium2 API - confirm if still needed).
    '''
    if hasattr(page, 'get_pixmap'):
        # identity matrix => zoom factor 1 (72dpi); alpha=False keeps the samples RGB
        pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    bitmap = page.render(
        scale = 1,    # 72dpi resolution
        rotation = 0, # no additional rotation
    )
    return bitmap.to_pil()

# cleanup content using regex:
def clean_characters(text):
    '''Transliterate `text` to ASCII, blank out disallowed characters and drop 1-character words.'''
    ascii_text = unidecode(text)  # convert to ascii
    # strategy 1: keep numbers
    filtered = re.sub(r"[^a-zA-Z0-9.£$€']+", ' ', ascii_text)
    # strategy 2 (disabled): remove numbers and %
    # filtered = re.sub(r"[^a-zA-Z:.£$€'\n]+", ' ', ascii_text)
    kept_words = (word for word in filtered.split() if len(word) > 1)
    return ' '.join(kept_words)

def extract_content_from_id(file_id: str) -> str:
    '''Return the cleaned text of the page designated by `file_id` (no OCR).

    `file_id` has the form '<report file name>.<1-based page number>',
    e.g. 'report_1611.pdf.16'. The report is looked up inside `report_path`.
    '''
    # split on the LAST dot only, so report names containing extra dots still work
    filename, _, page_label = file_id.rpartition('.')
    page_num = int(page_label) - 1  # PyMuPDF pages are 0-based

    filepath = os.path.join(report_path, filename)  # path to the report
    doc = get_pdf(filepath)  # load the pdf
    try:
        page = doc.load_page(page_num)  # select the page from the pdf
        # extract the text from the page
        content = page.get_text("text", sort=True, flags=fitz.TEXT_INHIBIT_SPACES)
    finally:
        # one document is opened per call: release it, this runs once per labelled page
        doc.close()

    # clean_characters already converts to ascii before replacing unrecognized characters
    # should run OCR when needed, but too slow with this hardware, only OCR test-set
    return clean_characters(content)

def extract_content_OCR_test_set(file_id: str) -> str:
    '''Extract content from PDF using OCR when needed: slower than extract_content_from_id.

    `file_id` has the form '<report file name>.<1-based page number>'.
    The OCR fallback is currently disabled (kept below as comments), so the
    result is the cleaned embedded text of the page.
    '''
    # split on the LAST dot only, so report names containing extra dots still work
    filename, _, page_label = file_id.rpartition('.')
    page_num = int(page_label) - 1  # PyMuPDF pages are 0-based

    filepath = os.path.join(report_path, filename)  # path to the report
    doc = get_pdf(filepath)  # load the pdf
    try:
        page = doc.load_page(page_num)  # select the page from the pdf
        # extract the text from the page
        content = page.get_text("text", sort=True, flags=fitz.TEXT_INHIBIT_SPACES)
        # clean_characters converts to ascii, then replaces unrecognized characters
        content = clean_characters(content)

        # # split the string to test parsing: OCR when needed
        # # (must stay inside this block: `page` needs the document to be open)
        # words = content.split()
        # avg_char = sum(len(word) for word in words)/len(words)

        # # OCR if needed: check test below section OCR: too slow
        # if avg_char > 18:
        #     content = ocr_the_page(page)
    finally:
        doc.close()  # release the document opened for this single page

    return content

def render_content_from_id(file_id: str, dpi: int = 150) -> "Image.Image":
    '''Render the page designated by `file_id` as an RGB PIL image.

    `file_id` has the form '<report file name>.<1-based page number>';
    `dpi` is the rendering resolution (150 by default, as before).
    '''
    # split on the LAST dot only, so report names containing extra dots still work
    filename, _, page_label = file_id.rpartition('.')  # reconstruct the report name
    page_num = int(page_label) - 1  # PyMuPDF pages are 0-based

    filepath = os.path.join(report_path, filename)  # path to the report
    doc = get_pdf(filepath)  # load the report pdf
    try:
        mat = fitz.Matrix(dpi / 72, dpi / 72)  # zoom factor relative to the 72dpi base
        pix = doc[page_num].get_pixmap(matrix=mat)
        # frombytes copies the samples, so the image stays valid after the document is closed
        img_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        doc.close()
    return img_page

In [None]:
# OCR Version 1: full-page OCR with ocrmypdf; PyMuPDF is used to cut out the page and to read the OCR result

import ocrmypdf
import sys
import io

# keep ocrmypdf quiet while pages are processed
# (verbosity -1 is presumably its "quiet" level - confirm against the ocrmypdf docs)
ocrmypdf.configure_logging(verbosity = -1)

def ocr_the_page(page):
    """Extract the text from passed-in PDF page by running OCR on it.

    The page is copied into a temporary one-page PDF held in memory, sent
    through `ocrmypdf.ocr` with OCR forced, and the text of the resulting
    page is returned.
    """
    src = page.parent  # the page's document

    doc = fitz.open()  # make temporary 1-pager
    try:
        doc.insert_pdf(src, from_page=page.number, to_page=page.number)
        pdfbytes = doc.tobytes()
    finally:
        doc.close()  # the bytes are all that is needed from here on

    inbytes = io.BytesIO(pdfbytes)  # transform to BytesIO object
    outbytes = io.BytesIO()  # let ocrmypdf store its result pdf here

    # run ocr and get result pdf as bytes
    ocrmypdf.ocr(
        inbytes,  # input 1-pager
        outbytes,  # output 1-pager
        language="eng",  # modify as required e.g. ("eng", "ger")
        output_type="pdf",  # only need simple PDF format
        # add more parameters, e.g. to enforce OCR-ing, etc., e.g.
        force_ocr=True,
        # redo_ocr=True
        progress_bar=False,
        optimize=1,
    )

    # read result pdf as fitz document
    ocr_pdf = fitz.open("pdf", outbytes.getvalue())  # read output as fitz PDF
    try:
        text = ocr_pdf[0].get_text()  # and extract text from the page
    finally:
        ocr_pdf.close()
    return text  # return it

# build function to extract page from file_id then call function `ocr_the_page`
def extract_content_from_id_ocr(file_id: str) -> str:
    '''Return the OCR text of the page designated by `file_id`.

    `file_id` has the form '<report file name>.<1-based page number>'.
    The returned text is the raw OCR output (no character cleanup is applied).
    '''
    # split on the LAST dot only, so report names containing extra dots still work
    filename, _, page_label = file_id.rpartition('.')
    page_num = int(page_label) - 1  # PyMuPDF pages are 0-based

    filepath = os.path.join(report_path, filename)  # path to the report
    doc = get_pdf(filepath)  # load the pdf
    try:
        page = doc[page_num]  # select the page from the pdf
        # ocr_the_page reads `page.parent`, so the document must still be open here
        content = ocr_the_page(page)
    finally:
        doc.close()

    return content

In [None]:
# OCR Version 2: using PyMuPDF with Partial OCR
# does not work here: it requires the TESSDATA_PREFIX environment variable to be set

# def extract_content_from_id_ocr_partial(file_id: str) -> str :    
#     # extract filename and page
#     items = file_id.split('.')
#     filename = '.'.join(items[:2])
#     page_num = int(items[-1])-1
    
#     # load pdf, select page
#     filepath = os.path.join(report_path, filename) #path to the report
#     doc = get_pdf(filepath) # load the pdf
#     page = doc[page_num] # select the page from the pdf
    
#     # extract content
#     partial_tp = page.get_textpage_ocr(flags=0, full=False)
#     content = page.get_text(textpage=partial_tp, sort=True)

#     return content

## Set path for folder

- Run exactly one of the two following cells: the Kaggle cell when running directly on Kaggle, or the local cell when the data has been downloaded.

In [None]:
%%script echo skip

# # Kaggle online path

# basepath = "/kaggle/input/oxml2023mlcases-esg-classifier"

# # build get path function, input folder name (as a list) and file name (optional), return the path
# def get_path(folder_name: list, file_name=None):
#     path = Path(basepath)
#     for folder in folder_name:
#         path = path / folder
#     if file_name:
#         path = path / file_name
#     return path


# # report path
# report_path = get_path(["data", "reports"])

# # label path
# label_path = get_path(["data"], "labels.csv")

# # output path
# output_path = '/kaggle/working/'

# submit_path = get_path([], "sample_submission.csv")


In [None]:
# local data path 

# project root used by get_path: the parent folder of the current working directory
# (presumably the notebook is run from a sub-folder of the project - confirm)
basepath = Path.cwd().parents[0]

# path builder: takes the folder names (as a list) and an optional file name, returns the path
def get_path(folder_name: list, file_name=None):
    '''Return `basepath` extended by every entry of `folder_name`, then by `file_name` if one is given.'''
    target = Path(basepath).joinpath(*folder_name)
    return target / file_name if file_name else target

# folder holding the report PDFs: <basepath>/data/reports
report_path = get_path(["data", "reports"])

# csv file with the page ids and their classes: <basepath>/data/labels.csv
label_path = get_path(["data"], "labels.csv")

# folder where the cleaned data is written: <basepath>/output
output_path = get_path([], "output")

# sample submission file: <basepath>/sample_submission.csv
submit_path = get_path([], "sample_submission.csv")


In [None]:
# show the configured locations, one per line
for caption, location in (
    ("Report path: ", report_path),
    ("Label path: ", label_path),
    ("Output path: ", output_path),
    ("Submit path: ", submit_path),
):
    print(caption, location)

## Test OCR

In [None]:
%%script echo skip

# page used for the OCR comparison: '<report file name>.<1-based page number>'
file_id = 'report_1611.pdf.16'

# render the page as an image (displayed because it is the last expression of the cell)
render_content_from_id(file_id)


In [None]:
%%script echo skip

# extract content from the page, once from the embedded text and once through OCR
# (`file_id` comes from the previous cell, which must have been run)
test_no_ocr = extract_content_from_id(file_id)
test_with_ocr = extract_content_from_id_ocr(file_id)
# test_partial_ocr = extract_content_from_id_ocr_partial(file_id): not working, can't set ENV var

# print both versions one after the other for a visual comparison
print(f'*** start testing:[{file_id}] *** \n')
print('\n *** no OCR *** \n')
print(test_no_ocr)
print('\n *** with OCR *** \n')
print(test_with_ocr)
# print('\n *** partial OCR *** \n')
# print(test_partial_ocr)
# print(f'*** end test *** \n\n')

# Parse PDF and process text

In [None]:
# %%script echo skipping

# read label file to get id and class
df = pd.read_csv(label_path)

# check duplicates id
print("Number of duplicated id: ", df.duplicated(subset=['id']).sum())

# slice duplicated id, all entries with duplicated id
# duplicates have conflicting labels, due to nature of the pages
df_duplicate = df[df.duplicated(subset=['id'], keep=False)].sort_values(by=['id'])
# a bare expression in the middle of a cell is not displayed: print it explicitly
print(df_duplicate)

# drop the duplicates (keep the first entry)
df = df.drop_duplicates(subset=['id'], keep='first').sort_values(by=['id'])


In [None]:
# %%script echo skipping

# wrapper: extract the text of every page listed in df and store it in a new column
def get_content(df):
    '''Add a `content` column holding the extracted text for each `id` of `df`; returns `df`.'''
    page_ids = tqdm(df['id'], total=len(df))
    df['content'] = [extract_content_from_id(page_id) for page_id in page_ids]
    return df

# parse every labelled page: opens one PDF per row, so this is the slow step of the cell
df = get_content(df)

In [None]:
%%script echo skipping

# # make wrapper function to extract content and save to df
# def get_content_test_set(df):
    
#     contents = []
    
#     for idx, row in tqdm(df.iterrows(), total=len(df)):
#         # get file id
#         file_id = row['id']
#         # extract content from 
#         content = extract_content_from_id_test_set(file_id)
#         contents.append(content)
    
#     df['content'] = contents
    
#     return df

# # apply function to parse PDF
# df = get_content(df)

In [None]:
# %%script echo skipping

# create a new column with the report name (part of the id before the first dot)
df['report_name'] = df['id'].apply(lambda x: x.split('.')[0])

# create a new column with the page number (part of the id after the last dot, still a string)
df['page_num'] = df['id'].apply(lambda x: x.split('.')[-1])

# reorder columns; copy so that the column added below is written to an independent
# frame instead of a slice (avoids pandas' SettingWithCopyWarning)
df = df[['id','report_name', 'page_num', 'class', 'content']].copy()

# dictionary mapping class label to numeric
map_label = {'other': 0, 'environmental': 1, 'social': 2, 'governance': 3}

# use the dictionary to replace the class label
df['target'] = df['class'].map(map_label)


In [None]:
# %%script echo skipping

# function to extract starting part and ending part of text content
def extract_block(text, num_token, chars_per_token=5):
    '''Return the first and the last `num_token` tokens of `text`, approximated in characters.

    Parameters
    ----------
    text : str
        Text to cut the blocks from.
    num_token : int
        Number of tokens wanted in each block.
    chars_per_token : int, optional
        Assumed average number of characters per token (5 by default).

    Returns
    -------
    tuple of (str, str)
        `(start_block, end_block)`. When `text` is shorter than two blocks the
        two blocks overlap; when the block length is 0 both blocks are empty.

    Raises
    ------
    ValueError
        If `num_token` or `chars_per_token` is negative.
    '''
    if num_token < 0 or chars_per_token < 0:
        raise ValueError("num_token and chars_per_token must be non-negative")

    # calculate the number of characters
    block_length = chars_per_token * num_token

    # text[-0:] would be the WHOLE text, so the zero-length case is handled explicitly
    if block_length == 0:
        return '', ''

    # extract the start, and end blocks
    start_block = text[:block_length]
    end_block = text[-block_length:]

    return start_block, end_block


# build wrapper function on a dataframe, using the extract_block function above
def make_short_content(df, num_token):
    '''Add a `short_content` column: start and end blocks of `content` joined with a dot.

    Each block holds about `num_token` tokens (see `extract_block`). The column
    is built in one pass and assigned positionally, so the result does not
    depend on the index labels of `df` being unique. Returns `df`.
    '''
    short_contents = []

    # iterate over each page content, extract blocks and join them
    for content in tqdm(df['content'], total=len(df)):
        start_block, end_block = extract_block(content, num_token)
        short_contents.append('.'.join([start_block, end_block]))

    # assign the whole column at once instead of one `.loc` write per row
    df['short_content'] = short_contents

    return df

# shorten the content: keep about 150 tokens (750 characters at 5 characters per token)
# from the start and from the end of every page
df = make_short_content(df,150)

In [None]:
# %%script echo skipping

# save local data file
path = Path(output_path)
# the output folder is not created anywhere else: make sure it exists before writing
path.mkdir(parents=True, exist_ok=True)
file_data = path / 'data_clean.parquet'
print(file_data)

# save df to parquet
df.to_parquet(file_data, engine='fastparquet')


# Fine-tuning: start here

In [None]:
# Load clean data from local storage: lets the fine-tuning part start here
# without parsing the PDFs again

# local data file (same location the parsing section writes to)
path = Path(output_path)
file_data = path / 'data_clean.parquet'
print(file_data)

# reload df from parquet
df = pd.read_parquet(file_data, engine='fastparquet')

## Split report

- Objective: split by report so that the number of reports, the number of pages, and the class mix are roughly the same in both sets.

In [None]:
# convert column page_num to int
df['page_num'] = df['page_num'].astype(int)

# one row per report: number of distinct labelled pages of that report
num_pages_df = df.groupby('report_name')[['page_num']].nunique()

# make a new column: value H for reports with more pages than the median report, L otherwise
median = num_pages_df['page_num'].median()
num_pages_df['pages'] = np.where(num_pages_df['page_num'] > median, 'H', 'L')

In [None]:
# one row per report: average target value over the pages of that report
target_df = df.groupby('report_name')[['target']].mean()

# discretize the avg target value to integer: try to distribute reports evenly among classes
# not really an issue since labels are distributed evenly among non-zero classes
# resulting intervals: [0, 0.99] -> 0, (0.99, 1.5] -> 1, (1.5, 2] -> 2, (2, 3] -> 3
bins = [0, 0.99, 1.5, 2,3] # boundaries for the bins
labels = [0, 1, 2, 3]   # labels for the bins
target_df['avg_target'] = pd.cut(x = target_df['target'], bins = bins, labels = labels, include_lowest = True)


In [None]:
# merge the two dataframes using report_name as key, create new index
df_report = pd.merge(num_pages_df, target_df, on='report_name').reset_index()

# stratification label: 0 when avg_target is 0, otherwise the avg_target and
# pages columns joined as one string (e.g. '1H')
df_report['split_label'] = np.where(df_report['avg_target'] == 0, 0, df_report['avg_target'].astype(str) + df_report['pages'])

# convert split_label to string (np.where above mixes the integer 0 with strings)
df_report['split_label'] = df_report['split_label'].astype(str)

# keep only the report_name, page_num (page count per report) and split_label columns
df_report = df_report[['report_name','page_num','split_label']]

In [None]:
# stratified train/test split from sklearn
from sklearn.model_selection import train_test_split

# split the reports (not the pages), stratified on the split_label column
# set test_size minimal when done fine-tuning: use max amount of data possible
train_report, test_report = train_test_split(
    df_report,
    test_size=0.15,
    stratify=df_report['split_label'],
    random_state=42,
)

# number of distinct reports in each set
print("Total unique report name in train set: ", train_report['report_name'].nunique())
print("Total unique report name in test set: ", test_report['report_name'].nunique())

# number of pages in each set (page_num holds the page count of each report here)
print("Total pages in train set: ", train_report['page_num'].sum())
print("Total pages in test set: ", test_report['page_num'].sum())

## Prepare data

In [None]:
# pages belonging to the train reports: total pages will match
train_df = df.loc[df['report_name'].isin(train_report['report_name'])]

# pages belonging to the test reports
test_df = df.loc[df['report_name'].isin(test_report['report_name'])]

In [None]:
import datasets
from datasets import Dataset, DatasetDict

# keep id, short content and target; expose the last two as `text` and `label`
train_df = train_df[['id','short_content', 'target']].rename(
    columns={'short_content': 'text', 'target': 'label'}
)

# convert to huggingface dataset
train = Dataset.from_pandas(train_df, preserve_index=False)

# same selection and renaming for the held-out reports
test_df = test_df[['id','short_content', 'target']].rename(
    columns={'short_content': 'text', 'target': 'label'}
)

# convert to huggingface dataset
validation = Dataset.from_pandas(test_df, preserve_index=False)


print(train)
print(validation)

## Set up pipeline

In [None]:
# list of pre-trained models (Hugging Face hub identifiers)
# only finbert_esg is used in the visible cells; the other two are kept as alternatives
finbert_esg = 'yiyanghkust/finbert-esg'
sentence_roberta = 'sentence-transformers/all-distilroberta-v1'
fin_roberta = 'soleimanian/financial-roberta-large-sentiment'


In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification 
from transformers import TrainingArguments, Trainer , DataCollatorWithPadding
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import evaluate

# from transformers import logging
# logging.set_verbosity_error()

In [None]:
# presumably turns off Weights & Biases logging in the Trainer - confirm against the transformers docs
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# set up tokenizer: the one shipped with the finbert_esg checkpoint
tokenizer = AutoTokenizer.from_pretrained(finbert_esg)

# tokenize function
def tokenize_function(example):
    '''Tokenize the `text` field of `example`, padded and truncated to 256 tokens.'''
    encoding_options = {"padding": "max_length", "max_length": 256, "truncation": True}
    return tokenizer(example['text'], **encoding_options)

# map tokenize function to train and test dataset (batched: several examples per call)
tokenized_train_dataset = train.map(tokenize_function, batched=True)
tokenized_test_dataset = validation.map(tokenize_function, batched=True)

# define data collator (stick batches together), use later in trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(tokenized_train_dataset)
print(tokenized_test_dataset)

## Fine-tuning

In [None]:
# Fine-tuning if needed

# set up model: 4 output classes, matching the 0-3 `target` encoding
model = AutoModelForSequenceClassification.from_pretrained(finbert_esg, num_labels=4)


# set up training arguments : output directory, evaluation strategy
# # for Kaggle
# training_args = TrainingArguments(output_dir=output_path, 
#                                   evaluation_strategy="epoch",
#                                   save_strategy="epoch",
#                                   save_total_limit = 1,
#                                   overwrite_output_dir = True,
#                                   logging_steps=200,
#                                   learning_rate= 0.0001,
#                                   per_device_train_batch_size=16,
#                                   per_device_eval_batch_size=16,
#                                   num_train_epochs=3,
#                                   load_best_model_at_end=True,
#                                   weight_decay=0.01,
#                                   metric_for_best_model='eval_loss'
#                                  ) 

# local run (originally written for an M1 Mac, reported as not working):
# - output_dir is passed as a plain string, since output_path may be a pathlib.Path
# - the MPS backend is only requested when torch reports it as available, so the
#   same cell also runs on machines without Apple silicon
training_args = TrainingArguments(
    output_dir=str(output_path),
    evaluation_strategy="epoch",
    use_mps_device=torch.backends.mps.is_available(),
)

_F1_METRIC = None  # `evaluate` F1 metric, loaded on first use and then reused


def compute_metrics(eval_preds):
    '''Compute the micro-averaged F1 score of one evaluation pass.

    `eval_preds` unpacks into `(logits, labels)`; the predicted class is the
    argmax of the logits over the last axis. Returns the dictionary produced
    by the `evaluate` F1 metric.
    '''
    global _F1_METRIC
    if _F1_METRIC is None:
        # load once instead of on every evaluation
        _F1_METRIC = evaluate.load("f1")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _F1_METRIC.compute(predictions=predictions, references=labels, average="micro")

# Trainer object has automated training loop: can write our custom training loop in PyTorch native
# it trains on the tokenized train set and evaluates on the tokenized held-out set,
# reporting whatever compute_metrics returns
trainer = Trainer( 
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
# # %%script echo skip
# run the training loop configured in `trainer`
trainer.train()  # starts fine-tuning

In [None]:
# save the fine-tuned model next to the other outputs: with the Kaggle configuration
# (output_path = '/kaggle/working/') this is '/kaggle/working/model_checkpoint' as before,
# and with the local configuration it no longer points to a non-existent Kaggle folder
trainer.save_model(str(Path(output_path) / 'model_checkpoint'))

# Inference on test-set

## Get test data and process content

In [None]:
# read sample submission file from the location set in the path-configuration cell
# (same file as the former hard-coded Kaggle path when the Kaggle configuration is used)
df_submit = pd.read_csv(submit_path)

In [None]:
# get content from test set
# the `get_content_test_set` wrapper only exists as commented-out code, so the
# extraction is done here directly with the test-set extractor
df_submit['content'] = [
    extract_content_OCR_test_set(file_id)
    for file_id in tqdm(df_submit['id'], total=len(df_submit))
]

In [None]:
# get shortened content from full text content
# NOTE(review): 100 tokens per block here, while the training data was built with 150 -
# confirm that the difference is intended
df_submit = make_short_content(df_submit, 100)

In [None]:
# pick one random row of the test set
sample_row = df_submit.sample(n=1).iloc[0]

# show the full content, a visual gap, then the shortened content
print(sample_row['content'])
print('\n\n\n')
print(sample_row['short_content'])

In [None]:
# keep short content; copy so that columns added later are written to an
# independent frame instead of a slice (avoids pandas' SettingWithCopyWarning)
df_submit = df_submit[['id','short_content']].copy()

## Make prediction with fine-tuned model

In [None]:
# %%script echo skip

# # # get prediction : choose this one or below

# df_submit['class'] = ''

# # iteratively get prediction for each row in df_submit
# for index, row in df_submit.iterrows():
        
#         # extract content from row
#         text = row['short_content']
#         # tokenize content
#         encoded_input = tokenizer(text, padding="max_length", max_length=512, truncation=True, return_tensors='pt')
#         # get prediction
#         output = model(**encoded_input)
        
#         # print(output)
#         pred = int(torch.argmax(output.logits, axis=1).detach())
#         # print(f'Predicted label is: {pred}')
#         df_submit.iloc[index]['class'] = pred

    
# # dictionary mapping class label to numeric
# map_label = {'other': 0, 'environmental': 1, 'social': 2, 'governance': 3}

# # reverse the dictionary
# reverse_map_label = {v: k for k, v in map_label.items()}

# # make a copy of submission df
# submission = df_submit.copy()

# # drop column content
# submission.drop(columns=['short_content'], inplace=True)

# # map numeric label to class label
# submission['class'] = submission['class'].map(reverse_map_label)

# submission

In [None]:
# %%script echo skip 

# works faster, but harder to low-level control

# set up pipeline: first CUDA device when there is one, CPU otherwise
clf_device = 0 if torch.cuda.is_available() else -1
clf = pipeline("text-classification",model=model,tokenizer=tokenizer,device=clf_device)
tokenizer_kwargs = {'padding': 'max_length','truncation':True,'max_length':512}

# get prediction for every row of df_submit
# the labels are collected in a list and assigned as a whole column: writing through
# `df_submit.iloc[index]['class'] = ...` only modifies a temporary copy of the row,
# which left the column empty
df_submit['class'] = [
    clf(text, **tokenizer_kwargs)[0]['label']
    for text in df_submit['short_content']
]

# make a copy of submission df
submission = df_submit[['id','class']].copy()

# dictionary mapping the submission class names to the label names returned by the pipeline
# (presumably the label names stored in the model configuration - confirm)
map_label = {'other': 'None', 'environmental': 'Environmental', 'social': 'Social', 'governance': 'Governance'}

# reverse the dictionary: pipeline label name -> submission class name
reverse_map_label = {v: k for k, v in map_label.items()}

# map the pipeline label names to the submission class names
submission['class'] = submission['class'].map(reverse_map_label)

# display
submission

## Submit file

In [None]:
# convert to csv inside the configured output folder: '/kaggle/working/submission.csv'
# with the Kaggle configuration, <basepath>/output/submission.csv with the local one
submission_dir = Path(output_path)
submission_dir.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_dir / 'submission.csv', index=False)

# also keep a copy in the current working directory
submission.to_csv('submission.csv', index=False)