# Setup

## Load and install packages

In [1]:

# !pip install -U pypdfium2
# !pip install PyMuPDF
# !pip install -U nltk
# !pip install ocrmypdf -U
# !brew install tesseract
# !pip install pytesseract
# !pip install -U pngquant
# !pip install keras
# !pip install tensorflow

In [2]:
from tqdm.auto import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd 
import pypdfium2 as pdfium
from icecream import ic
import fitz 
from PIL import Image
import re


%matplotlib inline

## PDF Helper Functions

In [3]:
def get_pdf(path):
    '''Open the PDF at ``path`` with PyMuPDF (``fitz``) and return the opened document.

    The document is returned still open; this function never closes it.
    '''
    return fitz.open(path)

def render(page):
    '''Render ``page`` at scale 1 with no extra rotation and return it as a PIL image.

    ``page`` must provide ``render(scale=..., rotation=...)`` whose result
    exposes ``to_pil()``.
    '''
    render_options = {
        "scale": 1,     # 72dpi resolution
        "rotation": 0,  # no additional rotation
    }
    return page.render(**render_options).to_pil()

# text processing: 
# replace unrecognized characters with space using regex
# breaking hyperlinks
def replace_unrecognized_characters(text):
    """Collapse every run of characters outside the allowed set into one space.

    Allowed characters: ASCII letters, digits, newline, and the symbols
    colon, period, question mark, exclamation mark, dollar, euro, slash
    and apostrophe.
    """
    disallowed_run = re.compile(r"[^a-zA-Z0-9\n:.?!$€/']+")
    return disallowed_run.sub(' ', text)

def extract_content_from_id(file_id: str) -> str:
    '''Return the cleaned text of the report page identified by ``file_id``.

    ``file_id`` has the form ``<report file name>.<page number>``, for example
    ``report_1826.pdf.55``. The page number is treated as 1-based.

    Args:
        file_id: report file name and page number joined by a dot.

    Returns:
        The page text after ``replace_unrecognized_characters`` was applied.

    Raises:
        ValueError: if ``file_id`` contains no dot or the part after the last
            dot is not an integer.
    '''
    # split on the LAST dot only, so report names with extra dots stay intact
    filename, page_label = file_id.rsplit('.', 1)
    page_num = int(page_label) - 1  # id page number -> 0-based page index

    # load pdf, select page, and extract its content
    filepath = os.path.join(report_path, filename)  # path to the report
    doc = get_pdf(filepath)
    try:
        content = doc.load_page(page_num).get_text()
    finally:
        # always release the document, even when the page lookup fails
        doc.close()
    return replace_unrecognized_characters(content)

def render_content_from_id(file_id: str, dpi: int = 100) -> "Image.Image":
    '''Render the report page identified by ``file_id`` as a PIL image.

    ``file_id`` has the form ``<report file name>.<page number>``, for example
    ``report_1826.pdf.55``. The page number is treated as 1-based.

    Args:
        file_id: report file name and page number joined by a dot.
        dpi: rendering resolution; the zoom factor is ``dpi / 72``.

    Returns:
        An RGB ``PIL.Image.Image`` built from the rendered pixmap.

    Raises:
        ValueError: if ``file_id`` contains no dot or the part after the last
            dot is not an integer.
    '''
    # split on the LAST dot only, so report names with extra dots stay intact
    filename, page_label = file_id.rsplit('.', 1)
    page_num = int(page_label) - 1  # id page number -> 0-based page index

    filepath = os.path.join(report_path, filename)  # path to the report
    doc = get_pdf(filepath)
    try:
        zoom = dpi / 72  # zoom factor relative to the 72 dpi base resolution
        pix = doc[page_num].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        # build the image before the document is closed
        img_page = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # always release the document, even when rendering fails
        doc.close()
    return img_page

## Set path for folder

- choose one of the following blocks: online if run directly on Kaggle, local if data downloaded

In [5]:
# local data path using pathlib

# main folder: the default below is machine-specific; set the OXML_BASEPATH
# environment variable to use another location without editing this cell
basepath = os.environ.get(
    "OXML_BASEPATH",
    "/Users/macmini/Desktop/GoogleDrive/OxML_2023/code/Kaggle",
)

# build get path function, input folder name (as a list) and file name (optional), return the path
def get_path(folder_name: list, file_name=None):
    '''Build a path located under ``basepath``.

    Args:
        folder_name: folder names, outermost first, appended to ``basepath``.
        file_name: optional file name appended after the folders.

    Returns:
        A ``pathlib.Path``; it ends at the last folder when ``file_name`` is falsy.
    '''
    folder = Path(basepath).joinpath(*folder_name)
    return folder / file_name if file_name else folder

# report path: <basepath>/data/reports, the folder that
# extract_content_from_id / render_content_from_id join report file names onto
report_path = get_path(["data", "reports"])

# label path: <basepath>/data/labels.csv
label_path = get_path(["data"], "labels.csv")


In [6]:

# show the resolved locations so a wrong base folder is spotted immediately
print(f"Report path:  {report_path}")
print(f"Label path:  {label_path}")

Report path:  /Users/macmini/Desktop/GoogleDrive/OxML_2023/code/Kaggle/data/reports
Label path:  /Users/macmini/Desktop/GoogleDrive/OxML_2023/code/Kaggle/data/labels.csv


In [7]:
# sample csv files, in the order: few tokens, long tokens, short tokens
files = ['sample_few_token.csv', 'sample_long_token.csv', 'sample_short_token.csv']

# read each sample csv from the "process" folder into its own dataframe
sample_few_token, sample_long_token, sample_short_token = (
    pd.read_csv(get_path(["process"], csv_name), index_col=None)
    for csv_name in files
)

## Insight for cleaning and tokenizer

- good text processing solves long tokens; OCR parsing is not needed
  - hyperlinks should work
  - footnotes should work

<!-- -->

- OCR may be useful for tables and charts
  - quite slow and costly
  - do not deploy unless really needed

# Improve tokenizers

## Regex cleaning
- clean text content before tokenizer

In [8]:
# unrecognized characters: replace with space using regex
# breaking hyperlinks: exclude / from text

def replace_unrecognized_characters(text):
    """Collapse every run of characters outside the allowed set into one space.

    Allowed characters: ASCII letters, digits, newline, and the punctuation
    marks semicolon, colon, comma, period, question mark and exclamation mark.
    The slash is not allowed, so hyperlinks are broken apart.

    NOTE: this redefines the helper of the same name from the PDF helper
    section; functions that call it by name use this version once this cell
    has run.
    """
    disallowed_run = re.compile(r"[^a-zA-Z0-9\n;:,.?!]+")
    return disallowed_run.sub(' ', text)


## Word Tokenizer

In [9]:
# build text processing functions
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import re

In [10]:
# get the first id from each sample df and collect them into a list
ids = [
    sample_few_token['id'].iloc[0],
    sample_long_token['id'].iloc[0],
    sample_short_token['id'].iloc[0],
]

# test sentence tokenizer on each id
# (the loop variable is `file_id`, not `id`, so the builtin id() is not
# overwritten in the notebook namespace after this cell runs)
for file_id in ids:
    print("id: ", file_id)
    content = extract_content_from_id(file_id)
    # wrap the sentence list in a Series so each sentence prints on its own numbered row
    sentences = pd.Series(sent_tokenize(content))
    print("sentences *** \n ", sentences)
    print('\n *** \n')

# sentence tokenizer: can't deal with multiple linebreaks, tables


id:  report_1826.pdf.55
sentences *** 
  0    Further articles on the subject at:\naudi.com ...
dtype: object

 *** 

id:  report_1829.pdf.157
sentences *** 
  0     \nD Consolidated Financial Statements \n \n \...
1    r.l., Luxembourg \n100.0 \nAllianz Pension Con...
2    Invest \nKG, Berlin \n50.0 3 \nRoland Holding ...
3    r.l., \nCasablanca \n100.0 \nSeine GmbH, Munic...
dtype: object

 *** 

id:  report_1825.pdf.142
sentences *** 
  0    CORPORATE GOVERNANCE \n3.2 EXECUTIVE COMPENSAT...
1    b Employees other than corporate of icers at g...
2    In the table above all dates that are indicate...
3    140 \n \n \nI REGISTRATION DOCUMENT ANNUAL REP...
dtype: object

 *** 



In [11]:
# build a function to preprocess text: tokenize, remove stopwords and punctuation, lemmatize
# one lemmatizer instance, created here and used by tokenize_text via the global name `lem`
lem = WordNetLemmatizer()

def tokenize_text(text):
    '''Turn raw text into a list of cleaned, lemmatized word tokens.

    Steps, in order: lowercase and word-tokenize, drop English stopwords and
    punctuation tokens, strip every non-letter character, lemmatize each token
    as a verb, and discard tokens that are a single character long.

    Args:
        text: the text to tokenize.

    Returns:
        list of str: the surviving tokens in their original order.
    '''
    # build the stop-word set once per call instead of once per token
    stop_words = set(stopwords.words('english'))

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Filter stopwords and punctuation. The result must feed the following
    # steps, otherwise the stopwords stay in the output.
    tokens = [
        token for token in tokens
        if token not in stop_words and token not in string.punctuation
    ]

    # Remove special characters (anything that is not an ASCII letter)
    tokens = [re.sub(r"[^a-zA-Z]+", '', token) for token in tokens]

    # Remove empty strings left behind by the stripping step
    tokens = [token for token in tokens if token]

    # Lemmatize words as verbs
    tokens = [lem.lemmatize(token, "v") for token in tokens]

    # Remove tokens that are only one character long: cleaning up
    return [token for token in tokens if len(token) > 1]

In [15]:
# extract the page text for every sample id
docs = [extract_content_from_id(file_id) for file_id in ids]

# tokenize each document and flatten the results into one list of tokens
tokens = [token for doc in docs for token in tokenize_text(doc)]


## Transformer Tokenizers

In [13]:
# from keras.preprocessing.text import Tokenizer

# # create the tokenizer
# tokenizer = Tokenizer()

# # fit the tokenizer on the documents
# tokenizer.fit_on_texts(docs)

# # summarize info on the tokenizer
# print("word counts: ", tokenizer.word_counts)
# print("document count: ", tokenizer.document_count)
# print("word index: ", tokenizer.word_index)
# print("word docs: ", tokenizer.word_docs)

