In [18]:
from googlesearch import search
import re
import time
import urllib

import requests
from bs4 import BeautifulSoup

import io

from PyPDF2 import utils
import datetime as dt
from time import mktime, strptime

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

from io import StringIO

### Description

This notebook demonstrates a proposed flow for alerting Banktrack to newly published PDF policies.

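The flow is:
1. Scrape the Banktrack bank page for all stored policy links and keep the ones that point to PDFs.
2. Use keywords taken from each scraped PDF's file name to run a Google search restricted to the bank's own domain.
3. Compare the PDFs found by the search with the ones already on Banktrack and list the ones that are new.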

In [19]:
def convert_pdf_to_txt(pdf, pages=None, timeout=30):
    """Download a PDF and return the text of the requested pages.

    Args:
        pdf: URL of the PDF; it is fetched with ``requests.get``.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. When it is empty or ``None`` an empty set
            is passed, which pdfminer treats as "every page".
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server raises a ``requests`` timeout error instead of blocking
            the notebook indefinitely.

    Returns:
        str: the text extracted from the selected pages.
    """
    pagenums = set(pages) if pages else set()

    # Download first: if the request fails nothing needs cleaning up yet.
    r = requests.get(pdf, timeout=timeout)
    f = io.BytesIO(r.content)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(f, pagenums):
                interpreter.process_page(page)
        finally:
            # Close the converter even when a page fails to parse.
            converter.close()
        # Read the buffer only after the converter has been closed.
        return output.getvalue()
    finally:
        output.close()


def scrape_banktrack_links():
    """Collect every link listed in the policy-files section of the bank page.

    Fetches ``BANKTRACK_URL``, locates the ``<div class="policyfiles">``
    element and returns the ``href`` of every tag inside it that has one.
    No filtering by file type happens here.

    Returns:
        list[str]: the ``href`` values in the order ``find_all`` yields them.

    Raises:
        ValueError: if the page contains no ``policyfiles`` div (for example
            because the URL is wrong or the page layout changed).
    """
    response = requests.get(BANKTRACK_URL)
    soup = BeautifulSoup(response.content, "html.parser")

    # The policy documents live inside a single div with class "policyfiles".
    policy_div = soup.find("div", {"class": "policyfiles"})
    if policy_div is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            f'no <div class="policyfiles"> found at {BANKTRACK_URL}')

    # href=True keeps only the tags that actually carry a link.
    return [tag['href'] for tag in policy_div.find_all(href=True)]


def categorise_links(links):
    """Group links by host (Banktrack vs. bank) and by served content type.

    Each link is requested (after a 0.5 s pause, with a 5 s timeout) and
    classified from the ``content-type`` response header. Links whose request
    fails, or whose content type is neither PDF nor HTML, are reported with a
    ``print`` and left out of the result.

    Args:
        links: iterable of URL strings.

    Returns:
        dict: ``{'banktrack': {'pdf': [...], 'html': [...]},
        'bank': {'pdf': [...], 'html': [...]}}`` where ``'banktrack'`` holds
        links whose host is ``www.banktrack.org`` and ``'bank'`` all others.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import urlparse

    dct_link_types = {'banktrack':
                          {'pdf': [],
                           'html': []},
                      'bank':
                          {'pdf': [],
                           'html': []}}

    for link in links:
        # Pause between requests so the remote servers are not hammered.
        time.sleep(.5)
        try:
            r = requests.get(link, timeout=5)
        except requests.exceptions.Timeout:
            # Covers connect *and* read timeouts.
            print("timeout")
            continue
        except requests.exceptions.RequestException as e:
            # Connection errors, malformed/relative URLs, etc.: skip the link
            # rather than aborting the whole run.
            print(f"request failed for {link}: {e}")
            continue

        # The header may be absent; fall back to '' so the checks below work.
        content_type = r.headers.get('content-type') or ''
        domain = urlparse(link).netloc
        owner = 'banktrack' if domain == 'www.banktrack.org' else 'bank'

        if 'application/pdf' in content_type:
            dct_link_types[owner]['pdf'].append(link)
        elif 'text/html' in content_type:
            dct_link_types[owner]['html'].append(link)
        else:
            print('Unknown type: {}'.format(content_type))

    return dct_link_types


def _parse_pdf_creation_date(raw_date):
    """Turn a PDF ``CreationDate`` string into ``'YYYY-MM-DD HH:MM:SS'``."""
    # Keep the 14 date/time digits; the optional "D:" prefix and any trailing
    # timezone part (e.g. "Z", "+01'00'", "-04'00'") are ignored.
    match = re.match(r"(?:D:)?(\d{14})", raw_date)
    if match is None:
        raise ValueError(f"unrecognised PDF creation date: {raw_date!r}")
    # Parse straight into a datetime: no round-trip through the local-time
    # epoch, so the result does not depend on the machine's timezone.
    return str(dt.datetime.strptime(match.group(1), "%Y%m%d%H%M%S"))


def get_pdf_meta(lst_pdf_links):
    """Build a lookup of PDF metadata keyed by a snippet of first-page text.

    For every URL the first page is converted to text and the document's
    ``CreationDate`` info entry is read. The key of each entry is the first
    50 characters of the page text (newlines removed, surrounding spaces
    stripped); PDFs that share the same opening text therefore overwrite
    one another.

    Args:
        lst_pdf_links: iterable of PDF URLs.

    Returns:
        dict: ``{sample_text: {'url': <pdf url>, 'created_at': <str>}}`` with
        ``created_at`` formatted as ``'YYYY-MM-DD HH:MM:SS'``.

    Raises:
        ValueError: if a ``CreationDate`` value has no 14-digit timestamp.
    """
    dct_pdf_meta = {}

    for pdf in lst_pdf_links:
        try:
            txt = convert_pdf_to_txt(pdf, pages=[0])
        except utils.PdfReadError:
            print(f"can't read {pdf}")
            # Skip this PDF: without the text there is no key to store it
            # under, and falling through would reuse the previous PDF's text.
            continue

        # The file is fetched again here to read the document info entry.
        r = requests.get(pdf)
        f = io.BytesIO(r.content)

        parser = PDFParser(f)
        doc = PDFDocument(parser)
        raw_date = doc.info[0]["CreationDate"].decode("utf-8")
        creation_date = _parse_pdf_creation_date(raw_date)

        sample_txt = txt.replace("\n", '').strip(" ")[0:50]

        dct_pdf_meta[sample_txt] = {
            'url': pdf,
            'created_at': creation_date,
        }

    return dct_pdf_meta

def scrape_search_links(bank_track_links):
    """Search the bank's site for PDFs matching each known PDF's file name.

    For every link the last path segment is turned into search keywords
    (digits removed, underscores replaced by spaces) and used in a Google
    query restricted to ``BANK_DOMAIN`` via ``site:``. Only results whose URL
    ends in ``.pdf`` (any letter case) are kept.

    Args:
        bank_track_links: iterable of PDF URLs already known to Banktrack.

    Returns:
        list[str]: PDF URLs found by the searches, in first-seen order and
        without duplicates.
    """
    all_search_pdf_links = []
    # The same document is often returned for several queries; remember what
    # has been collected so each URL is returned (and later fetched) once.
    seen = set()

    for pdf in bank_track_links:

        pdf_name = pdf.split("/")[-1]
        pdf_name = re.sub(r'\d+', '', pdf_name)
        pdf_name = pdf_name.replace("_", " ")

        query = f"site:{BANK_DOMAIN} {pdf_name}"
        print(f"searching for {query}")

        # to search
        search_results = search(query, tld="co.in", num=3, stop=3,
                                pause=2)

        for url in search_results:
            # Case-insensitive so links ending in ".PDF" are not dropped.
            if url.lower().endswith('.pdf') and url not in seen:
                seen.add(url)
                all_search_pdf_links.append(url)

    return all_search_pdf_links


def get_new_pdfs(banktrack_pdf_meta, search_pdf_meta):
    """Return the searched PDFs that Banktrack does not have yet.

    Both arguments are metadata dicts keyed by a text snippet, with each value
    holding at least a ``'created_at'`` entry. A searched PDF counts as new
    when its key is absent from ``banktrack_pdf_meta``. A short summary of the
    counts is printed.

    Returns:
        dict: the new entries of ``search_pdf_meta``, ordered from the latest
        ``'created_at'`` value to the earliest.
    """
    known_keys = set(banktrack_pdf_meta.keys())
    found_keys = set(search_pdf_meta.keys())

    overlap = found_keys.intersection(known_keys)

    print(f"There are {len(banktrack_pdf_meta)} banktrack pdfs")
    print(f"There are {len(search_pdf_meta)} pdfs found from search")
    print(f"The intersection of these is {len(overlap)} pdfs")

    # Keys present in the search results only.
    unseen_keys = found_keys - known_keys

    def created_at(key):
        return search_pdf_meta[key]['created_at']

    # Sort ascending, then flip so the most recent document comes first.
    newest_first = sorted(unseen_keys, key=created_at)
    newest_first.reverse()

    return {key: search_pdf_meta[key] for key in newest_first}

In [42]:
# Configuration read by the functions defined above.
# BANKTRACK_URL: the page scrape_banktrack_links() downloads and parses.
BANKTRACK_URL = "https://www.banktrack.org/bank/barclays#policies"
# BANK_DOMAIN: inserted after "site:" in the queries built by scrape_search_links().
BANK_DOMAIN = 'home.barclays'

In [29]:
# Get every link listed in the "policyfiles" section of the Banktrack page
# configured in BANKTRACK_URL.
bt_links = scrape_banktrack_links()

In [30]:
# Categorise links into 4 categories:
# 1. PDFs stored on Banktrack
# 2. PDFs stored on the bank's website
# 3. HTML links to Banktrack
# 4. HTML links to the bank's website
dct_bt_links = categorise_links(bt_links)

# Collect all PDF links, regardless of which site hosts them.
banktrack_pdf_links = (dct_bt_links['banktrack']['pdf'] + 
                       dct_bt_links['bank']['pdf'])

timeout


In [31]:
# Report how many PDF links were collected from the Banktrack page.
print(f"Scraped links for {len(banktrack_pdf_links)} pdfs")

Scraped links for 30 pdfs


In [1]:
# Use the keywords taken from each scraped link's file name as a search term
# (restricted to BANK_DOMAIN) and collect the PDF links among the results
# of each search.
search_pdf_links = scrape_search_links(banktrack_pdf_links)

In [39]:
# Get PDF metadata (first-page text snippet, URL, creation date) for the PDFs
# from the Banktrack policy page and for the PDFs from the Google site searches.
banktrack_pdf_meta = get_pdf_meta(banktrack_pdf_links)
search_pdf_meta = get_pdf_meta(search_pdf_links)

In [40]:
# Get the PDFs that were found in search but are not on Banktrack,
# ordered from the most recent creation date to the oldest.
dct_new_pdfs = get_new_pdfs(banktrack_pdf_meta, search_pdf_meta)

There are 30 banktrack pdfs
There are 0 pdfs found from search
The intersection of these is 0 pdfs


In [27]:
# Print one line per new PDF: text snippet, creation date and URL.
# NOTE(review): the saved output of this cell lists cibc.com URLs although
# BANK_DOMAIN is 'home.barclays', and the same loop appears again in a later
# cell; this looks like a stale cell from an earlier run; confirm.
for txt, meta in dct_new_pdfs.items():

    print(f"{txt} - Created on: {meta['created_at']}; "
          f"URL - {meta['url']}") 

CIBC Code of ConductNovember 2021CIBC External App - Created on: 2021-10-22 15:26:27; URL - https://www.cibc.com/content/dam/about_cibc/corporate_governance/pdfs/code-of-conduct-en.pdf
RESPONSIBLE INVESTING POLICYCIBC Asset Management  - Created on: 2021-08-09 16:50:02; URL - https://www.cibc.com/content/dam/cam-public-assets/documents/cibc-cam-our-approach-responsible-investment-en.pdf
Sustainability Report 2020 MENUMENU 1.0 Overview1. - Created on: 2021-03-19 10:17:48; URL - https://www.cibc.com/content/dam/about_cibc/corporate_responsibility/pdfs/cibc-esg-2020-en.pdf
Canadian Imperial Bank of Commerce (CIBC) - Climat - Created on: 2020-08-27 14:52:47; URL - https://www.cibc.com/content/dam/cibc-public-assets/about-cibc/corporate-responsibility/environment/documents/cibc-cdp-climate-change-response-2020-en.pdf
CIBC Supplier Code of ConductPurpose Our vision is - Created on: 2020-08-14 10:00:38; URL - https://www.cibc.com/ca/pdf/about/supplier-code-of-conduct-en.pdf
MODERN SLAVERY ACT

In [41]:
# Print one line per new PDF: text snippet, creation date and URL.
# Nothing is printed when dct_new_pdfs is empty.
for txt, meta in dct_new_pdfs.items():

    print(f"{txt} - Created on: {meta['created_at']}; "
          f"URL - {meta['url']}") 