In [1]:
from bs4 import BeautifulSoup
from pathlib import Path
import docx
import os
import pandas as pd
import re
import requests

In [7]:
# Locate the project folders relative to the notebook's working directory.
# NOTE(review): this assumes the notebook is started from a direct sub-folder
# of the repository root, so the root is the parent of the working directory.
matt_path = Path(os.getcwd())
root_path = matt_path.parents[0]

# Pass each folder as its own component so pathlib inserts the right separator
# on every operating system. A raw 'data\apra_standards\pdf' string only forms
# nested folders on Windows; elsewhere the backslashes stay in a single name.
pdf_path = root_path.joinpath('data', 'apra_standards', 'pdf')
word_path = root_path.joinpath('data', 'apra_standards', 'word')

# Create the download folders up front; exist_ok keeps the cell re-runnable.
pdf_path.mkdir(parents=True, exist_ok=True)
word_path.mkdir(parents=True, exist_ok=True)

print(pdf_path)
print(word_path)

d:\repos\ChatBot-Adv-NLP\data\apra_standards\pdf
d:\repos\ChatBot-Adv-NLP\data\apra_standards\word


In [3]:
def get_link_list(main_page_url, re_pattern=False, timeout=30):
    """Collect the ``href`` value of every anchor on a web page.

    Args:
        main_page_url: URL of the page to download and scan.
        re_pattern: Optional regular expression (string or compiled pattern).
            When given, only hrefs for which ``re.match`` succeeds are kept,
            i.e. the pattern is anchored at the start of the href. The default
            ``False`` (or any other falsy value) keeps every href.
        timeout: Seconds to wait for the server before the request is
            abandoned, so an unresponsive site cannot hang the notebook.

    Returns:
        list: The selected hrefs, in the order ``find_all`` yields them.
        Duplicates are kept.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For other request failures such as
            connection errors or a timeout.
    """
    response = requests.get(main_page_url, timeout=timeout)
    # Fail loudly instead of scraping the links of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    # href=True restricts the search to anchors that actually carry an href.
    hrefs = [link['href'] for link in soup.find_all('a', href=True)]
    if not re_pattern:
        return hrefs

    # Compile once instead of handing the raw pattern to re.match per link.
    matcher = re.compile(re_pattern)
    return [href for href in hrefs if matcher.match(href)]

def return_standard_id(url):
    """Pull the standard identifier out of a ``/Details/<id>`` URL.

    The identifier must have the form ``F`` + four digits + ``L`` + one or
    more digits (for example ``F2016L01437``) and must directly follow
    ``/Details/`` somewhere in the URL.

    Args:
        url: URL string to inspect.

    Returns:
        The identifier as a string, or ``None`` when the URL contains no such
        ``/Details/<id>`` segment.
    """
    # Search once and reuse the match object instead of repeating the search.
    match = re.search(r'/Details/(F\d{4}L\d+)', url)
    return match.group(1) if match else None
    
# Scrape the APRA standards page, keeping only links that point at a
# legislation.gov.au "/Details/F..." page.
link_list = get_link_list(
    r'https://www.apra.gov.au/industries/1/standards',
    r'https://www.legislation.gov.au/Details/F\w+',
)

# The link filter above (F\w+) is looser than the identifier pattern used by
# return_standard_id (F\d{4}L\d+), so a link can pass the filter and still
# produce None. Drop those so None is never formatted into a request URL.
standard_id_list = [
    standard_id
    for standard_id in map(return_standard_id, link_list)
    if standard_id is not None
]
len(standard_id_list)

91

In [4]:
def extract_metadata(standard_id, timeout=30):
    """Fetch title, date and download URLs for one standard.

    The function downloads
    ``https://www.legislation.gov.au/<standard_id>/latest/downloads`` and reads
    the ``dcterms.date`` and ``dcterms.title`` ``<meta>`` tags of that page.

    Args:
        standard_id: Identifier used in the legislation.gov.au URL
            (for example ``F2016L01437``).
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        dict: Keys ``id``, ``title``, ``date``, ``url_pdf`` and ``url_word``.
        ``title`` is ``None`` when the title tag is missing. ``date`` is
        ``None`` when the date tag is missing or its content does not carry a
        ``YYYY-MM-DD`` value after the first ``'; '`` separator; in that case
        ``url_pdf`` and ``url_word`` are ``None`` as well, because both URLs
        embed the date.
    """
    # Construct the URL based on the standard_id
    url = f'https://www.legislation.gov.au/{standard_id}/latest/downloads'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')

    # Extract date from 'dcterms.date': the value is the first 10 characters
    # of the second '; '-separated field of the tag's content.
    date = None
    meta_date = soup.find('meta', attrs={'name': 'dcterms.date'})
    if meta_date is not None:
        fields = meta_date.get('content', '').split('; ')
        if len(fields) > 1:  # guard: content without a separator has no [1]
            candidate = fields[1][:10]
            # Only accept a real 'YYYY-MM-DD' value; anything else would end
            # up verbatim inside the download URLs below.
            if re.fullmatch(r'\d{4}-\d{2}-\d{2}', candidate):
                date = candidate

    # Extract full title from 'dcterms.title'
    meta_title = soup.find('meta', attrs={'name': 'dcterms.title'})
    title = meta_title.get('content', '') if meta_title is not None else None

    if date is None:
        # Without a date the URLs would contain the literal text "None".
        url_pdf = None
        url_word = None
    else:
        url_pdf = fr'https://www.legislation.gov.au/{standard_id}/asmade/{date}/text/original/pdf'
        url_word = fr'https://www.legislation.gov.au/{standard_id}/asmade/{date}/text/original/word'

    return {"id": standard_id, "title": title, "date": date, 'url_pdf': url_pdf, 'url_word': url_word}

In [8]:
# Build one metadata record per scraped standard id, keeping the list order.
# extract_metadata performs an HTTP request for every id, so this cell is slow.
metadata_list = list(map(extract_metadata, standard_id_list))

In [None]:
def _download_file(url, destination, timeout=60):
    """Download ``url`` into the file ``destination``.

    Nothing is written when the URL is missing or the server answers with an
    error status, so an HTML error page is never stored under a .pdf/.docx
    name.

    Args:
        url: Address to download, or ``None``/empty when no URL is known.
        destination: ``pathlib.Path`` of the file to (over)write.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        bool: ``True`` when the file was written, ``False`` when it was skipped.
    """
    if not url:
        print(f'  skipped {destination.name}: no download URL')
        return False

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        print(f'  skipped {destination.name}: HTTP {response.status_code}')
        return False

    destination.write_bytes(response.content)
    return True


# Download the PDF and Word version of every standard, named after its id.
for metadata_dict in metadata_list:
    print('=== Downloading ===')
    print(metadata_dict)

    # "standard_id" rather than "id" so the builtin id() is not shadowed.
    standard_id = metadata_dict['id']

    _download_file(metadata_dict['url_pdf'], pdf_path.joinpath(f'{standard_id}.pdf'))
    _download_file(metadata_dict['url_word'], word_path.joinpath(f'{standard_id}.docx'))

=== Downloading ===
{'id': 'F2016L01437', 'title': 'Banking, Insurance and Life Insurance (prudential standard) determination No. 4 of 2016 - Prudential Standard 3PS 310 - Audit and Related Matters', 'date': '2016-09-14', 'url_pdf': 'https://www.legislation.gov.au/F2016L01437/asmade/2016-09-14/text/original/pdf', 'url_word': 'https://www.legislation.gov.au/F2016L01437/asmade/2016-09-14/text/original/word', 'subtitle': 'Prudential Standard 3PS 310 Audit and Related Matters', 'doc_title': 'Prudential_Standard_3PS_310_Audit_and_Related_Matters'}
=== Downloading ===
{'id': 'F2022L01572', 'title': 'Banking (prudential standard) determination No. 16 of 2022', 'date': '2022-12-05', 'url_pdf': 'https://www.legislation.gov.au/F2022L01572/asmade/2022-12-05/text/original/pdf', 'url_word': 'https://www.legislation.gov.au/F2022L01572/asmade/2022-12-05/text/original/word', 'subtitle': 'Prudential Standard APS 310 Audit and Related Matters', 'doc_title': 'Prudential_Standard_APS_310_Audit_and_Related

In [9]:
def extract_subtitle(doc_path):
    """Return the subtitle of a .docx file.

    The subtitle is taken to be the SECOND non-empty paragraph of the document
    (index 1), with surrounding whitespace removed. Paragraphs whose text is
    empty or whitespace-only are ignored when counting.

    Args:
        doc_path: Path of the .docx file to open.

    Returns:
        The stripped text of the second non-empty paragraph, or ``None`` when
        the document has fewer than three non-empty paragraphs.
        NOTE(review): three paragraphs are required although only the second
        is read; this threshold is kept as-is, confirm whether two would do.
    """
    doc = docx.Document(doc_path)

    # Only the first three non-empty paragraphs matter, so stop scanning as
    # soon as they have been seen instead of collecting the whole document.
    leading_paragraphs = []
    for para in doc.paragraphs:
        text = para.text
        if not text.strip():
            continue
        leading_paragraphs.append(text)
        if len(leading_paragraphs) == 3:
            return leading_paragraphs[1].strip()

    return None


In [13]:
# Sanity check: show the id stored in the first metadata record.
metadata_list[0]['id']

'F2016L01437'

In [14]:
final_metadata_list = []

# Add the subtitle and a file-name-safe title to every metadata record.
for metadata_dict in metadata_list:

    # "standard_id" rather than "id" so the builtin id() is not shadowed.
    standard_id = metadata_dict['id']

    # Extract subtitle from the downloaded .docx file
    subtitle = extract_subtitle(word_path.joinpath(f'{standard_id}.docx'))

    # extract_subtitle returns None for very short documents; fall back to the
    # record's title (or an empty string) so re.sub never receives None.
    title_source = subtitle or metadata_dict.get('title') or ''

    # Replace every run of non-alphanumeric characters with ONE underscore.
    # Substituting single characters and then collapsing "__" once would still
    # leave doubled underscores behind for runs of three or more characters.
    doc_title = re.sub(r"[^A-Za-z0-9]+", "_", title_source)

    # Store metadata for reference. The record is updated in place on purpose:
    # the file-renaming cell reads 'doc_title' from metadata_list.
    metadata_dict['subtitle'] = subtitle
    metadata_dict['doc_title'] = doc_title
    final_metadata_list.append(metadata_dict)

# Persist the enriched records next to the data folder.
metadata_df = pd.DataFrame(metadata_list)
metadata_df.to_csv(root_path.joinpath('apra_standards_metadata.csv'))

In [18]:
# Give every downloaded file a descriptive name: "<id>-<doc_title>.<ext>".
for metadata_dict in metadata_list:

    # "standard_id" rather than "id" so the builtin id() is not shadowed.
    standard_id = metadata_dict['id']
    doc_title = metadata_dict['doc_title']

    # Same treatment for the Word and the PDF copy (Word first, then PDF).
    for folder, extension in ((word_path, 'docx'), (pdf_path, 'pdf')):
        downloaded_file = folder.joinpath(f'{standard_id}.{extension}')
        # Files already renamed by an earlier run no longer exist under the
        # plain id, so they are simply skipped.
        if downloaded_file.exists():
            # Path.replace overwrites an existing destination on every
            # platform; os.rename raises on Windows when the target already
            # exists, which breaks a re-download followed by a re-run.
            downloaded_file.replace(folder.joinpath(f'{standard_id}-{doc_title}.{extension}'))

In [19]:
# Remove pandas' display limits (columns, rows, cell width) for the rest of
# the session, then render the complete metadata table as the cell output.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)
metadata_df

Unnamed: 0,id,title,date,url_pdf,url_word,subtitle,doc_title
0,F2016L01437,"Banking, Insurance and Life Insurance (prudential standard) determination No. 4 of 2016 - Prudential Standard 3PS 310 - Audit and Related Matters",2016-09-14,https://www.legislation.gov.au/F2016L01437/asmade/2016-09-14/text/original/pdf,https://www.legislation.gov.au/F2016L01437/asmade/2016-09-14/text/original/word,Prudential Standard 3PS 310 Audit and Related Matters,Prudential_Standard_3PS_310_Audit_and_Related_Matters
1,F2022L01572,Banking (prudential standard) determination No. 16 of 2022,2022-12-05,https://www.legislation.gov.au/F2022L01572/asmade/2022-12-05/text/original/pdf,https://www.legislation.gov.au/F2022L01572/asmade/2022-12-05/text/original/word,Prudential Standard APS 310 Audit and Related Matters,Prudential_Standard_APS_310_Audit_and_Related_Matters
2,F2023L00160,Banking (prudential standard) determination No. 1 of 2023,2023-02-28,https://www.legislation.gov.au/F2023L00160/asmade/2023-02-28/text/original/pdf,https://www.legislation.gov.au/F2023L00160/asmade/2023-02-28/text/original/word,Prudential Standard APS 330 Public Disclosure,Prudential_Standard_APS_330_Public_Disclosure
3,F2018L00509,Banking (prudential standard) determination No.3 of 2018,2018-04-24,https://www.legislation.gov.au/F2018L00509/asmade/2018-04-24/text/original/pdf,https://www.legislation.gov.au/F2018L00509/asmade/2018-04-24/text/original/word,Prudential Standard APS 330 Public Disclosure,Prudential_Standard_APS_330_Public_Disclosure
4,F2023L01535,"Banking, Insurance, Life Insurance and Health Insurance (prudential standard) determination No. 1 of 2023",2023-11-22,https://www.legislation.gov.au/F2023L01535/asmade/2023-11-22/text/original/pdf,https://www.legislation.gov.au/F2023L01535/asmade/2023-11-22/text/original/word,Prudential Standard CPS 510 Governance,Prudential_Standard_CPS_510_Governance
5,F2023L01348,"Banking, Insurance, Life Insurance, Health Insurance and Superannuation (prudential standard) determination No. 3 of 2023",2023-09-29,https://www.legislation.gov.au/F2023L01348/asmade/2023-09-29/text/original/pdf,https://www.legislation.gov.au/F2023L01348/asmade/2023-09-29/text/original/word,Prudential Standard CPS 511 Remuneration,Prudential_Standard_CPS_511_Remuneration
6,F2018L01390,"Banking, Insurance, Life Insurance and Health Insurance (prudential standard) determination No.2 of 2018",2018-10-02,https://www.legislation.gov.au/F2018L01390/asmade/2018-10-02/text/original/pdf,https://www.legislation.gov.au/F2018L01390/asmade/2018-10-02/text/original/word,Prudential Standard CPS 520 Fit and Proper,Prudential_Standard_CPS_520_Fit_and_Proper
7,F2019L00669,"Banking, Insurance, Life Insurance and Health Insurance (prudential standard) determination No.1 of 2019",2019-05-07,https://www.legislation.gov.au/F2019L00669/asmade/2019-05-07/text/original/pdf,https://www.legislation.gov.au/F2019L00669/asmade/2019-05-07/text/original/word,Prudential Standard CPS 220 Risk Management,Prudential_Standard_CPS_220_Risk_Management
8,F2022L01576,Banking (prudential standard) determination No. 14 of 2022,2022-12-05,https://www.legislation.gov.au/F2022L01576/asmade/2022-12-05/text/original/pdf,https://www.legislation.gov.au/F2022L01576/asmade/2022-12-05/text/original/word,Prudential Standard APS 220 Credit Risk Management,Prudential_Standard_APS_220_Credit_Risk_Management
9,F2022L00217,Financial Sector (Collection of Data) (reporting standard) determination No. 5 of 2022,2022-03-01,https://www.legislation.gov.au/F2022L00217/asmade/2022-03-01/text/original/pdf,https://www.legislation.gov.au/F2022L00217/asmade/2022-03-01/text/original/word,Reporting Standard ARS 220.0 Credit Quality,Reporting_Standard_ARS_220_0_Credit_Quality
