## Evaluate latest files (for Personal only)

In [0]:
dbutils.fs.ls('dbfs:/nab_demo/input_files_v2/extracted/nabrwd/en/personal')

[FileInfo(path='dbfs:/nab_demo/input_files_v2/extracted/nabrwd/en/personal/FormNPS.html', name='FormNPS.html', size=240976, modificationTime=1690184441000),
 FileInfo(path='dbfs:/nab_demo/input_files_v2/extracted/nabrwd/en/personal/_jcr_content/', name='_jcr_content/', size=0, modificationTime=1690190100424),
 FileInfo(path='dbfs:/nab_demo/input_files_v2/extracted/nabrwd/en/personal/accounts/', name='accounts/', size=0, modificationTime=1690190100424),
 FileInfo(path='dbfs:/nab_demo/input_files_v2/extracted/nabrwd/en/personal/accounts.html', name='accounts.html', size=424230, modificationTime=1690184643000),
 FileInfo(path='dbfs:/nab_demo/input_files_v2/extracted/nabrwd/en/personal/buy-now-pay-later/', name='buy-now-pay-later/', size=0, modificationTime=1690190100424),
 FileInfo(path='dbfs:/nab_demo/input_files_v2/extracted/nabrwd/en/personal/buy-now-pay-later.html', name='buy-now-pay-later.html', size=426426, modificationTime=1690184442000),
 FileInfo(path='dbfs:/nab_demo/input_files_

In [0]:
#Copy the latest set of files received from NAB to local folder for processing 
#We are only focusing on (Personal) and later we can expand it with other files

%cp '/dbfs/nab_demo/input_files_v2/extracted/nabrwd/en/personal' personal_v2 --recursive

## HTML Parsing for NAB Files

In [0]:
%ls

 [0m[01;32mHtml_Parsers[0m*   [01;32mREADME.md[0m*              [01;32mWeb_Crawl_Nab_Au_V02[0m*
[34;42m'Local Dev'[0m/     [01;32mWeb_Crawl_Nab_Au_V01[0m*   [34;42mpersonal[0m/


## Evaluate input filepath and hierarchy

This is just a temporary script to evaluate how each files are presented

In [0]:
import os 

file_list = []

for root, _, files in os.walk('personal_v2'):
  for filename in files:
    #Remove any files that are hidden
    if filename.startswith("."):
      continue
    else:
      html_file_path = os.path.join(root,filename)
      output_filename = f'parsed-v2-{filename}'
      output_path = os.path.join(root,output_filename)
      print(output_path)
      file_list.append(filename)

print(f'Total files present : {len(file_list)}')

personal_v2/parsed-v2-FormNPS.html
personal_v2/parsed-v2-accounts.html
personal_v2/parsed-v2-buy-now-pay-later.html
personal_v2/parsed-v2-calculators-and-financial-tools.html
personal_v2/parsed-v2-credit-cards.html
personal_v2/parsed-v2-home-loans.html
personal_v2/parsed-v2-insurance.html
personal_v2/parsed-v2-interest-rates-fees-and-charges.html
personal_v2/parsed-v2-international-banking.html
personal_v2/parsed-v2-life-moments.html
personal_v2/parsed-v2-mobile-onboarding-offers.html
personal_v2/parsed-v2-online-banking.html
personal_v2/parsed-v2-personal-loans.html
personal_v2/parsed-v2-private-wealth.html
personal_v2/parsed-v2-super-and-investments.html
personal_v2/parsed-v2-youth-banking.html
personal_v2/_jcr_content/root/banner/image.coreimg.90.2500.jpeg/1689739621490/parsed-v2-man-enjoying-in-swimming-pool-banner-3000x1000.jpeg
personal_v2/_jcr_content/root/responsivegrid_2088157678/section_container_co/parsed-v2-responsivegrid.html
personal_v2/accounts/parsed-v2-cheque-payments.

### HTML Parsing via Beautiful Soup

There are few duplicates across all files in terms of repeat hyperlinks. The logic of the code is something below:
1. Traverse all directories
2. Load each html file inside the directory. 
3. Remove any hyperlinks (Note: Hyperlink section is repititive in each file related to other page buttons and references)
4. Remove any special characters
5. Preserve anynewline characters
6. Remove images and embedded objects (if any) (Majority looked as logos)
7. Store the clean file as an html object with 'utf-8' format.
8. Store each parse file in "html" format in the respective repository next to the input file

The python library "BeautifulSoup" and "re" is used for the parsing.

In [0]:
from bs4 import BeautifulSoup
import re

def remove_hyperlinks(soup):
    # Find all anchor tags (hyperlinks) in the HTML content
    for a_tag in soup.find_all('a'):
        # Remove the entire anchor tag (hyperlink element) from the HTML content
        a_tag.extract()

def remove_special_characters_and_spaces(text):
    # Remove special characters using regular expressions
    cleaned_text = re.sub(r'^[\s•·●▪◦▸\-–—‣⁃▹*]+(?=\s*\d*[.,]?\s*)', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    
    return cleaned_text

def preserve_newlines(soup):
    # Preserve newlines within specific tags that may use newlines for formatting
    for tag in soup.find_all(['pre', 'textarea']):
        tag.insert_before('\n')
        tag.insert_after('\n')
        tag.unwrap()
  
def remove_images_and_objects(soup):
    # Find all mage tags (img) and document tags (embed, object, iframe) and remove them from the HTML content
    for doc_tag in soup.find_all(['img','embed', 'object', 'iframe']):
        doc_tag.extract()

def parse_html_file_sample(file_path,output_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(content, 'html.parser')

    # Step 1: Remove hyperlinks from the HTML content
    remove_hyperlinks(soup)

    # Step 2: Get the updated HTML content without hyperlinks
    html_without_hyperlinks = soup.prettify()

   # Step 3:  Remove special characters from the HTML content
    clean_html_content = remove_special_characters_and_spaces(html_without_hyperlinks)

  # Step 4: Remove images and embedded documents from the HTML content
    soup_without_images_or_documents = BeautifulSoup(clean_html_content, 'html.parser')
    remove_images_and_objects(soup_without_images_or_documents)

 # Preserve newlines within specific tags (pre, textarea) that may use newlines for formatting
    preserve_newlines(soup_without_images_or_documents)

  # Step 5: Remove newline characters from the HTML content
    # final_cleaned_html = remove_newline_characters(soup_without_images_or_documents.get_text())
    final_cleaned_html = soup_without_images_or_documents.get_text()

  # Save the updated HTML content to a new file in the local repository
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(final_cleaned_html)

In [0]:
## Execution 
for root, _, files in os.walk('nab_personal_banking_file_set'):
  for filename in files:
    if filename.endswith("html"):
      html_file_path = os.path.join(root,filename)
      output_filename = f'parsed-v3-{filename}'
      output_path = os.path.join(root,output_filename)
      print(f'The output file name is : {output_path}')
      parse_html_file_sample(html_file_path,output_path)


The output file name is : nab_personal_banking_file_set/parsed-v3-personal-loans.html
The output file name is : nab_personal_banking_file_set/parsed-v3-insurance.html
The output file name is : nab_personal_banking_file_set/parsed-v3-private-wealth.html
The output file name is : nab_personal_banking_file_set/parsed-v3-parsed-v2-personal-loans.html
The output file name is : nab_personal_banking_file_set/parsed-v3-parsed-v2-insurance.html
The output file name is : nab_personal_banking_file_set/parsed-v3-parsed-v2-private-wealth.html
The output file name is : nab_personal_banking_file_set/life-moments/family/buy-car/parsed-v3-car-finance-options.html
The output file name is : nab_personal_banking_file_set/life-moments/family/buy-car/parsed-v3-parsed-v2-car-finance-options.html
The output file name is : nab_personal_banking_file_set/life-moments/family/start-family/parsed-v3-maternity-leave.html
The output file name is : nab_personal_banking_file_set/life-moments/family/start-family/parsed-

## Move to DBFS

In [0]:
%cp  nab_personal_banking_file_set/  '/dbfs/nab_demo/' --recursive

In [0]:
dbutils.fs.ls('dbfs:/nab_demo/nab_personal_banking_file_set/buy-now-pay-later/terms-conditions/')

[FileInfo(path='dbfs:/nab_demo/nab_personal_banking_file_set/buy-now-pay-later/terms-conditions/parsed-v2-resolve-issues.html', name='parsed-v2-resolve-issues.html', size=19468, modificationTime=1689909480000),
 FileInfo(path='dbfs:/nab_demo/nab_personal_banking_file_set/buy-now-pay-later/terms-conditions/resolve-issues.html', name='resolve-issues.html', size=410200, modificationTime=1689909479000)]

In [0]:
%ls /dbfs/nab_demo

[0m[34;42mattachments[0m/  [34;42mnab_attachments[0m/  [34;42mnab_pages[0m/  [34;42mpages[0m/


## Section 2 - LATEST Modified Script to return a pandas dataframe

In [0]:
#Create exclusion list to manually remove the common elements from each files
#This includes both header and footer columns

header_excl_part_1 = "NAB Search Search nab.com.au Search nab.com.au Latest offers close notification" 
header_excl_part_2 = "Update your browser"
header_excl_part_3 = "NAB Mobile Banking app Update your browser." 
header_excl_part_4 = "This website doesn't support your browser and may impact your experience."

header_excl_part_5 = "How was your visit to the NAB website? We’d love to hear from you. New NAB Now Pay Later Link your NAB Classic Banking account to NAB Now Pay Later to split your purchases into four simple repayments and pay no interest or fees. Offer Up to $300 cash back Looking for a credit card that not only offers a low interest rate but provides up to $300 cash back? Offer applies to a new NAB Low Rate Card . $100 cash back per month for the first three months from account opening when you spend $500 per month on purchases. Awarded monthly based on statement period and credited on closing date of statement. Purchases must be processed and charged in the relevant month. Excludes gambling/gaming related transactions. Offer may vary or end at any time. Not available when closing or transferring from another NAB credit card or with other NAB card offer. View calculators $0 international transfer fee Transfer your money securely overseas using NAB Internet Banking or the NAB app. $0 transfer fee when sending in a foreign currency. Discounts and benefits With a NAB business transaction account, you can enjoy discounts, benefits and offers from our partners. Get a fast, simple unsecured loan with NAB QuickBiz. No physical assets required for security and fast access to funds. It's easy to apply online and you'll receive an instant decision. Offer 100,000 NAB Rewards bonus points Earn 100,000 NAB Rewards Bonus Points when you spend $4,000 on everyday business purchases within 60 days of your account opening. Terms and conditions apply. Take payments with NAB Easy Tap Download the NAB Easy Tap app to your Android device for a low-cost, simple and easy way to take contactless card payments. Related tools and help NAB Connect NAB Connect is a powerful online banking solution that offers your business the flexibility of multiple users, advanced reporting and much more. Business product selection, made easy. Explore multiple products all in one place with Small Biz Explorer. The Morning Call Podcast Start your day with the NAB Morning Call Podcast, for the latest overnight key economic and market information straight from our team of experts. The Morning Call Podcast Start your day with the NAB Morning Call Podcast, for the latest overnight key economic and market information straight from our team of experts. More about sustainability Fraud and scams support Troubleshooting guides Fraud and scams support Search nab.com.au Login Internet Banking"

footer_exclusion_blurb = "Any advice on our website has been prepared without considering your objectives, financial situation or needs. Before acting on any advice, consider whether it is appropriate for your circumstances and view the Product Disclosure Statement or Terms and Conditions available online or by contacting us. Credit applications are subject to credit assessment criteria. Interest rates, fees and charges are subject to change. Target Market Determinations for our products are available at . Products issued by NAB unless stated otherwise. © National Australia Bank Limited ABN 12 004 044 937 AFSL and Australian Credit Licence 230686"


In [0]:
#Code to parse the html files  and create a clean set of files to be loaded as a delta table

from bs4 import BeautifulSoup
import re

def remove_hyperlinks(soup):
    # Find all anchor tags (hyperlinks) in the HTML content
    for a_tag in soup.find_all('a'):
        # Remove the entire anchor tag (hyperlink element) from the HTML content
        a_tag.extract()

def remove_special_characters_and_spaces(text):
    # Remove special characters using regular expressions
    cleaned_text = re.sub(r'^[\s•·●▪◦▸\-–—‣⁃▹*]+(?=\s*\d*[.,]?\s*)', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub("\n\n+",' ', cleaned_text )
    
    return cleaned_text

def preserve_newlines(soup):
    # Preserve newlines within specific tags that may use newlines for formatting
    for tag in soup.find_all(['pre', 'textarea']):
        tag.insert_before('\n')
        tag.insert_after('\n')
        tag.unwrap()
  
def remove_images_and_objects(soup):
    # Find all mage tags (img) and document tags (embed, object, iframe) and remove them from the HTML content
    for doc_tag in soup.find_all(['img','embed', 'object', 'iframe']):
        doc_tag.extract()

def remove_specific_text(passed_string , blurb_to_remove):
  #Find and remove the blurb of text from all files
  cleaned_text = passed_string.replace(blurb_to_remove ,'')
  
  return cleaned_text

def parse_html_file_sample(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read() 

     # Remove non-UTF-8 characters
    content = content.encode('utf-8', 'ignore').decode('utf-8')

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(content, 'html.parser')

    # Step 1: Remove hyperlinks from the HTML content
    remove_hyperlinks(soup)

    # Step 2: Get the updated HTML content without hyperlinks
    html_without_hyperlinks = soup.prettify()

   # Step 3:  Remove special characters from the HTML content
    clean_html_content = remove_special_characters_and_spaces(html_without_hyperlinks)

  # Step 4: Remove images and embedded documents from the HTML content
    soup_without_images_or_documents = BeautifulSoup(clean_html_content, 'html.parser')
    remove_images_and_objects(soup_without_images_or_documents)

 # Preserve newlines within specific tags (pre, textarea) that may use newlines for formatting
    preserve_newlines(soup_without_images_or_documents)

  # Step 5: Remove newline characters from the HTML content
    # final_cleaned_html = remove_newline_characters(soup_without_images_or_documents.get_text())
    final_cleaned_html = soup_without_images_or_documents.get_text()

  #Step 6: Remove any additional special characters from the text string 
    without_special = re.sub("\s+",' ', final_cleaned_html )

    # Remove the common text blurb across all files
    for i in [header_excl_part_1,header_excl_part_2,header_excl_part_3,header_excl_part_4,header_excl_part_5,footer_exclusion_blurb]:
      without_special = remove_specific_text(without_special,i)
      
  # # Save the updated HTML content to a new file in the local repository
  #   with open(output_path, 'w', encoding='utf-8') as output_file:
  #       output_file.write(final_cleaned_html)

    # return cleaned_html
    return without_special

### Execution of the code

In [0]:
import pandas as pd
import os

final_result = []
final_filename = []

# Execution 
for root, _, files in os.walk('personal_v2'):
  for filename in files:
    if filename.endswith("html"):
      #Input html file 
      html_file_path = os.path.join(root,filename)
      print(f"input file location is : {html_file_path}")

      # output_path = os.path.join(root,output_filename)
      # print(f'The output file name is : {output_path}')

      parsed_result = parse_html_file_sample(html_file_path)
      final_result.append(parsed_result)
      final_filename.append(filename)

#Create a single pandas dataframe with filename and 
Parsed_output_v2 = pd.DataFrame({'filename':final_filename,'parsed_results':final_result}) 

input file location is : personal_v2/FormNPS.html
input file location is : personal_v2/accounts.html
input file location is : personal_v2/buy-now-pay-later.html
input file location is : personal_v2/calculators-and-financial-tools.html
input file location is : personal_v2/credit-cards.html
input file location is : personal_v2/home-loans.html
input file location is : personal_v2/insurance.html
input file location is : personal_v2/interest-rates-fees-and-charges.html
input file location is : personal_v2/international-banking.html
input file location is : personal_v2/life-moments.html
input file location is : personal_v2/mobile-onboarding-offers.html
input file location is : personal_v2/online-banking.html
input file location is : personal_v2/personal-loans.html
input file location is : personal_v2/private-wealth.html
input file location is : personal_v2/super-and-investments.html
input file location is : personal_v2/youth-banking.html
input file location is : personal_v2/_jcr_content/root

### Sample Validation of Parsed Files

In [0]:
Parsed_output_v2

Unnamed: 0,filename,parsed_results
0,FormNPS.html,FormNPS - NAB Search Search nab.com.au Search...
1,accounts.html,"Bank accounts | Savings, transaction and term..."
2,buy-now-pay-later.html,NAB Now Pay Later | Buy now pay later in four...
3,calculators-and-financial-tools.html,Calculators and Financial Tools - Home Loan C...
4,credit-cards.html,"Credit Cards | Compare our range, fees and ap..."
...,...,...
540,first-debit-card.html,How to use a debit card | Banking for teens -...
541,nab-straightup-card.html,No interest credit card | NAB StraightUp Card...
542,nab-low-rate-card.html,Low rate credit card | Balance transfer or ca...
543,first-home-buyers.html,"First Home Buyers | Special offers, home loan..."


In [0]:
Parsed_output_v2[Parsed_output_v2["filename"] == 'nab-straightup-card.html']['parsed_results'].tolist()[0]

" No interest credit card | NAB StraightUp Card -  Notification: NAB Mobile Banking app .   NAB StraightUp Card NAB StraightUp Card No interest, late payment fees or international transaction fees. Just a simple monthly fee. Apply online in 15 minutes and get a response in 60 seconds. What is the NAB StraightUp Card? Unlike most credit cards, the NAB StraightUp Card comes with zero interest or other charges – just a simple monthly fee and minimum monthly payment based on your credit limit. You’ll also enjoy no fees on international purchases and you can use it anywhere Visa is accepted. The NAB StraightUp Card gives you a set amount of funds (called a 'credit limit’) which you can borrow from at any time. How much credit is available to you will vary depending on how much of your credit limit you have used by making purchases (this is also known as your ‘available balance’). . NAB StraightUp Card features Designed with simplicity and transparency in mind, here are some of the great ben

In [0]:
Parsed_output_v2[Parsed_output_v2["filename"] == 'first-home-buyers.html']['parsed_results'].tolist()[0]

" First Home Buyers | Special offers, home loans, tools and guides -  Notification: NAB Mobile Banking app .   Buying your first home Buying your first home Buying a first home is a huge step. From saving for a house deposit to searching for the perfect property, we can help you get there. Explore your options with a NAB home lending specialist. Why choose NAB to buy your first home NAB has the support and expertise that can help you purchase your first home sooner. Buy or build your first home sooner NAB supports eligible buyers to access the Family Home Guarantee with a deposit as low as 2%. Or, buy or build your first home with a deposit of 5% under the First Home Guarantee or Regional First Home Buyer Guarantee. Home buyer guides Make use of our expertise and support for first home buyers. With our helpful guides about the home buying process, you'll be ready to purchase your first home in no time. Support at every step Purchasing your first home is exciting and we want to be part 

In [0]:
Parsed_output_v2[Parsed_output_v2["filename"] == 'nab-low-rate-card.html']['parsed_results'].tolist()[0]

" Low rate credit card | Balance transfer or cash back offer -  Notification: NAB Mobile Banking app .   NAB Low Rate Credit Card NAB Low Rate Card A simple low interest rate credit card with your choice of offers: get a promotional balance transfer or select our cash back offer. Apply online in 15 minutes and get a response in 60 seconds. Rates and fees Learn more about our standard interest rates and minimum credit limits. Variable purchase rate 12.49% p.a. Interest free days on purchases Up to 55 Annual card fee $59 p.a. Minimum credit limit $1,000 Variable cash advance rate This is the interest rate charged on amounts you withdraw as cash, gambling transactions (including lottery ticket purchases), or transfer from your credit card to another account. 21.74% p.a. Choose one of our available credit card offers Your choice of offers to suit your credit needs, select either our promotional balance transfer offer or our cash back offer. Offer Promotional 0% p.a. balance transfer (BT) f

In [0]:
Parsed_output_v2[Parsed_output_v2['filename'].str.contains('card-based')]['parsed_results'].tolist()[0]

" Electronic banking - card based terms and conditions -  .  Notification: NAB Mobile Banking app  Card based electronic banking NAB Electronic Banking (card based) terms and conditions Effective 01.03.20 Contents Summary of important information Customer Service You can contact us by: calling 13 BANK ( ) and select the option to speak to a Customer Service Representative; visiting our website ; hearing impaired customers with telephone typewriters can contact us via the National Relay Service on . Lost/stolen cards/compromised Personal Identification Numbers (PINs) If your card has been lost/stolen and/or your PIN is compromised or divulged you must immediately notify NAB and provide any information about how the loss/incident occurred by calling one of the following numbers, or using any other means NAB makes available to you. Within Australia Freecall 24 hours, 7 days a week - toll free If Overseas International code Unauthorised transactions If you believe there has been an unautho

### Convert dataframe to spark and store it as a Delta table

In [0]:
from pyspark.sql.functions import *

#Convert the pandas dataframe into a spark datadframe
parsed_spark_df_v2 = spark.createDataFrame(Parsed_output_v2)

# #Save the pyspark dataframe as a delta table in unity catalog 
# parsed_spark_df_v2.write.mode("overwrite").saveAsTable("nab_llm_demo.nab_website_data.parsed_dataset_v2")

#Create a new column to determine the length of the strings
parsed_spark_df_v2 = parsed_spark_df_v2 \
                      .withColumn("length_of_string", length(col('parsed_results'))) 

#Save the pyspark dataframe as a delta table in unity catalog 
parsed_spark_df_v2.write.mode("overwrite").saveAsTable("nab_llm_demo.nab_website_data.parsed_dataset_v2")

print(f'Length of dataframe : {parsed_spark_df_v2.count()}')
parsed_spark_df_v2.orderBy('length_of_string',ascending=False).show(20)

Length of dataframe : 541
+--------------------+--------------------+----------------+
|            filename|      parsed_results|length_of_string|
+--------------------+--------------------+----------------+
|  general-terms.html| General terms fo...|           44379|
|card-based-electr...| Electronic banki...|           41383|
|home-loan-interes...| Home loan intere...|           39644|
|stamp-duty-calcul...| Stamp Duty Calcu...|           34601|
|equity-loan-calcu...| Equity Calculato...|           33546|
|how-things-work.html| Interest, paymen...|           31853|
|nab-qantas-credit...| NAB Qantas credi...|           29490|
|all-home-loan-int...| See all our home...|           29092|
|       managing.html| Managing and res...|           24641|
|digital-card-term...| Important terms ...|           22183|
|how-things-work.html| NAB StraightUp C...|           20964|
|       managing.html| Managing your NA...|           20728|
|nab-choice-packag...| Choice Package |...|           20366

## Alternate code to read html directly and parse based on section_break

In [0]:
from pyspark.sql.functions import split, regexp_replace, explode, posexplode

#Pass it to a temporary dataframe
temp_df = parsed_spark_df_v2

#Split the filename column to array based on "section_break"
df2 = temp_df.select("filename", split(temp_df.parsed_results,"section_break",-1).alias("array_split"))

#Explode the array to rows 
df3 = df2.select("filename", posexplode("array_split"))

#Define section ID for each row
final_df = df3.select("filename", df3.pos.alias("segment_id"), regexp_replace("col","\n","").alias("segment_text"))

#Save the pyspark dataframe as a delta table in unity catalog 
final_df.write.mode("overwrite").saveAsTable("nab_llm_demo.nab_website_data.parsed_dataset_v3")
final_df.show()

+--------------------+----------+--------------------+
|            filename|segment_id|        segment_text|
+--------------------+----------+--------------------+
|        FormNPS.html|         0| FormNPS - NAB Se...|
|       accounts.html|         0| Bank accounts | ...|
|buy-now-pay-later...|         0| NAB Now Pay Late...|
|calculators-and-f...|         0| Calculators and ...|
|   credit-cards.html|         0| Credit Cards | C...|
|     home-loans.html|         0| Home loans | Vie...|
|      insurance.html|         0| Personal insuran...|
|interest-rates-fe...|         0| Personal banking...|
|international-ban...|         0| International ba...|
|   life-moments.html|         0| Life moments | P...|
|mobile-onboarding...|         0| Moboarding - NAB...|
| online-banking.html|         0| Online banking |...|
| personal-loans.html|         0| Personal Loans |...|
| private-wealth.html|         0| NAB Private Weal...|
|super-and-investm...|         0| Superannuation, ...|
|  youth-b

### Archive

In [0]:
# text_blurb_to_remove = "NAB Search Search nab.com.au Search nab.com.au Latest offers close notification Notification: NAB Mobile Banking app Update your browser. This website doesn't support your browser and may impact your experience. How was your visit to the NAB website? We’d love to hear from you. New NAB Now Pay Later Link your NAB Classic Banking account to NAB Now Pay Later to split your purchases into four simple repayments and pay no interest or fees. Offer Up to $300 cash back Looking for a credit card that not only offers a low interest rate but provides up to $300 cash back? Offer applies to a new NAB Low Rate Card . $100 cash back per month for the first three months from account opening when you spend $500 per month on purchases. Awarded monthly based on statement period and credited on closing date of statement. Purchases must be processed and charged in the relevant month. Excludes gambling/gaming related transactions. Offer may vary or end at any time. Not available when closing or transferring from another NAB credit card or with other NAB card offer. View calculators $0 international transfer fee Transfer your money securely overseas using NAB Internet Banking or the NAB app. $0 transfer fee when sending in a foreign currency. Discounts and benefits With a NAB business transaction account, you can enjoy discounts, benefits and offers from our partners. Get a fast, simple unsecured loan with NAB QuickBiz. No physical assets required for security and fast access to funds. It's easy to apply online and you'll receive an instant decision. Offer 100,000 NAB Rewards bonus points Earn 100,000 NAB Rewards Bonus Points when you spend $4,000 on everyday business purchases within 60 days of your account opening. Terms and conditions apply. Take payments with NAB Easy Tap Download the NAB Easy Tap app to your Android device for a low-cost, simple and easy way to take contactless card payments. Related tools and help NAB Connect NAB Connect is a powerful online banking solution that offers your business the flexibility of multiple users, advanced reporting and much more. Business product selection, made easy. Explore multiple products all in one place with Small Biz Explorer. The Morning Call Podcast Start your day with the NAB Morning Call Podcast, for the latest overnight key economic and market information straight from our team of experts. The Morning Call Podcast Start your day with the NAB Morning Call Podcast, for the latest overnight key economic and market information straight from our team of experts. More about sustainability Fraud and scams support Troubleshooting guides Fraud and scams support Search nab.com.au Login Internet Banking"

In [0]:
# import re

# # sample_string = Parsed_output[Parsed_output['filename'] == 'after-auction.html']['parsed_results'].tolist()[0]
# sample_string = Parsed_output_v2[Parsed_output_v2["filename"] == 'nab-straightup-card.html']['parsed_results'].tolist()[0]
# cleanString = re.sub("\n\n+",' ', sample_string )

# #Define the population string
# population = re.sub("\s+",' ', cleanString )
# # sample = re.sub("\s+",' ', text_blurb_to_remove )

# cleaned_output = population.replace(exclusion_blurb ,'')
# print(f"""length of the string : {len(cleaned_output)}""")
# cleaned_output