# Web crawler
### This will extract the constituents list of the ASX Gold Companies, fetch market capitalisation and website addresses before searching for ASX announcements relating to exploration programs.
### Brendan Garner - April 2021

The web crawler will use a combination of Selenium and Gazpacho to extract the information from the internet. The information will be stored in a Pandas DataFrame.

#### Import the required modules and methods

In [3]:
import re
import time
import pandas as pd
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import InvalidSelectorException
from gazpacho import Soup, get

#### Install the Selenium WebDriver for Python from https://www.selenium.dev/downloads/ and a driver for the browser of choice. The driver for Firefox can be downloaded from https://github.com/mozilla/geckodriver/releases

In [4]:
def open_browser():
    options = Options()
    options.headless = True
    browser = Firefox(executable_path=r"C:\Program Files\Mozilla Firefox\geckodriver.exe", options=options)

    return browser


def get_url(browser_instance, url, delay):
    browser_instance.get(url)
    time.sleep(delay)

    return browser_instance

#### The constituents of the ASX Gold Companies can be extracted from https://www.listcorp.com/asx/sectors/materials/materials/metals-mining/gold

In [30]:
# get constituent list of ASX Gold Companies
url = "https://www.listcorp.com/asx/sectors/materials/materials/metals-mining/gold"
browser_instance = open_browser()
browser = get_url(browser_instance, url, 3)

# scrape web page
soup = Soup(browser.page_source)

# find hyperlinks on the page with the HTML <a> tag
tags = soup.find("a")

# only keep tags that include an ASX code
tags = [str(tag) for tag in tags if 'ASX:' in str(tag)]

# extract all ASX codes from tags
asx_codes = [tag.split('ASX:', 1)[1][0:3] for tag in tags]
print("ASX codes extracted for gold companies")
for asx_code in asx_codes:
    print(asx_code)

ASX codes extracted for gold companies
NCM
NST
EVN
CHN
DEG
RRL
SLR
PRU
RMS
SBM
GOR
WGX
WAF
BGL
RSG
AGG
ADN
RED
EMR
ALK
SSR
DCN
PNR
TBR
DGO
FFX
CYL
ORR
OBM
KCN
MML
CAI
OGC
IXR
AUT
BDC
TIE
GCY
BCN
TGM
WMX
GMD
BSX
TTM
WWI
RTG
BBX
TSO
AOP
RND
PDI
GWR
TUL
BRV
TAM
BC8
HMX
OKU
BRB
VAN
AAR
CAY
HRZ
ARL
AUC
DGR
NUS
MAN
SIH
FML
TRY
GML
KLA
GBZ
STN
E2M
AME
PEC
DRE
YRL
HRN
KRM
ERM
TNR
VKA
AGS
DTR
GME
AQI
BNZ
CTO
NAG
KAU
ODY
GMN
SMG
AWV
BSR
ONX
GMR
ARE
TAR
A1G
CGN
VEC
PRX
MKG
ICL
MAT
SNG
LNY
RGL
NXM
KWR
MM8
HAW
OAU
SAU
DCX
MSR
NPM
REZ
ZAG
AAU
MDI
KTA
CY5
ANL
SMI
BTR
NSM
GUL
DTM
PUA
LLO
AUN
BNR
ODM
GBR
NES
KGM
AGC
MRZ
GSM
ALY
MLS
M2R
AVW
PVW
BDG
OKR
OZM
CDT
GED
STK
NTL
PGD
AAJ
MOH
TMX
KGD
AWJ
SFM
MEG
WSR
DMG
MDX
CVS
GCR
ATM
CGM
A8G
CSM
FG1


#### Extract web Address and other information for each company from the ASX

In [33]:
# get market cap data from the ASX
full_names = []
market_caps = []
web_addresses = []
for code in asx_codes:
    url = 'https://www.asx.com.au/asx/1/company/' + code + '?fields=primary_share'
    html = get(url)
    full_name = html.get('name_full')
    web_address = html.get('web_address')
    market_cap_dict = html.get('primary_share')
    market_cap = market_cap_dict.get('market_cap')
    full_names.append(full_name)
    market_caps.append(market_cap)
    web_addresses.append(web_address)
    print(code, full_name, web_address)  # sorted by market capitalisation

NCM NEWCREST MINING LIMITED http://www.newcrest.com.au/
NST NORTHERN STAR RESOURCES LTD http://www.nsrltd.com
EVN EVOLUTION MINING LIMITED http://www.evolutionmining.com.au
CHN CHALICE MINING LIMITED http://www.chalicemining.com
DEG DE GREY MINING LIMITED http://www.degreymining.com.au
RRL REGIS RESOURCES LIMITED http://www.regisresources.com/
SLR SILVER LAKE RESOURCES LIMITED http://www.silverlakeresources.com.au
PRU PERSEUS MINING LIMITED http://www.perseusmining.com/
RMS RAMELIUS RESOURCES LIMITED http://www.rameliusresources.com.au
SBM ST BARBARA LIMITED http://www.stbarbara.com.au/
GOR GOLD ROAD RESOURCES LIMITED http://www.goldroad.com.au/
WGX WESTGOLD RESOURCES LIMITED. http://www.westgold.com.au
WAF WEST AFRICAN RESOURCES LIMITED http://www.westafricanresources.com
BGL BELLEVUE GOLD LIMITED https://www.bellevuegold.com.au
RSG RESOLUTE MINING LIMITED http://www.rml.com.au
AGG ANGLOGOLD ASHANTI LIMITED http://www.anglogoldashanti.com
ADN ANDROMEDA METALS LIMITED http://www.androm

GSM GOLDEN STATE MINING LIMITED http://goldenstatemining.com.au/
ALY ALCHEMY RESOURCES LIMITED http://www.alchemyresources.com.au
MLS METALS AUSTRALIA LTD http://www.metalsaustralia.com.au
M2R MIRAMAR RESOURCES LIMITED https://www.miramarresources.com.au
AVW AVIRA RESOURCES LTD http://www.mgt.net.au
PVW PVW RESOURCES LIMITED None
BDG BLACK DRAGON GOLD CORP. http://www.blackdragongold.com
OKR OKAPI RESOURCES LIMITED http://www.okapiresources.com
OZM OZAURUM RESOURCES LIMITED https://ozaurumresources.com/
CDT CASTLE MINERALS LIMITED http://www.castleminerals.com
GED GOLDEN DEEPS LIMITED. http://www.goldendeeps.com
STK STRICKLAND METALS LIMITED http://www.alloyres.com
NTL NEW TALISMAN GOLD MINES LIMITED http://www.newtalismangoldmines.co.nz
PGD PEREGRINE GOLD LTD https://peregrinegold.com.au/
AAJ ARUMA RESOURCES LIMITED http://www.arumaresources.com
MOH MOHO RESOURCES LIMITED http://www.mohoresources.com.au
TMX TERRAIN MINERALS LIMITED http://www.terrainminerals.com.au
KGD KULA GOLD LIMIT

#### Store this information in a Pandas DataFrame

In [34]:
data = {'ASX code':  asx_codes, 'Full name': full_names, 'Market cap': market_caps, 'Web address': web_addresses}
gold_companies = pd.DataFrame(data, columns=['ASX code', 'Full name', 'Market cap', 'Web address'])
# gold_companies = gold_companies[gold_companies['Market cap'] < 1000000000].reset_index(drop=True)  # not used - more data, more better!

#### Not all web addresses can be retrieved from the ASX. Google can supply the rest. A headless browser (not visible) is used

In [37]:
missed_web_addresses = gold_companies[gold_companies['Web address'].isnull()]['Full name']
driver = open_browser()
web_addresses = []
for company in missed_web_addresses:
    url = "https://www.google.com"
    browser = get_url(driver, url, 3)

    # enter the company name into a google search bar and press enter
    browser.find_element(By.NAME, "q").send_keys(company + Keys.ENTER)

    # wait until the page is loaded before attempting to find an element
    wait = WebDriverWait(browser, 3)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'g')))

    # get the search results and append to a list
    searchResults = browser.find_elements_by_class_name('g')
    top_result = searchResults[0].find_element_by_css_selector('a').get_attribute('href')
    web_addresses.append(top_result)

browser.quit()

#### Add the web addresses obtained from Google to the existing DataFrame and save to CSV for review 

In [40]:
missed = gold_companies.loc[gold_companies['Full name'].isin(missed_web_addresses)]
missed['Web address'] = web_addresses
gold_companies.loc[gold_companies['Full name'].isin(missed['Full name']), 'Web address'] = missed['Web address']
gold_companies.to_csv('gold_companies.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Some functions are defined for the purposes of web scraping. The first function will attempt to get a response from the website

In [5]:
def get_web_address(web_address):
    try:
        # print(web_address)
        response = get(web_address)
    except:
        try:
            web_address = web_address.split('www.', 1)[0] + web_address.split('www.', 1)[1]
        except:
            return None

        try:
            response = get(web_address)

        except HTTPError as error:
            response = requests.get(web_address)
            content = str(response.content)
            links = re.findall(r'(https?://\S+)', content)
            domain = web_address.split('//', 1)[1]
            links = [link for link in links if '.com' in link and 'google' not in link and domain not in link]
            links = [re.findall(r"\w+://\w+\.\w+\.\w+/?[\w\.\?=#]*", link) for link in links]
            new_link = [link for sublist in links for link in sublist]

            try:
                response = get(new_link[0])
                return response

            except:
                return None

    return response

#### The following function uses Gazpacho to extract hyperlinks and associated text

In [6]:
def find_tags(r):
    soup = Soup(r)
    soup = soup.find('a')

    # find hyperlinks and associated text
    try:
        tags = [tag.attrs['href'] for tag in soup if 'href' in tag.attrs]
        texts = [tag.text for tag in soup if 'href' in tag.attrs]

    except:
        tags_list = []
        texts_list = []
        for tag in soup:
            try:
                links = [tag for tag in tag.attrs.values() if tag.startswith('http')]
                if len(links) > 0:
                    texts = [tag for tag in tag.attrs.values() if not tag.startswith('http')]
            except:
                pass
            tags_list.append(links)
            texts_list.append(texts)

        # create a Pandas Dataframe. Drop rows with no tags and drop duplicates
        data = {'tag':  tags_list, 'text': texts_list}
        link_data = pd.DataFrame(data, columns=['tag', 'text'])
        link_data['tag'] = link_data['tag'].str[0]  # remove square brackets
        link_data['text'] = link_data['text'].str[2]  # the associated tag text is stored in position [2]
        link_data = link_data[link_data['tag'].notna()]
        link_data = link_data.drop_duplicates()
        link_data = link_data.drop_duplicates(subset=['tag'], keep='last')
        link_data['text'] = link_data['text'].astype(str)  # convert 'nan' from float to string
        tags = link_data['tag'].tolist()
        texts = link_data['text'].tolist()

    return tags, texts

#### This function attempts to find the web page for ASX announcments on each company's website

In [7]:
def clean_tags(tags, web_address):
    # get hyperlinks that include 'ASX' that exclude exchange links. Get links for 'investor
    tags_asx = [tag for tag in tags if 'ASX'.lower() in tag.lower() and 'asx.com.au'.lower() not in tag]
    tags_investor = [tag for tag in tags if 'investor'.lower() in tag and 'analyst'.lower() not in tag]

    # get hyperlinks that start with 'https' and 'http'
    tags_asx_https = [tag for tag in tags_asx if tag.startswith('https')]
    tags_asx_http = [tag for tag in tags_asx if tag.startswith('http') and tag not in tags_asx_https]
    tags_investor_https = [tag for tag in tags_investor if tag.startswith('https')]
    tags_investor_http = [tag for tag in tags_investor if tag.startswith('http') and tag not in tags_investor_https]

    # prefer 'https' links over the same 'http' link
    if len(tags_asx_https) > 0:
        tags_asx = tags_asx_https
    elif len(tags_asx_http) > 0:
        tags_asx = tags_asx_http

    if len(tags_investor_https) > 0:
        tags_investor = tags_investor_https
    elif len(tags_investor_http) > 0:
        tags_investor = tags_investor_http

    # links that include 'ASX' are preferred over 'investor'
    if len(tags_asx) > 0:
        tags = tags_asx
    elif len(tags_investor) > 0:
        tags = tags_investor

    # remove some links
    tags = [tag for tag in tags if 'broker-research' not in tag and 'page' not in tag and 'research-reports' not in tag
            and 'translation' not in tag and 'twitter' not in tag and 'presentation' not in tag
            and 'youtube' not in tag and 'annual-report' not in tag and 'asx.com.au' not in tag
            and 'linkedin.com' not in tag and 'share-price' not in tag and 'facebook.com' not in tag]

    # add anchor to the link if missing and remove duplicates
    tags = [web_address + tag if tag.startswith('/') else tag for tag in tags]
    tags = list(set(tags))
    return tags

#### Iterate through every gold company in the ASX Gold Companies and attempt to find the ASX announcements page. Export the dataset to CSV for further review

In [6]:
# import data into Pandas DataFrame
gold_companies = pd.read_csv("gold_companies.csv")
announcement_links = []
for web_address in gold_companies['Web address']:
    try:
        r = get_web_address(web_address)
        if r is not None:
            all_tags, all_texts = find_tags(r)
            tags = clean_tags(all_tags, web_address)
            announcement_links.append(tags[0])
            print(tags[0])
        else:
            announcement_links.append("NAN")

    except:
        print(web_address, "is taking too long to load")
        announcement_links.append("NAN")

# add announcement links to the gold_companies DataFrame
gold_companies['Announcements'] = pd.Series(announcement_links)
gold_companies.to_csv('gold_companies_links.csv', index=False)

https://icrm.indigotools.com/IR/IAC/?Ticker=NCM&Exchange=ASX
https://www.nsrltd.com/contact/subscribe-to-investor-announcements/
https://evolutionmining.com.au/wp-content/uploads/2020/06/ASX-Announcemnt-4-June-2020.pdf
http://www.chalicemining.com/investors
https://degreymining.com.au/wp-content/uploads/2020/08/20200821-DEG-ASX-HEMI-Brolga-drilling-update-lodgement-final-clean.pdf
https://regisresources.com.au/investor-centre/asx-announcements/
https://www.silverlakeresources.com.au/investors/asx-announcements
http://www.perseusmining.com//asx-announcement/
https://www.rameliusresources.com.au/2021-asx/
https://stbarbara.com.au/wp-content/uploads/2021/03/2021.03.24-asx-appendix-3y-gleeson.pdf#new_tab
https://goldroad.com.au/wp-content/uploads/2021/03/Gold-Road-Sustainability-Report-2020_asx.pdf
http://www.westgold.com.au/site/investor-centre/asx-announcements
https://www.westafricanresources.com/investor-centre/
https://www.bellevuegold.com.au/asx-reports
https://www.rml.com.au/investo

https://dartmining.com.au/asx_announcements/lidar-data-acquisition-over-strategic-projects/
https://www.peakminerals.com.au//view/investors/asx-announcements
https://liononemetals.com/investors/
https://aurumin.com.au/investors/asx-announcements/
https://www.bulletinresources.com/investor-media-centre/asx-annoucements/
https://odinmetals.com.au/investor-resources/asx-announcements/
http://www.greatboulder.com.au/sites/default/files/asx-announcements/61029496.pdf
https://nelsonresources.com.au/asxannouncements/
http://www.kalnorthgoldmines.com/irm/content/asx-announcements.aspx?RID=8
https://www.austgoldcopper.com.au/asx-announcements/
http://www.montroyalres.com is taking too long to load
https://goldenstatemining.com.au/investor-centre/
http://www.alchemyresources.com.au is taking too long to load
investors.php
https://www.miramarresources.com.au/investors/asx-announcements/
https://www.onlydomains.com/hosting/?utm_medium=free_parking&utm_source=mgt.net.au
https://pvwresources.com.au/

#### The correct link for the ASX announcements page for some companies will need to be collected manually, as the script is not perfect. Once cleaned, it can be imported as gold_companies_to_scrape.csv

#### An attempt will be made to navigate through the website of every ASX Gold Company to find all ASX announcements from 2010 - 2021 that include the words 'drill', 'exploration' or 'discovery' in the announcement title. Search for mutiple pages for every calendar year. This will not work with websites that use javascript heavily. This section will be improved with time.

#### This function will identify any hyperlinks with the text of interest

In [8]:
def get_drill_tags(tags, texts, all_drill_tags, landing_page):
    for tag, text in zip(tags, texts):
        text = text.lower()
        if 'drill' in text or 'exploration' in text or 'result' in text or 'discovery' in text or\
                'sulphide' in text or 'intersect' in text and\
                'meeting' not in text and 'financial' not in text and 'half' not in text and 'share' not in text and\
                'agm' not in text and 'egm' not in text and 'entitlement' not in text\
                and 'project' not in text and 'placement' not in text and text not in all_drill_tags:

            tag = make_link(tag, landing_page)
            all_drill_tags.append(tag)
    return all_drill_tags

#### This function will make a proper hyperlink out of any partial link found

In [9]:
def make_link(page, web_page):
    if page.startswith('/site') or page.startswith('/announcement') or page.startswith('/media') or\
            page.startswith('/upload') or page.startswith('/download') or page.startswith('/asx-release') or\
            page.startswith('/investor-report') or page.startswith('/wp-content') or page.startswith('/s') or\
            page.startswith('/investor-centre') or page.startswith('/assets') or page.startswith('/news'):
        protocol_domain = web_page.split('/site', 1)[0]
        page = protocol_domain + page
    if page.startswith('download'):
        protocol_domain = web_page.split('/site', 1)[0]
        page = protocol_domain + '/' + page
    return page

#### This function attempts to identify multiple pages of ASX announcements for that calendar year

In [10]:
def find_more_pages(tags):
    more_pages = [tag for tag in tags if re.search("page[0-9]", tag.lower()) or re.search("/P[0-9]", tag) or
                  re.search("page=[0-9]", tag.lower())]
    if len(more_pages) > 1:
        more_pages = list(set(more_pages))
    return more_pages

#### This function looks for new page links every time a new page is loaded. Not all links may be available on the first page.

In [11]:
def recursive_page_finder(pages_found, more_pages, web_page, all_drill_tags):
    new_pages = []
    for page in more_pages:
        drill_tags, new_tags = navigate_to_more_pages(page, web_page, all_drill_tags)
        hidden_pages = find_more_pages(new_tags)  # further page links may be revealed upon subsequent page loadings
        new_hidden_pages = [page for page in hidden_pages if page not in pages_found and page not in new_pages]
        new_pages.append(new_hidden_pages)
    new_pages = [item for sublist in new_pages for item in sublist]
    if len(new_pages) > 1:
        new_pages = list(set(new_pages))
    return drill_tags, new_pages, len(new_hidden_pages)

#### This function will navigate the website

In [12]:
def navigate_to_more_pages(page, web_page, all_drill_tags):
    # navigate to any other page and get tags that include 'drill' etc.
    page = make_link(page, web_page)
    new_response = get_web_address(page)
    new_tags, new_texts = find_tags(new_response)
    drill_tags = get_drill_tags(new_tags, new_texts, all_drill_tags, landing_page)
    return drill_tags, new_tags

#### This function will get all relevant tags for each calendar year

In [13]:
def search_calendar_year(page, landing_page):
    response = get_web_address(page)
    try:
        tags, texts = find_tags(response)
    except:
        return None
    drill_tags = get_drill_tags(tags, texts, all_drill_tags, landing_page)  # get all tags on first page
    more_pages = find_more_pages(tags)  # find out if there are other pages and remove duplicate links
    if len(more_pages) == 0:
        return drill_tags, tags, texts

    found = more_pages
    drill_tags, new_pages, count_hidden_pages = recursive_page_finder(found, more_pages, web_page, all_drill_tags)

    while count_hidden_pages > 0:
        found = found + new_pages
        drill_tags, new_pages, count_hidden_pages = recursive_page_finder(found, new_pages, web_page, all_drill_tags)

    return drill_tags, tags, texts

#### Only websites that could be scraped successfully were used. Better error handling is required and a method to extract data from mutiple websites that use javascript needs to be developed.

In [24]:
# import dataset into Pandas Dataframe
dataset = pd.read_csv("gold_companies_to_scrape.csv")
company_names = dataset['Full name']
announcements_pages = dataset['Announcements']
landing_pages = dataset['Web address']

In [25]:
all_announcements = []
for landing_page, web_page in zip(landing_pages, announcements_pages):
    all_drill_tags = []
    # get tags that include 'drill' for the current calendar year
    drill_tags, tags, texts = search_calendar_year(web_page, landing_page)

    # get tags that include 'drill' from 2010 - 2020
    for tag, text in zip(tags, texts):
        if text in ['2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010']:
            page = make_link(tag, landing_page)
            try:
                drill_tags, tags, texts = search_calendar_year(page, landing_page)
            except:
                pass

    all_announcements.append(drill_tags)

#### Store the data in a Pandas DataFrame. Drop duplicates and remove unwanted links. Save to CSV for further review.

In [26]:
# create a Pandas DataFrame of announcement links and save to csv
data = {'company name': company_names, 'announcement':  all_announcements}
all_announcements = pd.DataFrame(data, columns=['company name', 'announcement'])
all_announcements = all_announcements.explode('announcement')
all_announcements.dropna(inplace=True)
all_announcements = all_announcements[all_announcements['announcement'].str.contains('/file|pdf|PDF')]
all_announcements = all_announcements[~all_announcements['announcement'].str.
                                      contains('(?i)asx.com.au|annual|agm|egm|half|dividend|presentation|financial|'
                                               'feasibility|full|meeting|spp|renounceable|entitlement|rights|'
                                               'share|placement|metallurgical|scoping|appointment|nativetitle')]
all_announcements = all_announcements.drop_duplicates()
all_announcements.to_csv("gold_companies_announcements.csv",  index=False)
print(all_announcements.head())

                 company name  \
0  RAMELIUS RESOURCES LIMITED   
0  RAMELIUS RESOURCES LIMITED   
0  RAMELIUS RESOURCES LIMITED   
0  RAMELIUS RESOURCES LIMITED   
0  RAMELIUS RESOURCES LIMITED   

                                        announcement  
0  https://www.rameliusresources.com.au/wp-conten...  
0  https://www.rameliusresources.com.au/wp-conten...  
0  https://www.rameliusresources.com.au/wp-conten...  
0  https://www.rameliusresources.com.au/wp-conten...  
0  https://www.rameliusresources.com.au/wp-conten...  


#### Import the CSV after review. Save announcements in PDF to disc (if working offline) . Load each PDF and extract text

#### Install methods from the PDFminer.six modules needed for text extraction

In [35]:
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from nltk.corpus import stopwords
import io

In [30]:
# import dataset into Pandas Dataframe
dataset = pd.read_csv("gold_companies_announcements.csv")
company_names = dataset['company name']
announcements_pages = dataset['announcement']

#### This function will save each PDF

In [None]:
def save_pdfs(announcements, folder):
    for counter, announcement in enumerate(announcements):
        print(counter, announcement)
        filename = Path(os.path.join(folder, str(counter) + '.pdf'))
        response = requests.get(announcement)
        filename.write_bytes(response.content)
    return

#### This function will extract the text from each PDF

In [32]:
def get_text(folder, pdf):
    # response = requests.get(announcement)
    # file = io.BytesIO(response.content)  # needed if reading PDF from URL
    file = os.path.join(folder, pdf)
    with open(file, 'rb') as open_file:
        parser = PDFParser(open_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        output_string = io.StringIO()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
        text = output_string.getvalue()
    return text

#### This function will create a corpus

In [33]:
def form_corpus(corpus, text, announcements_dict_list):
    text = text.split()  # split string into individual words and store in list
    text = [word.replace(' ', '') for word in text]  # remove spaces
    text = [word for word in text if word]  # remove empty strings in list
    text = [re.sub('[^a-zA-Z]', ' ', word) for word in text]  # remove punctuation
    text = [word for word in text if word]  # remove empty strings in list
    text = [word.lower() for word in text]  # convert to lower case
    text = [re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", word) for word in text]  # remove tags
    text = [word for word in text if word.islower()]  # remove strings with only spaces
    text = [word.strip() for word in text]  # remove leading and trailing spaces
    text = ' '.join(str(e) for e in text)
    corpus.append(text)
    announcements_dict = {'text': text}
    announcements_dict_list.append(announcements_dict)
    return corpus, announcements_dict_list

#### This function will save outputs to JSON

In [34]:
def save_to_json(corpus, announcements_dict_list):
    for filename, item in zip(['corpus', 'all_announcements'], [corpus, announcements_dict_list]):
        with open(filename + '.json', 'w') as fp:
            json.dump(item, fp)

#### Loop through all PDFs and extract the text

In [None]:
# get the text from all PDFs and form corpus
corpus = []
announcements_dict_list = []
stop_words = set(stopwords.words("english"))  # create a set of stopwords
files = os.listdir(folder)
for file in files[0:1]:
    print(file)
    text = get_text(folder, file)

corpus, announcements_dict_list = form_corpus(corpus, text, announcements_dict_list)  # all the text from every announcement

#### Save the corpus and each announcement's text as a list of dictionaries

In [None]:
save_to_json(corpus, announcements_dict_list)