## Automation Homework Answers

#### Goal: Identify and download annual reports from http://www.annualreports.com

##### Author: Alex Sherman | alsherman@deloitte.com

In [1]:
from configparser import ConfigParser, ExtendedInterpolation
import os
import time
import requests
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
from urllib import robotparser

### 0. Use configparser to get the following variables

##### IMPORTANT: Make sure to update the config.ini. In the automation section, you need to add your username to the OUTPUT_DIR_PATH

In [3]:
# location of the shared config file, relative to this notebook
CONFIG_PATH = '../../config.ini'

config = ConfigParser(interpolation=ExtendedInterpolation())

# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed, so fail here with a clear message
# instead of with a KeyError on the lookups below
if not config.read(CONFIG_PATH):
    raise FileNotFoundError('could not read config file: {}'.format(CONFIG_PATH))

# settings for this exercise all live in the AUTOMATION section
OUTPUT_DIR_PATH = config['AUTOMATION']['OUTPUT_DIR_PATH']
BASE_URL = config['AUTOMATION']['BASE_URL']
COMPANY = config['AUTOMATION']['COMPANY']

OUTPUT_DIR_PATH

'C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\southwest-airlines-co\\raw_data'

In [12]:
# build the url of the webpage that lists the selected company's annual reports
# python string formatting (.format()) fills the base_url and the company
# into the url template

url_template = r'{}/Company/{}'
company_url = url_template.format(BASE_URL, COMPANY)
company_url

'http://www.annualreports.com/Company/southwest-airlines-co'

### 1. Check the robots.txt to confirm access

Before writing any code, check http://www.annualreports.com/robots.txt to ascertain any data collection restrictions

User-agent: * applies to us.

In [13]:
# embed the site's robots.txt in the notebook so the rules can be read inline
# (the url carries no stray whitespace, and the attribute values are quoted
# so the markup stays valid)
requests_url = 'http://www.annualreports.com/robots.txt'
iframe = '<iframe src="{}" width="500" height="250"></iframe>'.format(requests_url)
HTML(iframe)

In [14]:
# point the standard-library robots.txt parser at the site's rules,
# download them, then ask whether any crawler ("*") may fetch the company page
rp = robotparser.RobotFileParser(url="http://www.annualreports.com/robots.txt")
rp.read()
rp.can_fetch("*", company_url)

True

### 2. Collect all of the urls for the numerous annual reports


In [15]:
# view the source data website
# attribute values are quoted so the markup stays valid even if the
# configured url contains characters such as '=' or spaces
iframe = '<iframe src="{}" width="950" height="300"></iframe>'.format(company_url)
HTML(iframe)

In [16]:
# request the html from the company_url
# requests never times out by default, so set an explicit limit (in seconds)
# to keep the notebook from hanging on an unresponsive server
r = requests.get(company_url, timeout=30)

# stop here on a 4xx/5xx answer instead of parsing an error page below
r.raise_for_status()
r

<Response [200]>

In [17]:
# parse the response text into a BeautifulSoup instance using the lxml parser
b = BeautifulSoup(markup=r.text, features='lxml')

In [18]:
# the pdf links (<a href="example_url.pdf">) sit inside <ul class="links"> tags
# collect those <ul> tags as html for now;
# the links themselves are extracted in the next exercise

annual_reports = b.find_all('ul', class_='links')
annual_reports

[<ul class="links ">
 <li><a href="/Click/22333" onclick="window.open('/Click/22333', '_blank'); return false;" target="_blank"><span>PDF</span> <span class="icon-download"></span></a></li>
 <li><a href="/Click/22332" onclick="window.open('/Click/22332', '_blank'); return false;" target="_blank"><span>Interactive</span> <span class="report-type-label">(HTML)</span> <span class="icon-download"></span></a></li>
 <li><a href="/Click/11621" onclick="window.open('/Click/11621', '_blank'); return false;" target="_blank"><span>Form 10K</span> <span class="report-type-label">(HTML)</span> <span class="icon-download"></span></a></li></ul>,
 <ul class="links">
 <li>2015 Annual Report</li>
 <li><a href="/HostedData/AnnualReportArchive/s/NYSE_LUV_2015.pdf" target="_blank" title="View 2015 Annual Report PDF">PDF <span class="icon-download"></span></a></li>
 </ul>,
 <ul class="links">
 <li>2014 Annual Report</li>
 <li><a href="/HostedData/AnnualReportArchive/s/NYSE_LUV_2014.pdf" target="_blank" titl

In [10]:
# create an empty list to store the urls
urls = []

# iterate through the annual_reports (one <ul class="links"> tag per report)
for report in annual_reports:
    # find the first link in the list that actually carries an href;
    # find() returns None when there is no such link, so skip that list
    # rather than failing on None['href']
    link = report.find('a', href=True)
    if link is None:
        continue

    # the href holds the report url ending, e.g. '/Click/22333'
    report_name = link['href']
    # combine the base_url with the report name to create the full url
    report_url = ''.join([BASE_URL, report_name])
    # append each report_url to the urls list
    urls.append(report_url)

# view the first 5 results
urls[0:5]

['http://www.annualreports.com/Click/22333',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/s/NYSE_LUV_2015.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/s/NYSE_LUV_2014.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/s/NYSE_LUV_2013.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/s/NYSE_LUV_2012.PDF']

### 3. Create a mapping from each report url to its local filepath (to store each file locally)
Store all files in a folder on your desktop called ml_guild/raw_data/[company_name]

NOTE: replace [company_name] with the selected company's name


In [12]:
# create an empty dict to store file paths (url -> local filepath)
# name is output_paths
output_paths = {}

# iterate through the urls
# enumerate gives the index of the url in the list
for ind, url in enumerate(urls):

    # parse the year from the annual report report_name
    # split and slice the url to extract the year
    # e.g. '.../NYSE_LUV_2015.pdf' -> '2015'
    year = url.split('_')[-1].split('.')[0]

    # The first annual report on a page is stored in different html
    # and does not have the year in the report name
    # e.g. ('Click/[#]') instead of ('NYSE_ORCL_2015.pdf')
    # Detect this from the parsed value itself rather than from the index,
    # so a first url that does contain a year keeps it, and a list with a
    # single url does not raise an IndexError
    if not year.isdecimal():
        # the reports are listed newest first, so add one to the year of
        # the following url (the next most recent annual report)
        following = urls[ind + 1:ind + 2]
        following_year = following[0].split('_')[-1].split('.')[0] if following else ''
        if following_year.isdecimal():
            year = str(int(following_year) + 1)
        else:
            # no neighbouring year to derive from; keep the filename valid and unique
            year = 'unknown_{}'.format(ind)

    # create a file name
    # use the naming scheme {companyname}_annual_report_{report_year}.pdf
    # use .format() to replace the companyname and report_year
    filename = '{}_annual_report_{}.pdf'.format(COMPANY, year)

    # create a local filepath to identify how to name a file
    # and where to store it locally
    filepath = os.path.join(OUTPUT_DIR_PATH, filename)

    # add each url to the output_paths dict
    output_paths[url] = filepath

output_paths

{'http://www.annualreports.com/Click/22333': 'C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\southwest-airlines-co\\raw_data\\southwest-airlines-co_annual_report_2016.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/s/NYSE_LUV_1992.pdf': 'C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\southwest-airlines-co\\raw_data\\southwest-airlines-co_annual_report_1992.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/s/NYSE_LUV_1994.pdf': 'C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\southwest-airlines-co\\raw_data\\southwest-airlines-co_annual_report_1994.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/s/NYSE_LUV_1995.pdf': 'C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\southwest-airlines-co\\raw_data\\southwest-airlines-co_annual_report_1995.pdf',
 'http://www.annualreports.com/Host

### 4. Download all of the annual reports

In [114]:
# pause between requests, in seconds
# required delay, stated in the robots.txt
CRAWL_DELAY_SECONDS = 5

# iterate through the urls
for url in urls:

    # get the path of where to download the pdf locally
    # use the url to get the filepath from the output_paths dict
    filepath = output_paths[url]

    # make sure the target folder exists before writing into it
    output_dir = os.path.dirname(filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # download the pdf with requests
    # the timeout (in seconds) keeps one stalled download from hanging the loop
    r = requests.get(url, timeout=60)

    if r.ok:
        # write the pdf to the filepath
        # 'wb' stands for write binary
        with open(filepath, 'wb') as f:
            f.write(r.content)
    else:
        # do not save an error page under a .pdf name; report it and move on
        print('skipping {} (HTTP status {})'.format(url, r.status_code))

    # wait before the next request, whether or not this one succeeded
    time.sleep(CRAWL_DELAY_SECONDS)
