## Data Scraping Homework Answers

#### Goal: Identify and download annual reports from http://www.annualreports.com

In [36]:
import os
import time
import requests
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML

# Folder where the downloaded reports are written. It is built from the
# current user's home directory, so the notebook runs without hand-editing a
# username into the path (on Windows: <home>/ml_guild/project/raw_data/oracle).
output_dir_path = os.path.join(
    os.path.expanduser('~'), 'ml_guild', 'project', 'raw_data', 'oracle')
# Root address of the site being scraped; the cells below prepend it to the
# site-relative links found on the page.
base_url = r'http://www.annualreports.com'
# Company identifier as it appears in the site's '/Company/<name>' urls; it is
# also used as the prefix of every local filename.
company = r'oracle-corporation'

### 1. Collect all of the urls for the numerous annual reports


In [20]:
# build the address of the page that lists the selected company's annual reports
# the template is filled in with str.format(): base_url first, then the company

company_page_template = r'{}/Company/{}'
company_url = company_page_template.format(base_url, company)
company_url

'http://www.annualreports.com/Company/oracle-corporation'

In [44]:
# preview the source website inside the notebook by embedding it in an iframe

iframe_template = '<iframe src={} width=950 height=300></iframe>'
iframe = iframe_template.format(company_url)
HTML(iframe)

In [21]:
# request the html from the company_url
# requests waits indefinitely by default, so cap the wait (in seconds)
r = requests.get(company_url, timeout=30)
# stop here on a 4xx/5xx answer instead of parsing an error page further down
r.raise_for_status()
r

<Response [200]>

In [22]:
# parse the downloaded page: hand the response body to BeautifulSoup (lxml parser)
page_html = r.text
b = BeautifulSoup(page_html, 'lxml')

In [23]:
# the pdf links look like <a href="example_url.pdf"> and sit inside <ul class="links"> lists
# collect every one of those lists here
# the hrefs themselves are pulled out in the next exercise

links_filter = {'class':'links'}
annual_reports = b.find_all('ul', attrs=links_filter)
annual_reports

[<ul class="links ">
 <li><a href="/Click/24624" onclick="window.open('/Click/24624', '_blank'); return false;" target="_blank"><span>PDF</span> <span class="icon-download"></span></a></li>
 <li><a href="/Click/123" onclick="window.open('/Click/123', '_blank'); return false;" target="_blank"><span>Form 10K</span> <span class="report-type-label">(HTML)</span> <span class="icon-download"></span></a></li></ul>,
 <ul class="links">
 <li>2015 Annual Report</li>
 <li><a href="/HostedData/AnnualReportArchive/o/NYSE_ORCL_2015.pdf" target="_blank" title="View 2015 Annual Report PDF">PDF <span class="icon-download"></span></a></li>
 </ul>,
 <ul class="links">
 <li>2014 Annual Report</li>
 <li><a href="/HostedData/AnnualReportArchive/o/NYSE_ORCL_2014.pdf" target="_blank" title="View 2014 Annual Report PDF">PDF <span class="icon-download"></span></a></li>
 </ul>,
 <ul class="links">
 <li>2013 Annual Report</li>
 <li><a href="/HostedData/AnnualReportArchive/o/NYSE_ORCL_2013.pdf" target="_blank" tit

In [32]:
# create an empty list to store the urls
urls = []

# iterate through the annual_reports
for report in annual_reports:
    # find the first link inside this list
    link = report.find('a')
    # a list without a link (or a link without an href) has nothing to
    # download, so skip it instead of failing on the missing value
    if link is None or not link.get('href'):
        continue
    # the report url ending, e.g. '/HostedData/.../NYSE_ORCL_2015.pdf'
    report_name = link['href']
    # combine the base_url with the report name to create the full url
    # consider using the string method: join
    report_url = ''.join([base_url,report_name])
    # append each report_url to the urls list
    urls.append(report_url)

# view the first 5 results
urls[0:5]

['http://www.annualreports.com/Click/24624',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_2015.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_2014.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_2013.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_2012.pdf']

### 2. Create a mapping of file paths: each report url to the local filepath (folder + filename) where the file will be stored
Store all files in a folder on your desktop called ml_guild/raw_data/[company_name]
- NOTE: replace [company_name] with the selected company's name


In [34]:
# create an empty dict to store file paths
# name is output_paths
# keys are the report urls, values are the local filepaths to write them to
output_paths = {}

# iterate through the urls
# consider using enumerate to get the index of the url in the list
for ind, url in enumerate(urls):

    if ind == 0:
        # The first annual report on a page is stored in different html
        # and does not have the year in the report name
        # e.g. ('Click/[#]') instead of ('NYSE_ORCL_2015.pdf')
        # so add one to the year of the next annual report to get the
        # correct year (string -> int for the addition, then back to string)
        previous_year = ''
        if len(urls) > 1:
            previous_year = urls[1].split('_')[-1].split('.')[0]

        if previous_year.isdigit():
            year = str(int(previous_year) + 1)
        else:
            # there is no second report (or it carries no numeric year) to
            # derive the year from; use a placeholder instead of raising
            year = 'latest'
    else:
        # parse the year from the annual report report_name
        # split and slice the url to extract the year
        # e.g. '.../NYSE_ORCL_2015.pdf' -> '2015.pdf' -> '2015'
        year = url.split('_')[-1].split('.')[0]

    # create a file name
    # use the naming scheme companyname_annual_report_year
    # use .format() to replace the companyname and year for each report
    filename = '{}_annual_report_{}.pdf'.format(company, year)

    # create a local filepath to identify how to name a file
    # and where to store it locally
    filepath = os.path.join(output_dir_path,filename)

    # add each url to the output_paths dict
    output_paths[url] = filepath

output_paths

{'http://www.annualreports.com/Click/24624': 'C:\\Users\\alsherman\\PycharmProjects\\annual_report\\raw_data\\oracle\\oracle-corporation_annual_report_2016.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_1994.pdf': 'C:\\Users\\alsherman\\PycharmProjects\\annual_report\\raw_data\\oracle\\oracle-corporation_annual_report_1994.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_1995.pdf': 'C:\\Users\\alsherman\\PycharmProjects\\annual_report\\raw_data\\oracle\\oracle-corporation_annual_report_1995.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_1996.pdf': 'C:\\Users\\alsherman\\PycharmProjects\\annual_report\\raw_data\\oracle\\oracle-corporation_annual_report_1996.pdf',
 'http://www.annualreports.com/HostedData/AnnualReportArchive/o/NYSE_ORCL_1997.pdf': 'C:\\Users\\alsherman\\PycharmProjects\\annual_report\\raw_data\\oracle\\oracle-corporation_annual_report_1997.pdf',
 'http://www.annualreports.com

### 3. Download all of the annual reports

Before writing any code, check http://www.annualreports.com/robots.txt to ascertain any data collection restrictions.

In [39]:
# address of the site's robots.txt
# (no trailing whitespace, so the value is also safe to reuse in a request
# or to compare against other urls)
requests_url = 'http://www.annualreports.com/robots.txt'
# show the file inline so the crawl rules can be read before downloading
iframe = '<iframe src={} width=500 height=300></iframe>'.format(requests_url)
HTML(iframe)

In [None]:
# iterate through the urls
for url in urls:

    # required delay, stated in the robots.txt
    time.sleep(10)  # pause for ten seconds

    # download the pdf with requests
    # the timeout (seconds) keeps one stalled connection from hanging the loop;
    # a separate name is used so the page response stored in `r` is not replaced
    response = requests.get(url, timeout=60)

    # an error status means the body is an error page, not a report:
    # report it and move on instead of saving it under a .pdf name
    if not response.ok:
        print('skipping {} (HTTP {})'.format(url, response.status_code))
        continue

    # get the path of where to download the pdf locally
    # use the url to get the filepath from the output_paths dict
    filepath = output_paths[url]

    # make sure the destination folder exists before opening the file,
    # otherwise open() fails on a machine where it was never created
    target_dir = os.path.dirname(filepath)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # write the pdf to the filepath
    with open(filepath, 'wb') as f:
        f.write(response.content)