# Automate the Grantee Website Monitoring

In [1]:
#Load Packages
import requests
import csv
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import progressbar
from time import sleep

## Step 1: Make a Python list of URLs to check

In [2]:
#Create a List of websites to monitor by typing them in to a CSV file stored in the same directory as this program.
#This list is named csv_urls.

filename = open('test.csv', 'r')
file = csv.DictReader(filename)
csv_urls = {}

for row in file:
    url = row['url']
    if "http" not in url:
        csv_urls[row['name']] = "https://" + url
    elif "https" not in url:
        csv_urls[row['name']] = "https://" + url[7:]
    else:
        csv_urls[row['name']] = url

# len(csv_urls)
# print(csv_urls)

4715

## Step 2: Run through the URLs to see if they are active

In [3]:
#function get_url_status uses the requests package (installed above) to find status of site
#HTTP response status codes of 400 and above may be problems for the Program Office.
#Informational responses (100–199) Successful responses (200–299) Redirects (300–399)
#Client errors (400–499) Server errors (500–599)

#ensure not on VPN when running this

def get_url_status(urls):  # checks status for each url in dict of urls    
    url_statuses = {} #maps {name of school: (url, status code, request.get)}
    #create a progress bar to monitor progress through urls
    bar = progressbar.ProgressBar(maxval=len(urls), widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    done = 0
    for school_name in urls.keys():
        for i in range(3):
            while True:
                try:
                    r = requests.get(urls[school_name], timeout=15)
                    url_statuses[school_name] = (urls[school_name], str(r.status_code), r)
                except Exception as e:
                    url_statuses[school_name] = (urls[school_name], e,)
                break
        done += 1
        bar.update(done)
    bar.finish()
    return url_statuses

# url_statuses = get_url_status(csv_urls) #uncomment to generate trial csv
# print(get_url_status(csv_urls))



In [4]:
#takes in a list of urls and uses the get_url_status function to return a dict of active urls
#dict of the form {name of school: (url, status code, request.get)}

def get_active_urls(urls):
    url_statuses = get_url_status(urls) #comment to generate trial csv
    active_urls = {}
    inactive_urls = {}
    for school_name in urls.keys():
        if url_statuses[school_name][1] == "200":
            active_urls[school_name] = url_statuses[school_name]
        else:
            inactive_urls[school_name] = url_statuses[school_name][1]
            # print(school_name, urls[school_name])
    if len(inactive_urls) > 0:
#         print("INACTIVE URLS: ", inactive_urls)
        print("NUM INACTIVE: ", len(inactive_urls))
    return active_urls

# get_active_urls(csv_urls)
# active_urls = get_active_urls(csv_urls) #uncomment to generate trial csv

NUM INACTIVE:  1134


In [47]:
'''
403 errors: https://stackoverflow.com/questions/38489386/python-requests-403-forbidden (couldn't figure this out)
'''

"\n403 errors: https://stackoverflow.com/questions/38489386/python-requests-403-forbidden (couldn't figure this out)\n"

## Step 3: Download quarterly report pdfs

In [6]:
def findPDF(urls, active_urls):  # finds PDF in site and saves it
    print('getting active urls')
    active_urls = get_active_urls(urls) #comment to generate trial csv
    pdfs = {}
    print('dowloading pdfs')
    no_pdfs = []
    num_pdfs = {}
    
    bar = progressbar.ProgressBar(maxval=len(urls), widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    done = 0
    
    for school_name in active_urls.keys():
        # print(school_name)
        # print(active_urls[school_name][0])
        number = 0
        r = active_urls[school_name][2]
        # Parse text obtained
        soup = BeautifulSoup(r.text, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            # pick up only those URLs containing 'pdf'
            # within 'href' attribute
            if "quarterly" in str(link) or "Quarterly" in str(link):
                number += 1
                # init PDF url
                pdf_url = ''
                # append base URL if no 'https' available in URL
                if 'https' not in link.get('href'):
                    pdf_url = urljoin(active_urls[school_name][0], link.get('href'))

                # otherwise use bare URL
                else:
                    pdf_url = link.get('href')
                
                pdf_url = pdf_url.replace(" ", "%20")

                # print('HTTP GET: ', pdf_url)

                # extract PDF file name
                filename = "testpdfs/" + school_name.replace(' ', '_') + "__" + pdf_url.split('/')[-1].replace('%20','_')
                # print(filename)
                
                # write PDF to local file
                pdf_errors = {}
                try:
                    pdf = urlopen(pdf_url)
                    file = open(filename, 'wb')
                    file.write(pdf.read())
                    file.close()
                except:
                    if link in pdf_errors:
                        pdf_errors[link].append(filename)
                    else:
                        pdf_errors[link] = [filename]
        if number == 0:
            no_pdfs.append(school_name)
        num_pdfs[school_name] = number
        done += 1
        bar.update(done)
    bar.finish()
#     return num_pdfs #uncomment to generate trial csv
    print("PDF ERRORS: ", pdf_errors)
    print("NO PDFS FOUND: ", no_pdfs)
    print("NUMBER PDFS:", num_pdfs)
    
# number = findPDF(csv_urls, active_urls) #uncomment to generate trial csv

[                                                                        ]   0%

getting active urls
dowloading pdfs


KeyboardInterrupt: 

In [8]:
#need to distinguish between institutional and student reports somehow?

In [8]:
# #generate csv with results from trial
# trial = []
# # trial = [{'school_name': i, 'status': url_statuses[i][1]} for i in url_statuses.keys()]
# for school_name in csv_urls.keys():
#     school_dict = {'school_name': school_name}
#     if school_name in active_urls.keys():
#         school_dict['status'] = url_statuses[school_name][1]
#         school_dict['active_url'] = 1
# #         school_dict['num_pdfs'] = number[school_name]
#     else:
#         school_dict['status'] = url_statuses[school_name][1]
#         school_dict['active_url'] = 0
#         school_dict['num_pdfs'] = 0    
#     trial.append(school_dict)

# fieldnames = ['school_name', 'status', 'active_url', 'num_pdfs']

# with open("trial.csv", 'w', encoding='UTF8', newline='') as f:
#     writer = csv.DictWriter(f, fieldnames=fieldnames)
# #     writer = csv.DictWriter(f, fieldnames=['school_name', 'status'])
#     writer.writeheader()
#     writer.writerows(trial)

In [10]:
findPDF(csv_urls)

[                                                                        ]   0%

getting active urls


[                                                                        ]   0%

INACTIVE URLS:  {'ILISAGVIK COLLEGE': '403', 'ALABAMA STATE UNIVERSITY': '404', 'ATHENS STATE UNIVERSITY': '403', 'AUBURN UNIVERSITY AT MONTGOMERY': '403', 'BIRMINGHAM-SOUTHERN COLLEGE': '406', 'BROWN BEAUTY BARBER SCHOOL': ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.covid-relief-data.ed', port=443): Max retries exceeded with url: /gov (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001B804ED7B80>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))")), 'DONALD S MATHEWS DBA AL ST COLLEGE OF BARBER STYLING': SSLError(MaxRetryError('HTTPSConnectionPool(host=\'alabamastatebarbercollege.com\', port=443): Max retries exceeded with url: /heerf.html (Caused by SSLError(SSLCertVerificationError("hostname \'alabamastatebarbercollege.com\' doesn\'t match either of \'*.accountservergroup.com\', \'accountservergroup.com\'")))')), 'GEORGE CORLEY WALLACE STATE COMMUNITY COLLEGE': '404', 'H. COUNCILL TRENHOLM STATE COMMUNITY CO



PDF ERRORS:  {<a href="https://www.judson.edu/wp-content/uploads/2021/04/HEERF-3-31-21-quarterly-report-4-7-2021.pdf">Quarterly Report – 3.31.2021</a>: ['testpdfs/JUDSON_COLLEGE__HEERF-3-31-21-quarterly-report-4-7-2021.pdf']}
NO PDFS FOUND:  ['ALASKA CAREER COLLEGE', 'ALASKA PACIFIC UNIVERSITY', 'ALABAMA A&M UNIVERSITY', 'ALABAMA SCHOOL OF NAIL TECHNOLOGY AND COSMETOLOGY INC', 'AUBURN UNIVERSITY', 'BEVILL STATE COMMUNITY COLLEGE', 'BISHOP STATE COMMUNITY COLLEGE', 'CARDIAC AND VASCULAR INSTITUTE OF ULTRASOUND, INC.', 'CHATTAHOOCHEE VALLEY COMMUNITY COLLEGE', 'GADSDEN STATE COMMUNITY COLLEGE', 'HERITAGE CHRISTIAN UNIVERSITY', 'J. F. DRAKE STATE COMMUNITY AND TECHNICAL COLLEGE', 'J.F. INGRAM STATE TECHNICAL COLLEGE', 'LAWSON STATE COMMUNITY COLLEGE', 'LURLEEN B. WALLACE COMMUNITY COLLEGE', 'MARION MILITARY INSTITUTE', 'MIDFIELD INSTITUTE OF COSMETOLOGY INC', 'MILES COLLEGE', 'NORTHEAST ALABAMA COMMUNITY COLLEGE', 'NORTHWEST-SHOALS COMMUNITY COLLEGE']
NUMBER PDFS: {'ALASKA BIBLE COLLEGE':


