In [4]:
"""
Utility code
"""
# pylint: disable-msg=invalid-name, broad-except, unused-variable
# pylint: disable-msg=len-as-condition, no-else-return, undefined-variable
# pylint: disable-msg=too-many-return-statements, superfluous-parens
# pylint: disable=R1714
import urllib.parse
import os
import requests
import bs4

######### DO NOT CHANGE THIS CODE  #########

def get_request(url):
    '''
    Open a connection to the specified URL and if successful
    read the data.

    Inputs:
        url: must be an absolute URL

    Outputs:
        request object or None

    Examples:
        get_request("http://www.cs.uchicago.edu")
    '''

    if is_absolute_url(url):
        try:
            r = requests.get(url)
            if r.status_code == 404 or r.status_code == 403:
                r = None
        except Exception:
            # fail on any kind of error
            r = None
    else:
        r = None

    return r


def read_request(request):
    '''
    Return data from request object.  Returns result or "" if the read
    fails..
    '''

    try:
        return request.text.encode('iso-8859-1')
    except Exception:
        print("read failed: " + request.url)
        return ""


def get_request_url(request):
    '''
    Extract true URL from the request
    '''
    return request.url


def is_absolute_url(url):
    '''
    Is url an absolute URL?
    '''
    if url == "":
        return False
    return urllib.parse.urlparse(url).netloc != ""


def remove_fragment(url):
    '''remove the fragment from a url'''

    (url, frag) = urllib.parse.urldefrag(url)
    return url


def convert_if_relative_url(current_url, new_url):
    '''
    Attempt to determine whether new_url is a relative URL and if so,
    use current_url to determine the path and create a new absolute
    URL.  Will add the protocol, if that is all that is missing.

    Inputs:
        current_url: absolute URL
        new_url:

    Outputs:
        new absolute URL or None, if cannot determine that
        new_url is a relative URL.

    Examples:
        convert_if_relative_url("http://cs.uchicago.edu", "pa/pa1.html") yields
            'http://cs.uchicago.edu/pa/pa.html'

        convert_if_relative_url("http://cs.uchicago.edu", "foo.edu/pa.html")
            yields 'http://foo.edu/pa.html'
    '''
    if new_url == "" or not is_absolute_url(current_url):
        return None

    if is_absolute_url(new_url):
        return new_url

    parsed_url = urllib.parse.urlparse(new_url)
    path_parts = parsed_url.path.split("/")

    if len(path_parts) == 0:
        return None

    ext = path_parts[0][-4:]
    if ext in [".edu", ".org", ".com", ".net"]:
        return "http://" + new_url
    elif new_url[:3] == "www":
        return "http://" + new_url
    else:
        return urllib.parse.urljoin(current_url, new_url)


ARCHIVES = ("https://www.classes.cs.uchicago.edu/archive/2015/winter"
            "/12200-1/new.collegecatalog.uchicago.edu/thecollege/archives")
LEN_ARCHIVES = len(ARCHIVES)


ARCHIVES_HTTP = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
            "/12200-1/new.collegecatalog.uchicago.edu/thecollege/archives")
LEN_ARCHIVES_HTTP = len(ARCHIVES_HTTP)


def is_url_ok_to_follow(url, limiting_domain):
    '''
    Inputs:
        url: absolute URL
        limiting domain: domain name

    Outputs:
        Returns True if the protocol for the URL is HTTP, the domain
        is in the limiting domain, and the path is either a directory
        or a file that has no extension or ends in .html. URLs
        that include an "@" are not OK to follow.

    Examples:
        is_url_ok_to_follow("http://cs.uchicago.edu/pa/pa1", "cs.uchicago.edu")
            yields True

        is_url_ok_to_follow("http://cs.cornell.edu/pa/pa1", "cs.uchicago.edu")
            yields False
    '''

    if "mailto:" in url:
        return False

    if "@" in url:
        return False

    if url[:LEN_ARCHIVES] == ARCHIVES or url[:LEN_ARCHIVES_HTTP] == ARCHIVES_HTTP:
        return False

    parsed_url = urllib.parse.urlparse(url)
    if parsed_url.scheme != "http" and parsed_url.scheme != "https":
        return False

    if parsed_url.netloc == "":
        return False

    if parsed_url.fragment != "":
        return False

    if parsed_url.query != "":
        return False

    loc = parsed_url.netloc
    ld = len(limiting_domain)
    trunc_loc = loc[-(ld+1):]
    if not (limiting_domain == loc or (trunc_loc == "." + limiting_domain)):
        return False

    # does it have the right extension
    (filename, ext) = os.path.splitext(parsed_url.path)
    return (ext == "" or ext == ".html")


def is_subsequence(tag):
    '''
    Does the tag represent a subsequence?
    '''
    return isinstance(tag, bs4.element.Tag) and 'class' in tag.attrs \
        and tag['class'] == ['courseblock', 'subsequence']


def is_whitespace(tag):
    '''
    Does the tag represent whitespace?
    '''
    return isinstance(tag, bs4.element.NavigableString) and (tag.strip() == "")


def find_sequence(tag):
    '''
    If tag is the header for a sequence, then
    find the tags for the courses in the sequence.
    '''
    rv = []
    sib_tag = tag.next_sibling
    while is_subsequence(sib_tag) or is_whitespace(tag):
        if not is_whitespace(tag):
            rv.append(sib_tag)
        sib_tag = sib_tag.next_sibling
    return rv


In [5]:
import queue
import json
import sys
import csv
import re
import bs4
# import util

INDEX_IGNORE = set(['a', 'also', 'an', 'and', 'are', 'as', 'at', 'be',
                    'but', 'by', 'course', 'for', 'from', 'how', 'i',
                    'ii', 'iii', 'in', 'include', 'is', 'not', 'of',
                    'on', 'or', 's', 'sequence', 'so', 'social', 'students',
                    'such', 'that', 'the', 'their', 'this', 'through', 'to',
                    'topics', 'units', 'we', 'were', 'which', 'will', 'with',
                    'yet'])


### YOUR FUNCTIONS HERE

def go(num_pages_to_crawl, course_map_filename, index_filename):
    '''
    Crawl the college catalog and generates a CSV file with an index.

    Inputs:
        num_pages_to_crawl: the number of pages to process during the crawl
        course_map_filename: the name of a JSON file that contains the mapping
          course codes to course identifiers
        index_filename: the name for the CSV of the index.

    Outputs:
        CSV file of the index index.
    '''

    starting_url = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                    "/12200-1/new.collegecatalog.uchicago.edu/index.html")
    limiting_domain = "classes.cs.uchicago.edu"

    # YOUR CODE HERE

In [6]:
starting_url = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                    "/12200-1/new.collegecatalog.uchicago.edu/index.html")
limiting_domain = "classes.cs.uchicago.edu"

In [13]:
import bs4 as bs
import pprint
def pull_html(url, limiting_domain):
    if is_url_ok_to_follow(url, limiting_domain):
        if is_absolute_url(url):
            request = get_request(url)
            data = read_request(request)
        return data

html = pull_html(starting_url, limiting_domain)
print(html)

pprint.pprint(html)

b'<!doctype html>\n<html xml:lang="en" lang="en" dir="ltr">\n\n<head>\n<title>The College Catalog 2014-2015 &lt; University of Chicago Catalog</title>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<meta name="description" content="2014-2015 College Catalog" />\n<link rel="search" type="application/opensearchdescription+xml"\n\t\t\thref="thecollege/search/opensearch.xml" title="University of Chicago Catalog" />\n<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0" />\n<link href="thecollege/favicon.ico" rel="shortcut icon" />\n<link rel="stylesheet" type="text/css" href="thecollege/css/reset.css" />\n<link rel="stylesheet" type="text/css" href="thecollege/css/screen.css" media="screen" />\n<link rel="stylesheet" type="text/css" href="thecollege/css/print.css" media="print" />\n<script type="text/javascript" src="js/jquery.js"></script>\n<script type="text/javascript" src="js/lfjs.js"></script>\n<script type="text/javascript" 

In [None]:
from bs4 import BeautifulSoup as bs
def pull_links_from_html(url, limiting_domain, html):
    links_lst = []
    soup = bs(html, 'html.parser')
    links = soup.find_all('a', href = True)
    for link in links:
        print(link)
        href = link.get('href')
        print(href)
        link_convert = convert_if_relative_url(url, href)
        if link_convert and is_url_ok_to_follow(link_convert, limiting_domain):
            links_lst.append(link_convert)
    
    return list(set(links_lst))

links_lst = pull_links_from_html(starting_url, limiting_domain, html)
links_lst

<a href="index.html#content" rel="section">Skip to Content</a>
index.html#content
<a href="azindex/index.html">AZ Index</a>
azindex/index.html
<a href="index.html">Catalog Home</a>
index.html
<a href="http://www.uchicago.edu">Institution Home</a>
http://www.uchicago.edu
<a href="http://college.uchicago.edu">The University of Chicago</a>
http://college.uchicago.edu
<a href="index.html">2014-2015 Catalog</a>
index.html
<a class="cl-menu-btn" href="index.html#">Toggle Navigation</a>
index.html#
<a class="stickytop" href="index.html#header">Back to top</a>
index.html#header
<a href="thecollege/introduction/index.html">Introduction</a>
thecollege/introduction/index.html
<a href="thecollege/programsofstudy.1.html">Programs of Study</a>
thecollege/programsofstudy.1.html
<a href="thecollege/thecurriculum.1.html">The Curriculum</a>
thecollege/thecurriculum.1.html
<a href="thecollege/academicregulationsprocedures/index.html">Academic Regulations and Procedures</a>
thecollege/academicregulationsp

['http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/interdisciplinaryopportunities/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/offcampusstudyprograms.1.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/researchopportunities/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/introduction/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/preparationforprofessionalstudy/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreeprograms/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/academicregulationsprocedures

In [None]:
def queue_unique_url(master_urls_lst, page_urls_lst, used_urls):
    for url in page_urls_lst:
        if url not in master_urls_lst and url not in used_urls:
            print(f"{url} is unique")
            master_urls_lst.append()
        else:
            print(f"{url} is not unique")

In [None]:
def unqueue_used_url(master_urls_lst, current_url):
    used_urls = []
    used_urls.append(master_urls_lst)
    master_urls_lst.pop
    print(f"Current URL has been removed from the list: {current_url}")
    return used_urls

In [None]:
# def pull_course_title(assembled_urls):
#     first_url = assembled_urls[0]
#     page_response = requests.get(first_url)
#     soup = bs(page_response.content, 'html.parser')

#     course_blocks = soup.find_all('p', class_='courseblocktitle')
    
#     course_codes = []
#     for block in course_blocks:
#         # Skip if this is part of a subsequence
#         if is_subsequence(block.parent):
#             continue
            
#         strong_text = block.find('strong').text
        
#         if strong_text:
#             code = strong_text.split('.')[0].strip()
#             code_cleaned = code.replace('\xa0', ' ')
            
#             if '-' in code_cleaned:
#                 # Split sequence into individual codes
#                 dept = code_cleaned.split()[0]  
#                 numbers = code_cleaned.split()[1].split('-')  
#                 for num in numbers:
#                     course_codes.append(f"{dept} {num}")
#             else:
#                 course_codes.append(code_cleaned)

#     return course_codes

# title_lst = pull_course_title(assembled_urls)
# title_lst    

In [None]:
def pull_course_code(master_url_lst, limiting_domain):
    # html = pull_html(master_url_lst[0], limiting_domain)
    html = pull_html(master_url_lst, limiting_domain)
    soup = bs(html, 'html.parser')

    course_blocks = soup.find_all('p', class_='courseblocktitle')
    print(course_blocks)

    course_codes = []
    if course_blocks:
        for block in course_blocks:
            if is_subsequence(block.parent):
                continue
            
            strong_text = block.find('strong').text

            if strong_text:
                code = strong_text.split('.')[0].strip()
                code_cleaned = code.replace('\xa0', ' ')

                if '-' in code_cleaned:
                    dept = code_cleaned.split()[0]
                    number_sequence = code_cleaned.split()[1]
                    numbers = number_sequence.split('-')
                    for num in numbers:
                        course_codes.append(f"{dept} {num}")
                else:
                    course_codes.append(code_cleaned)
    else:
        print(f"No course blocks found in {master_url_lst}")
    
    return course_codes

limiting_domain = "classes.cs.uchicago.edu"
links_lst = "https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/anthropology/index.html"
title_lst = pull_course_code(links_lst, limiting_domain)
title_lst

    

[<p class="courseblocktitle"><strong>ANTH 20100.  The Inka and Aztec States.  100 Units.</strong></p>, <p class="courseblocktitle"><strong>ANTH 20405.  Anthropology of Disability.  100 Units.</strong></p>, <p class="courseblocktitle"><strong>ANTH 20535.  The Social Life of Clean Energy.  100 Units.</strong></p>, <p class="courseblocktitle"><strong>ANTH 20701-20702.  Introduction to African Civilization I-II.</strong></p>, <p class="courseblocktitle"><strong>ANTH 20701.  Introduction to African Civilization I.  100 Units.</strong></p>, <p class="courseblocktitle"><strong>ANTH 20702.  Introduction to African Civilization II.  100 Units.</strong></p>, <p class="courseblocktitle"><strong>ANTH 21015.  Media, Culture, and Society.  100 Units.</strong></p>, <p class="courseblocktitle"><strong>ANTH 21102.  Classical Readings in Anthropology: History and Theory of Human  Evolution.  100 Units.</strong></p>, <p class="courseblocktitle"><strong>ANTH 21107.  Classical Readings in Anthropology: Ant

['ANTH 20100',
 'ANTH 20405',
 'ANTH 20535',
 'ANTH 20701',
 'ANTH 20702',
 'ANTH 21015',
 'ANTH 21102',
 'ANTH 21107',
 'ANTH 21201',
 'ANTH 21217',
 'ANTH 21225',
 'ANTH 21230',
 'ANTH 21251',
 'ANTH 21254',
 'ANTH 21255',
 'ANTH 21264',
 'ANTH 21265',
 'ANTH 21303',
 'ANTH 21305',
 'ANTH 21322',
 'ANTH 21401',
 'ANTH 21406',
 'ANTH 21420',
 'ANTH 21610',
 'ANTH 21725',
 'ANTH 22000',
 'ANTH 22105',
 'ANTH 22123',
 'ANTH 22125',
 'ANTH 22130',
 'ANTH 22150',
 'ANTH 22205',
 'ANTH 22400',
 'ANTH 22530',
 'ANTH 22535',
 'ANTH 22606',
 'ANTH 22609',
 'ANTH 22710',
 'ANTH 22715',
 'ANTH 22910',
 'ANTH 23101',
 'ANTH 23102',
 'ANTH 23103',
 'ANTH 23600',
 'ANTH 23620',
 'ANTH 23630',
 'ANTH 23715',
 'ANTH 23805',
 'ANTH 24001',
 'ANTH 24002',
 'ANTH 24003',
 'ANTH 24101',
 'ANTH 24102',
 'ANTH 24315',
 'ANTH 24320',
 'ANTH 24511',
 'ANTH 24512',
 'ANTH 24705',
 'ANTH 24800',
 'ANTH 25103',
 'ANTH 25116',
 'ANTH 25117',
 'ANTH 25200',
 'ANTH 25305',
 'ANTH 25310',
 'ANTH 25325',
 'ANTH 254

In [None]:
def pull_course_title(master_url_lst, limiting_domain):
    