# Reorganizing Code from V2

In [35]:
"""
Utility code
"""
# pylint: disable-msg=invalid-name, broad-except, unused-variable
# pylint: disable-msg=len-as-condition, no-else-return, undefined-variable
# pylint: disable-msg=too-many-return-statements, superfluous-parens
# pylint: disable=R1714
import urllib.parse
import os
import requests
import bs4

######### DO NOT CHANGE THIS CODE  #########

def get_request(url):
    '''
    Issue an HTTP GET for the given URL.

    Inputs:
        url: must be an absolute URL

    Outputs:
        request object, or None when the URL is not absolute, the
        request raises, or the status code is 403 or 404

    Examples:
        get_request("http://www.cs.uchicago.edu")
    '''
    if not is_absolute_url(url):
        return None

    try:
        response = requests.get(url)
        if response.status_code in (404, 403):
            return None
    except Exception:
        # fail on any kind of error
        return None

    return response


def read_request(request):
    '''
    Return data from request object.

    Inputs:
        request: object with ``text`` and ``url`` attributes, or None
            (get_request returns None when a fetch fails)

    Outputs:
        the request text encoded as iso-8859-1 bytes, or "" if the read
        fails for any reason (including request being None)
    '''

    try:
        return request.text.encode('iso-8859-1')
    except Exception:
        # The failure may be that request itself is None, so the URL has
        # to be looked up defensively or the handler would raise too.
        url = getattr(request, "url", "<no request>")
        print("read failed: " + str(url))
        return ""


def get_request_url(request):
    '''
    Return the URL stored on the request object (its ``url`` attribute).
    '''
    true_url = request.url
    return true_url


def is_absolute_url(url):
    '''
    Report whether url is absolute, i.e. non-empty and with a
    network location (host) component.
    '''
    return url != "" and urllib.parse.urlparse(url).netloc != ""


def remove_fragment(url):
    '''Return url without its "#fragment" part.'''
    # urldefrag yields (url_without_fragment, fragment); keep the first.
    return urllib.parse.urldefrag(url)[0]


def convert_if_relative_url(current_url, new_url):
    '''
    Turn new_url into an absolute URL, using current_url as the base
    when new_url is relative.  If new_url only lacks the protocol
    (its first path segment ends in .edu/.org/.com/.net, or it starts
    with "www"), "http://" is prepended instead.

    Inputs:
        current_url: absolute URL
        new_url: the URL to convert

    Outputs:
        new absolute URL, or None if new_url is empty or current_url
        is not an absolute URL.  An already absolute new_url is
        returned unchanged.

    Examples:
        convert_if_relative_url("http://cs.uchicago.edu", "pa/pa1.html") yields
            'http://cs.uchicago.edu/pa/pa1.html'

        convert_if_relative_url("http://cs.uchicago.edu", "foo.edu/pa.html")
            yields 'http://foo.edu/pa.html'
    '''
    if new_url == "" or not is_absolute_url(current_url):
        return None

    if is_absolute_url(new_url):
        return new_url

    # First path segment, e.g. "foo.edu" in "foo.edu/pa.html".
    first_segment = urllib.parse.urlparse(new_url).path.split("/")[0]

    missing_only_protocol = (
        first_segment.endswith((".edu", ".org", ".com", ".net"))
        or new_url.startswith("www")
    )
    if missing_only_protocol:
        return "http://" + new_url

    return urllib.parse.urljoin(current_url, new_url)


# URL prefixes (https and http forms) of the catalog archives.  Any URL
# starting with one of them is rejected by is_url_ok_to_follow.
ARCHIVES = ("https://www.classes.cs.uchicago.edu/archive/2015/winter"
            "/12200-1/new.collegecatalog.uchicago.edu/thecollege/archives")
LEN_ARCHIVES = len(ARCHIVES)


ARCHIVES_HTTP = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                 "/12200-1/new.collegecatalog.uchicago.edu/thecollege/archives")
LEN_ARCHIVES_HTTP = len(ARCHIVES_HTTP)


def is_url_ok_to_follow(url, limiting_domain):
    '''
    Decide whether a URL may be followed.

    Inputs:
        url: absolute URL
        limiting domain: domain name

    Outputs:
        Returns True only if all of the following hold: the URL
        contains neither "mailto:" nor "@", it does not start with
        one of the archive prefixes, its protocol is http or https,
        it has a host that equals the limiting domain or is a
        subdomain of it, it has no fragment and no query, and its
        path has either no extension or the extension .html.

    Examples:
        is_url_ok_to_follow("http://cs.uchicago.edu/pa/pa1", "cs.uchicago.edu")
            yields True

        is_url_ok_to_follow("http://cs.cornell.edu/pa/pa1", "cs.uchicago.edu")
            yields False
    '''
    if "mailto:" in url or "@" in url:
        return False

    if url.startswith((ARCHIVES, ARCHIVES_HTTP)):
        return False

    parts = urllib.parse.urlparse(url)
    if parts.scheme not in ("http", "https"):
        return False

    if parts.netloc == "" or parts.fragment != "" or parts.query != "":
        return False

    host = parts.netloc
    in_domain = (host == limiting_domain
                 or host.endswith("." + limiting_domain))
    if not in_domain:
        return False

    # does it have the right extension
    suffix = os.path.splitext(parts.path)[1]
    return suffix in ("", ".html")


def is_subsequence(tag):
    '''
    Report whether tag is a bs4 Tag whose class attribute is exactly
    ['courseblock', 'subsequence'].
    '''
    if not isinstance(tag, bs4.element.Tag):
        return False
    if 'class' not in tag.attrs:
        return False
    return tag['class'] == ['courseblock', 'subsequence']


def is_whitespace(tag):
    '''
    Report whether tag is a bs4 NavigableString that is empty once
    stripped of surrounding whitespace.
    '''
    if not isinstance(tag, bs4.element.NavigableString):
        return False
    return tag.strip() == ""


def find_sequence(tag):
    '''
    If tag is the header for a sequence, then
    find the tags for the courses in the sequence.

    Inputs:
        tag: the tag for the sequence header

    Outputs:
        list of the sibling tags that immediately follow the header and
        are subsequence blocks; whitespace-only strings between them are
        skipped.  The scan stops at the first sibling that is neither.
    '''
    rv = []
    sib_tag = tag.next_sibling
    # The whitespace test must look at the sibling being examined, not at
    # the header: otherwise whitespace between blocks ends the scan early.
    while is_subsequence(sib_tag) or is_whitespace(sib_tag):
        if not is_whitespace(sib_tag):
            rv.append(sib_tag)
        sib_tag = sib_tag.next_sibling
    return rv


In [36]:
# Page the crawl starts from, and the domain that followed links must
# belong to (both are passed to the helper functions in later cells).
starting_url = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                    "/12200-1/new.collegecatalog.uchicago.edu/index.html")
limiting_domain = "classes.cs.uchicago.edu"

In [37]:
import bs4 as bs
import pprint
def pull_html(url, limiting_domain):
    '''
    Helper function that pulls in HTML from a given webpage

    Args:
        url: Current page URL
        limiting_domain: domain the URL must belong to
    Outputs:
        html: the page contents as returned by read_request, or None
            when the URL is not OK to follow or the request failed
    '''
    # is_url_ok_to_follow already rejects URLs without a host, so no
    # separate absolute-URL check is needed here.
    if not is_url_ok_to_follow(url, limiting_domain):
        return None

    request = get_request(url)
    if request is None:
        # get_request reports every fetch failure as None; there is
        # nothing to read in that case.
        return None

    return read_request(request)

# Fetch the starting page and pretty-print the raw result for inspection.
html = pull_html(starting_url, limiting_domain)

pprint.pprint(html)

(b'<!doctype html>\n<html xml:lang="en" lang="en" dir="ltr">\n\n<head>\n<title>'
 b'The College Catalog 2014-2015 &lt; University of Chicago Catalog</title>'
 b'\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<'
 b'meta name="description" content="2014-2015 College Catalog" />\n<link rel'
 b'="search" type="application/opensearchdescription+xml"\n\t\t\thref="thecolle'
 b'ge/search/opensearch.xml" title="University of Chicago Catalog" />\n<meta'
 b' name="viewport" content="width=device-width, initial-scale=1.0, minimum-sca'
 b'le=1.0" />\n<link href="thecollege/favicon.ico" rel="shortcut icon" />\n<l'
 b'ink rel="stylesheet" type="text/css" href="thecollege/css/reset.css" />\n'
 b'<link rel="stylesheet" type="text/css" href="thecollege/css/screen.css" medi'
 b'a="screen" />\n<link rel="stylesheet" type="text/css" href="thecollege/cs'
 b's/print.css" media="print" />\n<script type="text/javascript" src="js/jqu'
 b'ery.js"></script>\n<script type="text/javascript

In [38]:
from bs4 import BeautifulSoup as bs
def pull_links_from_html(url, limiting_domain, html):
    '''
    Scrapes links from page and assembles relative URLs

    Args:
        url: Current page URL, used as the base for relative links
        limiting_domain: domain the links must belong to
        html: Page HTML (e.g. the result of pull_html); may be empty
            or None
    Outputs:
        links_lst: list of the unique links on the page that are OK to
            follow, as absolute URLs, in the order they first appear
    '''
    if not html:
        # Nothing was fetched for this page, so there is nothing to parse.
        return []

    links_lst = []
    seen = set()
    soup = bs(html, 'html.parser')
    for link in soup.find_all('a', href=True):
        link_convert = convert_if_relative_url(url, link.get('href'))
        if not link_convert or link_convert in seen:
            continue
        if is_url_ok_to_follow(link_convert, limiting_domain):
            # Keep document order so the crawl order is reproducible.
            seen.add(link_convert)
            links_lst.append(link_convert)

    return links_lst

# Extract the followable links from the starting page and display them.
links_lst = pull_links_from_html(starting_url, limiting_domain, html)
links_lst

['http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/preparationforprofessionalstudy/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/offcampusstudyprograms.1.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/interdisciplinaryopportunities/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/researchopportunities/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/thecurriculum.1.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/azindex/index.html',
 'http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/examinationcreditandtransfercredit.1.html',
 'http://ww

In [39]:
def queue_unique_url(queued_urls_lst, page_urls_lst, used_urls):
    '''
    Queues the list of links output by pull_links_from_html and makes
    sure that the urls being appended are not already in the queued list
    or used urls list

    Args:
        queued_urls_lst: a list that contains the urls currently queued;
            new urls are appended to it in place
        page_urls_lst: an output list from pull_links_from_html
        used_urls: a list that contains urls that have already been used

    Outputs:
        None (queued_urls_lst is modified in place; a line is printed
        for every url stating whether it was unique)
    '''
    for url in page_urls_lst:
        if url not in queued_urls_lst and url not in used_urls:
            print(f"{url} is unique")
            queued_urls_lst.append(url)
        else:
            print(f"{url} is not unique")

In [40]:
def unqueue_used_url(master_urls_lst, page_urls_lst):
    '''
    Removes the URL at the front of the queued list and reports it as used

    Args:
        master_urls_lst: list of queued urls; its first element is
            removed in place (urls are queued by appending, so the
            front of the list is the oldest entry)
        page_urls_lst: unused; kept so existing callers keep working

    Outputs:
        used_urls: a list holding the removed url, or an empty list if
            master_urls_lst was empty
    '''
    used_urls = []
    if master_urls_lst:
        current_url = master_urls_lst.pop(0)
        used_urls.append(current_url)
        print(f"Current URL has been removed from the list: {current_url}")
    return used_urls

In [41]:


def queue_handler(queue_urls, new_urls, visited_urls):
    '''
    Put the not-yet-seen URLs from new_urls onto the queue.

    Args:
        queue_urls: Queue object containing URLs to be processed
        new_urls: List of new URLs found on current page
        visited_urls: Set of URLs that have already been visited
            (only read here)

    Returns:
        None (queue_urls is modified directly)
    '''
    for candidate in new_urls:
        # A URL counts as seen if it was visited or is already waiting.
        already_seen = (candidate in visited_urls
                        or candidate in queue_urls.queue)
        if already_seen:
            continue
        queue_urls.put(candidate)

In [42]:
from queue import Queue

# Crawl driver: URLs are taken from a FIFO queue seeded with starting_url,
# and the loop runs until the queue is empty (this cell sets no page limit).
url_queue = Queue()
visited = set()

url_queue.put(starting_url)

while not url_queue.empty():
    current_url = url_queue.get()
    
    # Only process if we haven't visited this URL
    if current_url not in visited:
        # Get HTML and extract links
        html = pull_html(current_url, limiting_domain)
        new_links = pull_links_from_html(current_url, limiting_domain, html)
        
        # Add current URL to visited
        # (done before queuing so queue_handler skips a link back to this page)
        visited.add(current_url)
        
        # Handle queuing of new URLs
        queue_handler(url_queue, new_links, visited)

In [43]:
def clean_title(title_text):
    '''
    Pull the course name out of a full title string.

    Args:
        title_text: takes in html title text
    Outputs:
        the stripped text between the first period and the next period,
        or title_text unchanged when it does not contain that pattern
    '''
    found = re.search(r'\.([^.]+)\..*$', title_text)
    return title_text if found is None else found.group(1).strip()

In [44]:
# Inputs for the single-page extraction run further down.
# NOTE(review): despite its name, links_lst is rebound here to one URL string.
limiting_domain = "classes.cs.uchicago.edu"
links_lst = "https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/anthropology/index.html"

In [45]:
from bs4 import BeautifulSoup as bs
import html5lib

def extract_course_info(url, limiting_domain):
    '''
    Extracts relevant information from the courseblock html elements

    Args:
        url: current page url
        limiting_domain: makes sure root domain matches

    Outputs:
        A list of dictionaries, one per course, with the keys 'code',
        'title', 'desc' and 'is_subsequence'.  A sequence header such as
        "DEPT 10100-10200" yields one entry per number, followed by one
        entry per subsequence block that follows it.  The list is empty
        when the page could not be fetched or has no course blocks.
    '''
    html = pull_html(url, limiting_domain)
    if not html:
        # Nothing was fetched for this URL, so there is nothing to parse.
        return []

    soup = bs(html, "html5lib")
    course_info = []

    for block in soup.find_all('div', class_='courseblock main'):
        title = block.find('p', class_='courseblocktitle')
        if title is None:
            continue

        # Prefer the <strong> text; fall back to the whole paragraph
        # rather than failing when the markup lacks it.
        strong = title.find('strong')
        title_text = strong.text if strong is not None else title.text

        desc = block.find('p', class_='courseblockdesc')
        desc_text = re.sub(r'^\n', '', desc.text) if desc is not None else ""
        clean_title_text = clean_title(title_text)

        # Pull main course code
        code = title_text.split('.')[0].strip().replace('\xa0', ' ')
        code_parts = code.split()

        # Not a sequence: a single course entry
        if '-' not in code or len(code_parts) < 2:
            course_info.append({
                'code': code,
                'title': clean_title_text,
                'desc': desc_text,
                'is_subsequence': False
            })
            continue

        # Handle sequences: one entry per number in the header, all
        # sharing the header's cleaned title and description.
        dept = code_parts[0]
        for num in code_parts[1].split('-'):
            course_info.append({
                'code': f"{dept} {num}",
                'title': clean_title_text,
                'desc': desc_text,
                'is_subsequence': False
            })

        # handle subsequences
        for sub in find_sequence(block):
            sub_title = sub.find('p', class_='courseblocktitle')
            if sub_title is None:
                continue
            sub_desc = sub.find('p', class_='courseblockdesc')
            course_info.append({
                'code': sub_title.text.split('.')[0].strip().replace('\xa0', ' '),
                'title': clean_title(sub_title.text),
                'desc': re.sub(r'^\n', '', sub_desc.text) if sub_desc is not None else "",
                'is_subsequence': True
            })

    return course_info

# Extract the courses from the single page URL held in links_lst and display them.
course_info = extract_course_info(links_lst, limiting_domain)
course_info

[{'code': 'ANTH 20100',
  'title': 'The Inka and Aztec States',
  'desc': 'This course is an intensive examination of the origins, structure, and meaning of two native states of the ancient Americas: the Inka and the Aztec. Lectures are framed around an examination of theories of state genesis, function, and transformation, with special reference to the economic, institutional, and symbolic bases of indigenous state development. This course is broadly comparative in perspective and considers the structural significance of institutional features that are either common to or unique expressions of these two Native American states.',
  'is_subsequence': False},
 {'code': 'ANTH 20405',
  'title': 'Anthropology of Disability',
  'desc': 'This seminar undertakes to explore "disability" from an anthropological perspective that recognizes it as a socially constructed concept with implications for our understanding of fundamental issues about culture, society, and individual differences. We expl

In [46]:
import json

def convert_json_to_dict(json_file):
    """
    Load a JSON file and return its contents.

    Inputs:
        json_file: path of the JSON file (e.g. course_map.json)

    Outputs:
        the parsed JSON value; for the course map this is a dictionary
        mapping course codes to course identifiers
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(json_file, encoding="utf-8") as f:
        return json.load(f)


# Load the course map, spot-check one lookup, then dump the whole mapping.
course_map_dict = convert_json_to_dict('course_map.json')
print(course_map_dict['ANTH 20405']) 
print(course_map_dict)

1
{'HIPS 29900': 1446, 'MENG 20000': 1800, 'ASTR 20000': 153, 'BIOS 24232': 329, 'EEUR 21100': 2332, 'LING 27130': 1709, 'HMRT 21400': 1487, 'PERS 10103': 1948, 'HIST 29801': 1393, 'MATH 26200': 1768, 'HIST 21501': 1330, 'HIST 29649': 1387, 'HIPS 27013': 1432, 'ENGL 10200': 1007, 'CHDV 24402': 675, 'ANTH 28100': 93, 'ARTH 14107': 108, 'TLGU 10300': 2468, 'PLSC 25810': 2078, 'HUMA 14200': 1464, 'HIPS 21200': 1399, 'SOCI 20004': 1005, 'TAML 20200': 2526, 'PHYS 29100': 162, 'ANTH 22530': 33, 'HIST 29302': 1378, 'TAPS 28800': 2608, 'GEOG 28201': 1254, 'AKKD 10103': 1865, 'GEOS 22060': 1084, 'TAPS 28429': 2595, 'ENGL 24409': 1043, 'CMST 29700': 501, 'ENGL 20212': 1028, 'ARTH 29704': 147, 'SPAN 20500': 2269, 'BIOS 20197': 257, 'ENGL 25958': 1055, 'HIST 29648': 1386, 'HCHR 30200': 1787, 'PLSC 28800': 2088, 'PLSC 27315': 2082, 'CMST 29004': 500, 'HIPS 29810': 1445, 'PBPL 23600': 2141, 'JWSC 20004': 538, 'SOCI 20102': 2433, 'ENGL 32302': 1066, 'BASQ 12200': 1660, 'MUSI 25800': 1842, 'BIOS 29280

In [47]:
def create_unique_word_list(course_info, course_map_dict):
    '''
    Creates a dictionary where the keys are the unique words from the title
    and description for each course, and the values are a list of course IDs
    where the word occurred

    Args:
        course_info: list of course dictionaries with the keys 'code',
            'title' and 'desc'
        course_map_dict: dictionary mapping course codes to course IDs

    Outputs:
        word_to_ids: dictionary mapping each lower-cased word (a letter
            followed by letters, digits or underscores) that is not in
            INDEX_IGNORE to the list of IDs of the courses containing it.
            Courses whose code is missing from course_map_dict are skipped.
    '''
    word_pattern = r'\b[a-zA-Z][a-zA-Z0-9_]*\b'
    word_to_ids = {}

    for course in course_info:
        code = course['code']
        if code not in course_map_dict:
            # Without an identifier the course cannot appear in the index.
            continue
        course_id = course_map_dict[code]

        title_words = re.findall(word_pattern, course['title'].lower())
        desc_words = re.findall(word_pattern, course['desc'].lower())

        # Walk the words in order of appearance (not through a set) so the
        # key order of the result is the same on every run.
        for word in title_words + desc_words:
            if word in INDEX_IGNORE:
                continue
            ids = word_to_ids.setdefault(word, [])
            if course_id not in ids:
                ids.append(course_id)

    return word_to_ids

     
# Build the word -> course IDs mapping for the extracted courses and display it.
new_unique_words = create_unique_word_list(course_info, course_map_dict)
new_unique_words

{'genesis': [0],
 'function': [0, 18, 92],
 'intensive': [0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 76],
 'perspective': [0, 1, 14, 64],
 'theories': [0, 11, 34, 36, 44, 46, 60, 62, 66, 83, 85],
 'significance': [0, 71],
 'meaning': [0, 8],
 'transformation': [0, 7, 12, 73, 95],
 'lectures': [0, 63, 64],
 'comparative': [0, 64, 89, 93, 95, 96],
 'features': [0, 40],
 'inka': [0],
 'framed': [0, 76],
 'ancient': [0, 13, 16, 64, 77, 78],
 'considers': [0, 3, 17, 39, 50],
 'structural': [0],
 'institutional': [0, 88],
 'broadly': [0, 38, 59, 88],
 'two': [0, 3, 4, 36, 59, 83],
 'americas': [0, 74],
 'common': [0],
 'expressions': [0, 99],
 'origins': [0, 3, 40, 70, 76, 82],
 'unique': [0, 2, 35, 59],
 'indigenous': [0, 5, 11, 33, 35, 36, 82],
 'aztec': [0, 40],
 'structure': [0, 11, 18, 81],
 'native': [0, 35, 36, 55, 56],
 'examination': [0, 100],
 'special': [0, 42, 54, 65, 89],
 'reference': [0],
 'around': [0, 35, 66, 77, 78, 81, 91],
 'states': [0, 3, 15, 37, 41, 54, 70, 75, 82, 90, 101],

In [48]:
def update_master_dict(master_dict, new_word_dict):
    '''
    Merge a word -> course IDs mapping into master_dict in place.

    Args:
        master_dict: dictionary mapping words to lists of course IDs;
            it is modified in place
        new_word_dict: dictionary of the same shape to merge in

    Outputs:
        master_dict itself.  Every ID is stored at most once per word,
        even if it is repeated within a single incoming list.
    '''
    for word, ids in new_word_dict.items():
        known_ids = master_dict.setdefault(word, [])
        for course_id in ids:
            if course_id not in known_ids:
                known_ids.append(course_id)
    return master_dict

# Start from an empty master index and merge in the words found so far.
master_dict = {}
updated_master_dict = update_master_dict(master_dict, new_unique_words)
updated_master_dict

{'genesis': [0],
 'function': [0, 18, 92],
 'intensive': [0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 76],
 'perspective': [0, 1, 14, 64],
 'theories': [0, 11, 34, 36, 44, 46, 60, 62, 66, 83, 85],
 'significance': [0, 71],
 'meaning': [0, 8],
 'transformation': [0, 7, 12, 73, 95],
 'lectures': [0, 63, 64],
 'comparative': [0, 64, 89, 93, 95, 96],
 'features': [0, 40],
 'inka': [0],
 'framed': [0, 76],
 'ancient': [0, 13, 16, 64, 77, 78],
 'considers': [0, 3, 17, 39, 50],
 'structural': [0],
 'institutional': [0, 88],
 'broadly': [0, 38, 59, 88],
 'two': [0, 3, 4, 36, 59, 83],
 'americas': [0, 74],
 'common': [0],
 'expressions': [0, 99],
 'origins': [0, 3, 40, 70, 76, 82],
 'unique': [0, 2, 35, 59],
 'indigenous': [0, 5, 11, 33, 35, 36, 82],
 'aztec': [0, 40],
 'structure': [0, 11, 18, 81],
 'native': [0, 35, 36, 55, 56],
 'examination': [0, 100],
 'special': [0, 42, 54, 65, 89],
 'reference': [0],
 'around': [0, 35, 66, 77, 78, 81, 91],
 'states': [0, 3, 15, 37, 41, 54, 70, 75, 82, 90, 101],

In [49]:
def create_index(master_dict):
    '''
    Flatten a word -> codes mapping into a list of "code|word" strings.

    Args:
        master_dict: dictionary mapping each word to a list of codes
    Outputs:
        list with one "code|word" string per (word, code) pair, in the
        dictionary's iteration order
    '''
    return [
        f"{code}|{word}"
        for word, code_lst in master_dict.items()
        for code in code_lst
    ]

# update_master_dict returns the dictionary it was given, so
# updated_master_dict is master_dict itself; the index is built from it.
updated_master_dict = update_master_dict(master_dict, new_unique_words)
final_index = create_index(updated_master_dict)
final_index

['0|genesis',
 '0|function',
 '18|function',
 '92|function',
 '0|intensive',
 '8|intensive',
 '9|intensive',
 '10|intensive',
 '11|intensive',
 '12|intensive',
 '13|intensive',
 '14|intensive',
 '15|intensive',
 '16|intensive',
 '76|intensive',
 '0|perspective',
 '1|perspective',
 '14|perspective',
 '64|perspective',
 '0|theories',
 '11|theories',
 '34|theories',
 '36|theories',
 '44|theories',
 '46|theories',
 '60|theories',
 '62|theories',
 '66|theories',
 '83|theories',
 '85|theories',
 '0|significance',
 '71|significance',
 '0|meaning',
 '8|meaning',
 '0|transformation',
 '7|transformation',
 '12|transformation',
 '73|transformation',
 '95|transformation',
 '0|lectures',
 '63|lectures',
 '64|lectures',
 '0|comparative',
 '64|comparative',
 '89|comparative',
 '93|comparative',
 '95|comparative',
 '96|comparative',
 '0|features',
 '40|features',
 '0|inka',
 '0|framed',
 '76|framed',
 '0|ancient',
 '13|ancient',
 '16|ancient',
 '64|ancient',
 '77|ancient',
 '78|ancient',
 '0|considers

In [50]:
def index_to_csv(final_index, csv_file):
    '''
    Write the index to a "|"-delimited file, one "code|word" row per pair.

    Args:
        final_index: dictionary mapping each word to a list of codes
            (it is iterated with .items())
        csv_file: path of the file to write; it is overwritten
    '''
    with open(csv_file, 'w', newline="") as out:
        writer = csv.writer(out, delimiter='|')
        for word, code_lst in final_index.items():
            writer.writerows([code, word] for code in code_lst)


# index_to_csv iterates .items(), so it is given the word -> IDs dictionary
# rather than the flattened final_index list.
index_to_csv(updated_master_dict, 'text_index.csv')
            


In [51]:
import queue
import json
import sys
import csv
import re
import bs4
from queue import Queue

# import util

# Words that are left out of the index.
INDEX_IGNORE = {
    'a', 'also', 'an', 'and', 'are', 'as', 'at', 'be',
    'but', 'by', 'course', 'for', 'from', 'how', 'i',
    'ii', 'iii', 'in', 'include', 'is', 'not', 'of',
    'on', 'or', 's', 'sequence', 'so', 'social', 'students',
    'such', 'that', 'the', 'their', 'this', 'through', 'to',
    'topics', 'units', 'we', 'were', 'which', 'will', 'with',
    'yet',
}


### YOUR FUNCTIONS HERE

def go(num_pages_to_crawl, course_map_filename, index_filename):
    '''
    Crawl the college catalog and generates a CSV file with an index.

    Inputs:
        num_pages_to_crawl: the number of pages to process during the crawl
        course_map_filename: the name of a JSON file that contains the mapping
          course codes to course identifiers
        index_filename: the name for the CSV of the index.

    Outputs:
        CSV file of the index.
    '''

    starting_url = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                    "/12200-1/new.collegecatalog.uchicago.edu/index.html")
    limiting_domain = "classes.cs.uchicago.edu"

    # Initialize
    url_queue = Queue()
    visited = set()
    master_dict = {}
    crawl_count = 0

    # Load course map
    course_map = convert_json_to_dict(course_map_filename)

    # Start crawl
    url_queue.put(starting_url)

    while not url_queue.empty() and crawl_count < num_pages_to_crawl:
        current_url = url_queue.get()
        if current_url in visited:
            continue

        # Mark the URL before fetching: a page that fails to load, or one
        # that links to itself, must not be queued and fetched again.
        visited.add(current_url)

        # Get page content
        html = pull_html(current_url, limiting_domain)
        if not html:
            # Failed fetches do not count towards num_pages_to_crawl.
            continue

        # Extract course info and update index
        # NOTE(review): extract_course_info fetches the page again from its
        # URL, so every crawled page is requested twice.
        courses = extract_course_info(current_url, limiting_domain)
        new_words = create_unique_word_list(courses, course_map)
        update_master_dict(master_dict, new_words)

        # Queue new URLs
        new_links = pull_links_from_html(current_url, limiting_domain, html)
        queue_handler(url_queue, new_links, visited)

        crawl_count += 1

    # Output index
    index_to_csv(master_dict, index_filename)

# Run the full crawl: process up to 100 pages and write the index to
# text_index.csv using the code -> ID mapping in course_map.json.
num_pages_to_crawl = 100
course_map_filename = 'course_map.json'
index_filename = 'text_index.csv'
go(num_pages_to_crawl, course_map_filename, index_filename)