In [None]:
# Note: Use standard account rather than admin account

In [None]:
import os
import re

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
### PARAMETERS ###

# Number of pages in the course list, one constant per language.
# To find the value, open the last page of the list and check its query string,
# e.g. /application/en/courses-solr?page=39, then add 1, e.g. LAST_PAGE_EN = 40.
# The scraping loop requests pages 0 through LAST_PAGE - 1.
LAST_PAGE_EN = 40
LAST_PAGE_FR = 40

# True: scrape the French site (French login page, French course list, output file
# 'scraped_fr.csv'). False: scrape the English equivalents instead.
FRENCH = True

In [None]:
### CREDENTIALS ###
# Read the login from environment variables so it is never stored in the notebook.
USERNAME = os.environ.get('GCCAMPUS_USERNAME')
PASSWORD = os.environ.get('GCCAMPUS_PASSWORD')
# Fail fast with an explicit exception rather than 'assert' (asserts are skipped when
# Python runs with -O), and treat an empty value the same as an unset one so the
# problem surfaces here instead of as a failed login later.
if not USERNAME:
    raise RuntimeError('Missing USERNAME: set the GCCAMPUS_USERNAME environment variable')
if not PASSWORD:
    raise RuntimeError('Missing PASSWORD: set the GCCAMPUS_PASSWORD environment variable')

In [None]:
# Start a Chrome session driven by Selenium; every later cell reuses 'browser'
browser = webdriver.Chrome()

In [None]:
# Navigate to GCcampus and login.
# The login page differs by language, so it is selected from the FRENCH flag.
if FRENCH:
    main_url = 'https://idp.csps-efpc.gc.ca/idp/login-fr.jsp'
else:
    main_url = 'https://idp.csps-efpc.gc.ca/idp/Authn/UserPassword'

browser.get(main_url)
# Elements are located with find_element(By.<strategy>, value): the older
# find_element_by_* helpers were deprecated in Selenium 4 and removed in 4.3.
browser.find_element(By.ID, 'j_username').send_keys(USERNAME)
browser.find_element(By.ID, 'j_password').send_keys(PASSWORD)
# Presumably the privacy-notice checkbox (id 'cbPrivacy'); it is clicked before submitting
browser.find_element(By.ID, 'cbPrivacy').click()
browser.find_element(By.XPATH, "//button[@type='submit']").click()

In [None]:
# Loop through every page of the course list and collect all course links
if FRENCH:
    list_url = 'https://learn-apprendre.csps-efpc.gc.ca/application/fr/courses-solr?page='
else:
    list_url = 'https://learn-apprendre.csps-efpc.gc.ca/application/en/courses-solr?page='
last_page = LAST_PAGE_FR if FRENCH else LAST_PAGE_EN
course_links = []
for page_num in range(last_page):
    # Pages are requested as ?page=0 .. ?page=last_page - 1
    browser.get(list_url + str(page_num))
    # find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector,
    # which was removed in Selenium 4.3
    anchors = browser.find_elements(By.CSS_SELECTOR, '.field-items a')
    for anchor in anchors:
        href = anchor.get_attribute('href')
        # get_attribute returns None for an anchor without href; skip those so a
        # None never reaches browser.get() in the scraping loop
        if href:
            course_links.append(href)

In [None]:
# Compile regex to extract course codes from page titles (titles are upper-cased
# before matching).
# Base code: one letter followed by three digits, e.g. 'C451'
# Optional non-capturing group 1: hyphen or en dash plus one digit, e.g. 'C451-2'
# Optional non-capturing group 2: ' – MODULE <digit>' suffix, e.g. 'G110 – MODULE 3'
# Because neither group captures, findall() returns the full matched code string.
regex = re.compile(pattern=r'[a-zA-Z]{1}\d{3}(?:[–-]{1}\d{1})?(?:\s{1}[–-]{1}\s{1}MODULE\s{1}\d{1})?')

In [None]:
# Dict to map problematic / exceptional course codes.
# Keys are codes as extracted from the page title (after en dashes have been
# normalized to hyphens); values are the keys stored in 'desc_dict' instead.
EXCEPTION_DICT = {
    'H200': 'H200_MODULE 1'
}

In [None]:
# For each link in 'course_links', navigate to the page, grab the course description
# (HTML tags included), derive a key from the course code in the page title, and
# save the description to 'desc_dict' under that key.
desc_dict = {}
total_links = len(course_links)
for i, link in enumerate(course_links):
    # Progress indicator: current position / total number of links
    print('{0} / {1}'.format(i + 1, total_links))
    browser.get(link)
    # Grab description. find_elements(By.CSS_SELECTOR, ...) replaces
    # find_elements_by_css_selector, which was removed in Selenium 4.3.
    # Indexing [0] raises IndexError if the page has no matching element.
    desc = browser.find_elements(By.CSS_SELECTOR, '.field-item[property="content:encoded"]')[0].get_attribute('innerHTML')
    # Grab title and extract course code
    title = browser.find_elements(By.CSS_SELECTOR, '.page-title')[0].get_attribute('innerHTML')
    title = title.upper()
    title_search = regex.findall(title)
    # Use the first code found in the title; fall back to the page URL when none matches
    pkey = title_search[0] if title_search else link
    # Normalize en dashes to plain hyphens so keys are consistent
    pkey = pkey.replace('–', '-')
    if pkey in EXCEPTION_DICT:
        pkey = EXCEPTION_DICT[pkey]
    # A later page yielding the same key overwrites the earlier entry
    desc_dict[pkey] = desc

In [None]:
# Store 'desc_dict' in a two-column DataFrame for processing.
# Building directly from the (key, value) pairs with explicit column names avoids
# the in-place reset_index and also works when 'desc_dict' is empty: the
# from_dict(orient='index') + reset_index route then yields a single column, and
# assigning two column names to it raises.
df = pd.DataFrame(list(desc_dict.items()), columns=['course_code', 'desc'])

In [None]:
# Rewrite site-relative hrefs as absolute URLs, for both the EN and FR paths
link_prefixes = [
    ('href="/application/en/',
     'href="https://learn-apprendre.csps-efpc.gc.ca/application/en/'),
    ('href="/application/fr/',
     'href="https://learn-apprendre.csps-efpc.gc.ca/application/fr/'),
]
desc_abs = df['desc']
for relative_prefix, absolute_prefix in link_prefixes:
    # Literal (non-regex) substitution of the prefix
    desc_abs = desc_abs.astype(str).str.replace(relative_prefix, absolute_prefix, regex=False)
df['desc'] = desc_abs

In [None]:
# Strip the "opens in a new window" notices (English and French), leaving one space
junk_phrases = (
    ' This link will open in a new window',
    ' Ce lien va ouvrir dans une nouvelle fenêtre',
)
desc_no_junk = df['desc']
for phrase in junk_phrases:
    desc_no_junk = desc_no_junk.astype(str).str.replace(phrase, ' ', regex=False)
df['desc'] = desc_no_junk

In [None]:
# Swap carriage returns and line feeds for single spaces
desc_flat = df['desc']
for newline_char in ('\r', '\n'):
    desc_flat = desc_flat.astype(str).str.replace(newline_char, ' ', regex=False)
df['desc'] = desc_flat

In [None]:
# Collapse every run of two or more spaces into a single space
desc_as_text = df['desc'].astype(str)
df['desc'] = desc_as_text.str.replace(pat=r'  +', repl=' ', regex=True)

In [None]:
# Write the cleaned descriptions to CSV; the file name carries the scraped language.
# Note: to_csv's default also writes the DataFrame index as the first, unnamed column.
lang_suffix = 'fr' if FRENCH else 'en'
output_name = 'scraped_{0}.csv'.format(lang_suffix)
df.to_csv(output_name, sep=',', encoding='utf-8')