<h1><center>edX Courses Scraping - Notebook 2</center></h1>

Only around 1000 courses are found on the course page of the edX website. To scrap all courses, we should follow an alternative approach. Hence, I'm going to scrap courses subject wise.

## Importing Python Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pickle

## Scarping the subject page - [edX]('https://www.edx.org/subjects')

In [2]:
driver = webdriver.Chrome('chromedriver.exe')
#opening edx subject url
driver.get('https://www.edx.org/subjects')

In [3]:
#extracting the links of the individual subject pages
all_sub = driver.find_elements_by_xpath('(//div[@class="col-12 mb-4"])[last()]//li[@class="subject-card mb-3"]/a')
all_sub_links = {}
for sub in all_sub:
    subject = sub.text
    links = sub.get_attribute('href')
    all_sub_links[subject] = links
#close the driver   
driver.close() 
#print number of courses available
print(f'There are {len(all_sub)} subjects found')

There are 31 subjects found


In [4]:
#there are two types of subject pages available in edx page
#writing a function to call respective page for each page type
def load_and_fetch_courses(link):
    driver = webdriver.Chrome('chromedriver.exe')
    driver.get(link)
    
    course_found = len(driver.find_elements_by_xpath('//div[@class="lazyload-wrapper"]'))
    driver.close()
    if course_found :
        return fetch_courses_1(link,course_found)
    else:
        return fetch_courses_2(link)

#writing a function to extract courses links from type 1 page    
def fetch_courses_1(link,course_found):
    driver = webdriver.Chrome('chromedriver.exe')
    driver.get(link)
    
    time_scroll = course_found//10
    #scroll down untill all courses get loaded
    scroll = 500
    for _ in range(time_scroll):
        time.sleep(20)
        scroll += 500
        if _ == (time_scroll - 1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        else:
            driver.execute_script("window.scrollTo(0," + str(scroll) + ");")
            
    #wait another 20 seconds for the last courses to get loaded
    time.sleep(20)   
    #get a list of all courses available in the current page
    courses_driver = driver.find_elements_by_xpath('(//div[@class="row card-columns mt-4 pt-4 discovery-card-list"])[last()]//div/a')
    #extract the links of the courses
    course_links = []
    for courses in courses_driver:
        course_links.append(courses.get_attribute('href'))
    
    #close the driver   
    driver.close()
    #return the course links
    return course_links

#writing a function to extract courses links from type 2 page  
def fetch_courses_2(link):
    driver = webdriver.Chrome('chromedriver.exe')
    driver.get(link)
    
    #finding the main course link and loading that
    main_link = driver.find_element_by_xpath('//a[@class="link d-sm-flex d-block"]').get_attribute('href')
    driver.get(main_link)
    time.sleep(20)
    #clicking the course tab of the loaded page
    driver.find_element_by_xpath('(//button[@class="show-all-link btn btn-link link d-inline-block px-0"])[last()]').click()
    time.sleep(20)
    #calculating number of pages 
    page_str = driver.find_element_by_xpath('(//button[@class="btn page-link"])[last()]').text
    total_page = int(page_str)
    
    course_links = []
    page = 1
    #looping through the pages to get course links
    while page <= total_page:  
        #Xpath of the courses
        c_xpath = '//div[@class="discovery-card Verified and Audit col col-xl-3 mb-4 scrollable-discovery-card-spacing"]/a[@class="discovery-card-link"]'
        #get a list of all courses available in the current page
        courses = driver.find_elements_by_xpath(c_xpath)
        #extract the links of the courses
        for course in courses:
            course_links.append(course.get_attribute('href'))
        #Check whether current page is last page or not
        if page != total_page:
            #click the next page button
            driver.find_element_by_xpath('//button[@class="btn next page-link"]').click()
            page += 1
            #wait until program moves to next page
            check = '//button[@aria-label="Page ' + str(page) + ', Current Page"]'
            WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, c_xpath)))
            #wait for another 30 seconds for the page to load completely
            time.sleep(30)
        else:
            break

    #close the driver
    driver.close()
    #return the course links
    return course_links

In [5]:
#fetching the course link for each subject and adding it to the dictonary
for sub in all_sub_links:
    each_sub_courses = load_and_fetch_courses(all_sub_links[sub])
    all_sub_links[sub] = [all_sub_links[sub],each_sub_courses]

In [6]:
#calculating total number of course links extracted
all_courses = []
for sub in all_sub_links:
    all_courses.extend(all_sub_links[sub][1])
#set stores only unique values
all_courses = set(all_courses)
print(f'There are {len(all_courses)} courses across all subjects in edX')

There are 2831 courses across all subjects in edX


> Since Philanthropy subject has different format, for loop got broke. Lets seperate it and extract other subjects

## Handling missed subjects

In [9]:
missed_subjects = []
#extract the subjects that didn't get loaded
for sub in all_sub_links:
    if len(all_sub_links[sub]) !=2 :
        missed_subjects.append(sub) 
#removing Philanthropy        
missed_subjects.remove('Philanthropy')
print(f'Missed subjects are {missed_subjects}')

Missed subjects are ['Philosophy & Ethics', 'Physics', 'Science', 'Social Sciences']


In [10]:
#fetching the course link for missed subjects and adding it to the dictonary
for sub in missed_subjects:
    each_sub_courses = load_and_fetch_courses(all_sub_links[sub])
    all_sub_links[sub] = [all_sub_links[sub],each_sub_courses]

In [11]:
#fetching the course link of Philanthropy 
sub_courses = fetch_courses_1(all_sub_links['Philanthropy'],0)
all_sub_links['Philanthropy'] = [all_sub_links['Philanthropy'],sub_courses]

In [12]:
#calculating total number of course links extracted
all_courses = []
for sub in all_sub_links:
    all_courses.extend(all_sub_links[sub][1])
#set stores only unique values
all_courses = set(all_courses)
print(f'There are {len(all_courses)} courses across all subjects in edX')

There are 2987 courses across all subjects in edX


In [13]:
#storing the extracted links as pickle, so that it can be used in next notebooks for further analysis
with open('Data/all_sub_links.pkl','wb') as file:
    pickle.dump(all_sub_links,file)