## Project Overview

Data scraping (also called web scraping) is the process of extracting information from a website. In this project, I extract course details from the edX portal.

## Importing Python Libraries

In [1]:
#importing necessary python libraries
import os
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## Scraping the main course page - [edX](https://www.edx.org/search?tab=course)

In [2]:
#start a Chrome browser session; 'chromedriver.exe' is presumably the path of the
#driver binary - TODO confirm against the installed Selenium version
driver = webdriver.Chrome('chromedriver.exe')
#open the edX search page on its course tab
driver.get('https://www.edx.org/search?tab=course')

In [3]:
#calculating number of pages
#the last button matching the numbered-page class is read as the highest page number
#find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases no longer provide
page_str = driver.find_element(By.XPATH, '(//button[@class="btn page-link"])[last()]').text
total_page = int(page_str)
print(f'There are {total_page} pages of courses available in edX')

There are 42 pages of courses available in edX


In [4]:
#Xpath of the courses; it is the same on every page, so it is built once
c_xpath = '//div[@class="discovery-card Verified and Audit col col-xl-3 mb-4 scrollable-discovery-card-spacing"]/a[@class="discovery-card-link"]'
course_links = []
try:
    #looping through the pages to get course links
    for page in range(1, total_page + 1):
        #get a list of all courses available in the current page
        courses = driver.find_elements(By.XPATH, c_xpath)
        #extract the links of the courses
        course_links.extend(course.get_attribute('href') for course in courses)
        #there is no next page to open after the last one
        if page == total_page:
            break
        #click the next page button
        driver.find_element(By.XPATH, '//button[@class="btn next page-link"]').click()
        #NOTE(review): cards of the previous page may already satisfy this wait, so it can
        #return immediately; the fixed sleep below is what really gives the page time to load
        WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, c_xpath)))
        #wait for another 30 seconds for the page to load completely
        time.sleep(30)
finally:
    #end the browser session even when a page fails; quit() also stops the
    #driver process, which close() leaves running
    driver.quit()

print(f'There are {len(course_links)} courses available')

There are 998 courses available


## Scraping the individual course pages

In [5]:
#writing functions to extract and return required value from each edX course page
#these functions will return 'Missing' if certain fields are not found in course page

def get_title():
    """Return the title of the course page currently loaded in ``driver``.

    The title is the text of the ``h1`` element with class
    ``course-intro-heading mb-2``, looked up through the global ``driver``.

    Returns:
        str: the heading text, or 'Missing' when the lookup fails.
    """
    try:
        return driver.find_element(By.XPATH, '//h1[@class="course-intro-heading mb-2"]').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'
    
def get_short_description():
    """Return the short description of the course page loaded in ``driver``.

    Reads the text of the first ``p`` directly under the ``div`` with class
    ``course-intro-lead-in mb-3``, looked up through the global ``driver``.

    Returns:
        str: the paragraph text, or 'Missing' when the lookup fails.
    """
    try:
        return driver.find_element(By.XPATH, '//div[@class="course-intro-lead-in mb-3"]/p').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'
    
def get_length():
    """Return the course length shown on the course page loaded in ``driver``.

    Takes the 1st ``li`` with class ``list-group-item d-flex row px-0`` and
    returns the text of its ``div`` child with class ``col``. The row is
    chosen purely by position.

    Returns:
        str: the cell text, or 'Missing' when either lookup fails.
    """
    try:
        row = driver.find_element(By.XPATH, '(//li[@class="list-group-item d-flex row px-0"])[1]')
        return row.find_element(By.XPATH, './div[@class="col"]').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'
    
def get_effort():
    """Return the weekly effort shown on the course page loaded in ``driver``.

    Takes the 2nd ``li`` with class ``list-group-item d-flex row px-0`` and
    returns the text of its ``div`` child with class ``col``. The row is
    chosen purely by position.

    Returns:
        str: the cell text, or 'Missing' when either lookup fails.
    """
    try:
        row = driver.find_element(By.XPATH, '(//li[@class="list-group-item d-flex row px-0"])[2]')
        return row.find_element(By.XPATH, './div[@class="col"]').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'

def get_price():
    """Return the price shown on the course page loaded in ``driver``.

    Takes the 3rd ``li`` with class ``list-group-item d-flex row px-0``,
    reads the text of its ``div`` child with class ``col`` and keeps the part
    that starts at the first ``$`` or ``₹`` and runs to the end of that line.
    The row is chosen purely by position.

    Returns:
        str: the price text, or 'Missing' when a lookup fails or the cell
        contains no ``$`` / ``₹``.
    """
    try:
        row = driver.find_element(By.XPATH, '(//li[@class="list-group-item d-flex row px-0"])[3]')
        cell_text = row.find_element(By.XPATH, './div[@class="col"]').text
        #extract only the value starting with $ or ₹; no match raises IndexError -> 'Missing'
        return re.findall(r'[\$\₹].*', cell_text)[0]
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'

def get_institution():
    """Return the institution shown on the course page loaded in ``driver``.

    Takes the 4th ``li`` with class ``list-group-item d-flex row px-0`` and
    returns the text of its ``div`` child with class ``col``. The row is
    chosen purely by position.

    Returns:
        str: the cell text, or 'Missing' when either lookup fails.
    """
    try:
        row = driver.find_element(By.XPATH, '(//li[@class="list-group-item d-flex row px-0"])[4]')
        return row.find_element(By.XPATH, './div[@class="col"]').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'

def get_subject():
    """Return the subject shown on the course page loaded in ``driver``.

    Takes the 5th ``li`` with class ``list-group-item d-flex row px-0`` and
    returns the text of its ``div`` child with class ``col``. The row is
    chosen purely by position.

    Returns:
        str: the cell text, or 'Missing' when either lookup fails.
    """
    try:
        row = driver.find_element(By.XPATH, '(//li[@class="list-group-item d-flex row px-0"])[5]')
        return row.find_element(By.XPATH, './div[@class="col"]').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'

def get_level():
    """Return the level shown on the course page loaded in ``driver``.

    Takes the 6th ``li`` with class ``list-group-item d-flex row px-0`` and
    returns the text of its ``div`` child with class ``col``. The row is
    chosen purely by position.

    Returns:
        str: the cell text, or 'Missing' when either lookup fails.
    """
    try:
        row = driver.find_element(By.XPATH, '(//li[@class="list-group-item d-flex row px-0"])[6]')
        return row.find_element(By.XPATH, './div[@class="col"]').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'

def get_prerequisites():
    """Return the prerequisites text of the course page loaded in ``driver``.

    Reads the text of the first ``p`` found anywhere inside the ``div`` with
    class ``col prerequisite-sidebar``, looked up through the global
    ``driver``.

    Returns:
        str: the paragraph text, or 'Missing' when the lookup fails.
    """
    try:
        return driver.find_element(By.XPATH, '//div[@class="col prerequisite-sidebar"]//p').text
    except Exception:
        #Exception instead of a bare except, so Ctrl+C / kernel interrupt still stops the scrape
        return 'Missing'

In [6]:
#empty DataFrame whose columns name every field collected per course
course_details = pd.DataFrame(
    columns=[
        'Course Link',
        'Title',
        'Short Description',
        'Length',
        'Effort',
        'Price',
        'Institution',
        'Subject',
        'Level',
        'Prerequisites',
    ]
)

In [7]:
#loop through course link and fetch required fields
course_records = []
for course_link in course_links:
    #a fresh browser per course; the get_* functions read this global driver
    driver = webdriver.Chrome('chromedriver.exe')
    try:
        driver.get(course_link)

        #extract information using functions and storing it in temp dict
        course_dict = {
            'Course Link': course_link,
            'Title': get_title(),
            'Short Description': get_short_description(),
            'Length': get_length(),
            'Effort': get_effort(),
            'Price': get_price(),
            'Institution': get_institution(),
            'Subject': get_subject(),
            'Level': get_level(),
            'Prerequisites': get_prerequisites(),
        }
        course_records.append(course_dict)
    finally:
        #always end the session, even when a page fails; quit() also stops the
        #driver process, which close() leaves running
        driver.quit()

#append the extracted info to the DataFrame in one step
#(DataFrame.append no longer exists in pandas 2.x and copied the whole frame on every row)
if course_records:
    scraped_rows = pd.DataFrame(course_records, columns=course_details.columns)
    course_details = pd.concat([course_details, scraped_rows], ignore_index=True)

In [8]:
#view the first rows of the DataFrame to check what was scraped
course_details.head()

Unnamed: 0,Course Link,Title,Short Description,Length,Effort,Price,Institution,Subject,Level,Prerequisites
0,https://www.edx.org/course/cs50s-introduction-...,CS50's Introduction to Computer Science,An introduction to the intellectual enterprise...,12 Weeks,6–18 hours per week,$90 USD,HarvardX,Computer Science,Introductory,None.
1,https://www.edx.org/course/cs50s-web-programmi...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing
2,https://www.edx.org/course/entrepreneurship-in...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing
3,https://www.edx.org/course/cs50s-introduction-...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing
4,https://www.edx.org/course/rhetoric-art-of-per...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing


> Data for a few courses was missed due to network issues. Let's separate them and extract their details.

## Handling missed links

In [12]:
#collect the links of the courses whose title came back as 'Missing'
missed_links = course_details.loc[course_details['Title'] == 'Missing', 'Course Link'].tolist()
print(f'{len(missed_links)} courses are missed')

35 courses are missed


In [13]:
#keep only the rows whose title was scraped; rows marked 'Missing' are removed
course_details = course_details.loc[course_details['Title'].ne('Missing')]

In [14]:
#loop through missed course link and fetch required fields
retried_records = []
for course_link in missed_links:
    #a fresh browser per course; the get_* functions read this global driver
    driver = webdriver.Chrome('chromedriver.exe')
    try:
        driver.get(course_link)

        #extract information using functions and storing it in temp dict
        course_dict = {
            'Course Link': course_link,
            'Title': get_title(),
            'Short Description': get_short_description(),
            'Length': get_length(),
            'Effort': get_effort(),
            'Price': get_price(),
            'Institution': get_institution(),
            'Subject': get_subject(),
            'Level': get_level(),
            'Prerequisites': get_prerequisites(),
        }
        retried_records.append(course_dict)
    finally:
        #always end the session, even when a page fails; quit() also stops the
        #driver process, which close() leaves running
        driver.quit()

#append the extracted info to the DataFrame in one step
#(DataFrame.append no longer exists in pandas 2.x and copied the whole frame on every row)
if retried_records:
    retried_rows = pd.DataFrame(retried_records, columns=course_details.columns)
    course_details = pd.concat([course_details, retried_rows], ignore_index=True)

## Writing the data to CSV file

In [15]:
#True when at least one course still has the placeholder title 'Missing'
course_details['Title'].eq('Missing').any()

False

In [18]:
#view the first rows of the DataFrame after the missed courses were fetched again
course_details.head()

Unnamed: 0,Course Link,Title,Short Description,Length,Effort,Price,Institution,Subject,Level,Prerequisites
0,https://www.edx.org/course/cs50s-introduction-...,CS50's Introduction to Computer Science,An introduction to the intellectual enterprise...,12 Weeks,6–18 hours per week,$90 USD,HarvardX,Computer Science,Introductory,None.
1,https://www.edx.org/course/fundamentals-of-neu...,"Fundamentals of Neuroscience, Part 1: The Elec...",Learn how electricity makes the neurons in you...,5 Weeks,3–5 hours per week,$99 USD,HarvardX,Biology & Life Sciences,Introductory,"Some familiarity witharithmetic, basic algebra..."
2,https://www.edx.org/course/data-science-machin...,Data Science: Machine Learning,Build a movie recommendation system and learn ...,8 Weeks,2–4 hours per week,$49 USD,HarvardX,Data Analysis & Statistics,Introductory,This course is part of our Professional Certif...
3,https://www.edx.org/course/justice-2,Justice,This introduction to moral and political philo...,12 Weeks,3–6 hours per week,$99 USD,HarvardX,Humanities,Introductory,Missing
4,https://www.edx.org/course/child-protection-ch...,Child Protection: Children's Rights in Theory ...,"Learn how to protect children from violence, e...",16 Weeks,2–6 hours per week,$99 USD,HarvardX,Social Sciences,Introductory,Missing


Converting the special character (–, an en dash) in the Effort column to a plain hyphen so that it displays correctly when the CSV file is opened

In [21]:
#replace the en dash (–) in the Effort text with a plain ASCII hyphen (-)
course_details['Effort'] = course_details['Effort'].str.replace('–','-')

In [23]:
#writing the extracted data to CSV file
#create the output folder first so a missing folder does not fail the export after the whole scrape has run
os.makedirs('Data', exist_ok=True)
course_details.to_csv('Data/edX_Course.csv',index=False)