# Web Scraping to complete the dataset

In [1]:
import re
import time
import pandas as pd
import seaborn as sns

Web Scraping packages:

In [2]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup as soup
from urllib.error import URLError

In [3]:
# Load the interim course table; the scraped columns are attached to this frame
# at the end of the notebook.
df_courses = pd.read_csv('../Data/interim/Courses.csv')

In [4]:
# Preview the first rows; the `url` column is the one the scraping step iterates over.
df_courses.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75.0,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,WebDevelopment,4.0,2013-01-03,All Levels,True
1,19603,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,True,50.0,47886,285,125,All Levels,12.5 hours,2012-06-18T16:52:34Z,WebDevelopment,12.5,2012-06-18,All Levels,True
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,https://www.udemy.com/chatbots/,True,50.0,2577,529,64,All Levels,4.5 hours,2016-06-30T16:57:08Z,WebDevelopment,4.5,2016-06-30,All Levels,True
3,197836,Projects in HTML5,https://www.udemy.com/projects-in-html5/,True,60.0,8777,206,75,Intermediate Level,15.5 hours,2014-06-17T05:43:50Z,WebDevelopment,15.5,2014-06-17,Intermediate Level,True
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,https://www.udemy.com/html-css-more/,True,20.0,23764,490,58,Beginner Level,5.5 hours,2015-10-17T04:52:25Z,WebDevelopment,5.5,2015-10-17,Beginner Level,True


In [5]:
# Request headers read by webScraping() for every page it fetches. A desktop-browser
# user-agent string replaces the one urllib would send by default
# (presumably so the site serves the regular course page -- confirm).
# A `global` statement is not needed here: names assigned at module level are
# already global.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}

In [6]:
def reviewExtraction(soup):
    """
    Return the text of the featured review found on a course page.

    Parameters
    ----------
    soup : object exposing ``find`` (a parsed BeautifulSoup page in this notebook).

    Returns
    -------
    str or None
        Text of the first ``<div>`` whose class is
        ``ud-component--clp--featured-review-content``; None when the page has
        no such element.
    """
    try:
        return soup.find('div', class_='ud-component--clp--featured-review-content').text
    except AttributeError:
        # find() gave back None (no featured review), so there is no `.text`.
        # Only this case is handled: a bare `except` would also swallow
        # KeyboardInterrupt and make a long scraping run impossible to stop here.
        return None

In [7]:
def AverageRating(soup):
    """
    Extract the average-rating entry embedded in a course page.

    The rating is read from the ``data-component-args`` attribute of the
    ``<div>`` whose class is ``ud-component--clp--course-reviews-display``.

    Parameters
    ----------
    soup : object exposing ``find`` (a parsed BeautifulSoup page in this notebook).

    Returns
    -------
    list of str or None
        Every raw match found in the attribute, key included, e.g.
        ``['"averageRating":4.5']``. None when the element or the attribute is
        missing, or when the attribute holds no rating in ``d.ddd`` form.
    """
    try:
        component = soup.find('div', class_='ud-component--clp--course-reviews-display')
        component_args = component['data-component-args']
        # The dot is escaped so that only a literal decimal point is accepted.
        matches = re.findall(r'"averageRating":\d\.\d+', component_args)
    except (AttributeError, KeyError, TypeError):
        # Element not found (None is not subscriptable), attribute absent, or
        # attribute value that is not text.
        return None

    # An empty match list means "no rating": report it as None, as documented.
    return matches or None

In [8]:
def RequirementsExtraction(soup):
    """
    Collect the course requirements listed on a course page.

    Parameters
    ----------
    soup : object exposing ``find_all`` (a parsed BeautifulSoup page in this notebook).

    Returns
    -------
    list of str or None
        Text of every ``<li>`` whose class is ``requirements__item``, in page
        order; None when the page lists no requirements.
    """
    try:
        items = [item.text for item in soup.find_all('li', class_='requirements__item')]
    except AttributeError:
        # The page object (or one of the returned items) lacks the expected attribute.
        return None

    # Report "nothing found" as None rather than as an empty list.
    return items or None

In [9]:
def webScraping(url):
    """
    Download a course page and extract its review, rating and requirements.

    The request is sent with the module-level ``headers``. The page is parsed
    with BeautifulSoup's ``html.parser`` and handed to reviewExtraction,
    AverageRating and RequirementsExtraction.

    Parameters
    ----------
    url : address of the course page.

    Returns
    -------
    tuple
        ``(review, avgRating, requirements)``. All three are None when the
        download fails with an HTTPError or URLError; in that case the url and
        the problem are printed.
    """
    req = Request(url, headers=headers)
    try:
        # The context manager closes the connection as soon as the body is read.
        with urlopen(req) as response:
            webpage = response.read()
    except HTTPError as err:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        print(url, err)
        return None, None, None
    except URLError:
        print(url, "Server down or incorrect domain")
        return None, None, None

    soup_page = soup(webpage, 'html.parser')
    review = reviewExtraction(soup_page)
    avgRating = AverageRating(soup_page)
    requirements = RequirementsExtraction(soup_page)

    return review, avgRating, requirements

In [10]:
def chunker(seq, size, request_delay=15, final_delay=10):
    """
    Scrape every URL in ``seq.url`` and return the results grouped by field.

    ``seq`` is walked in consecutive slices of ``size`` rows and webScraping is
    called once per URL, pausing after each call.

    Parameters
    ----------
    seq : table with a ``url`` column (a DataFrame in this notebook).
    size : number of rows per slice.
    request_delay : seconds to sleep after each request (default 15).
    final_delay : seconds to sleep once all slices are done (default 10).

    Returns
    -------
    list of tuple
        The per-URL ``(review, avgRating, requirements)`` results transposed
        into one tuple per field: ``[reviews, avgRatings, requirements]``.
        An empty list when ``seq`` has no rows.

    NOTE(review): slices are processed back to back, so ``size`` does not change
    the pacing; the only extra pause happens after the last slice. Confirm
    whether a pause between slices was intended.
    """
    scraping_cols = []
    for pos in range(0, len(seq), size):
        chunk = seq[pos:pos + size]
        for url in chunk.url:
            scraping_cols.append(webScraping(url))
            time.sleep(request_delay)

    # Transpose [(review, rating, reqs), ...] into one tuple per field.
    unzip_scraping_cols = list(zip(*scraping_cols))
    time.sleep(final_delay)

    return unzip_scraping_cols

In [11]:
# Rows to scrape in this run. `.loc` slices by index label and includes both
# endpoints, so labels 1000 through 2000 are selected.
df = df_courses.loc[1000: 2000, :]

In [12]:
# Check that the subset starts at the expected row.
df.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
1000,309820,ASP.Net MVC Quick Start,https://www.udemy.com/aspnet-mvc-quick-start/,False,0.0,15092,1234,65,Intermediate Level,5.5 hours,2016-08-23T15:59:10Z,WebDevelopment,5.5,2016-08-23,Intermediate Level,False
1001,1163894,Build a Chatbot integrated Website using Boots...,https://www.udemy.com/build-a-chatbot-integrat...,True,50.0,165,13,28,All Levels,3 hours,2017-04-28T16:54:20Z,WebDevelopment,3.0,2017-04-28,All Levels,True
1002,320798,How To Create A WordPress Website Without Payi...,https://www.udemy.com/wordpress-website-in-und...,True,20.0,4333,16,17,Beginner Level,1 hour,2014-10-20T11:53:12Z,WebDevelopment,1.0,2014-10-20,Beginner Level,True
1003,611688,PSD to Bootstrap 3 for Beginners using HTML & CSS,https://www.udemy.com/psd-to-bootstrap/,True,20.0,976,93,46,Beginner Level,2.5 hours,2015-11-03T21:03:34Z,WebDevelopment,2.5,2015-11-03,Beginner Level,True
1004,495958,Build Advanced Responsive Websites with Founda...,https://www.udemy.com/foundation-5-sass-learn-...,True,200.0,1276,245,271,All Levels,22.5 hours,2015-07-08T22:31:50Z,WebDevelopment,22.5,2015-07-08,All Levels,True


In [None]:
# Long-running cell: chunker() sleeps 15 s after every request, so the run takes
# at least 15 s times the number of rows in `df`, plus download time.
unzip_scraping_cols = chunker(df, 100)

In [25]:
# Attach the scraped fields to the rows that were scraped.
# - The target rows come from `df.index` (the subset passed to chunker) rather
#   than from a label range repeated on every line.
# - Each field is wrapped in an object Series carrying that same index, so values
#   that are themselves lists (avgRating, requirements) stay one value per row
#   and are aligned by label.
reviews, avg_ratings, requirements = unzip_scraping_cols
scraped_rows = df.index

df_courses.loc[scraped_rows, 'review'] = pd.Series(list(reviews), index=scraped_rows, dtype=object)
df_courses.loc[scraped_rows, 'avgRating'] = pd.Series(list(avg_ratings), index=scraped_rows, dtype=object)
df_courses.loc[scraped_rows, 'requirements'] = pd.Series(list(requirements), index=scraped_rows, dtype=object)