# Web Scraping to complete the dataset

In [1]:
import re
import time
import pandas as pd
import seaborn as sns

Web Scraping packages:

In [2]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup as soup
from urllib.error import URLError

In [3]:
# Load the interim course table. The path is relative, so it resolves against
# the directory the notebook kernel is running in.
df_courses = pd.read_csv('../Data/interim/Courses.csv')

In [4]:
# Preview the first rows to check which columns were loaded (the scraper below reads `url`).
df_courses.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75.0,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,WebDevelopment,4.0,2013-01-03,All Levels,True
1,19603,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,True,50.0,47886,285,125,All Levels,12.5 hours,2012-06-18T16:52:34Z,WebDevelopment,12.5,2012-06-18,All Levels,True
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,https://www.udemy.com/chatbots/,True,50.0,2577,529,64,All Levels,4.5 hours,2016-06-30T16:57:08Z,WebDevelopment,4.5,2016-06-30,All Levels,True
3,197836,Projects in HTML5,https://www.udemy.com/projects-in-html5/,True,60.0,8777,206,75,Intermediate Level,15.5 hours,2014-06-17T05:43:50Z,WebDevelopment,15.5,2014-06-17,Intermediate Level,True
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,https://www.udemy.com/html-css-more/,True,20.0,23764,490,58,Beginner Level,5.5 hours,2015-10-17T04:52:25Z,WebDevelopment,5.5,2015-10-17,Beginner Level,True


In [5]:
# HTTP headers attached to every request built by the scraper. Only a
# desktop-browser User-Agent is set; it is read as a module-level name by the
# cells that construct `Request` objects.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}

In [6]:
def description(soup):
    """
    Extract the course description text from a parsed course page.

    Searches the page for the <div> carrying the classes
    'show-more--content--isg5c show-more--with-gradient--2abmN' and returns
    its text content.

    Parameters
    ----------
    soup : BeautifulSoup-like object
        Parsed page exposing ``find(name, class_=...)``. May be None.

    Returns
    -------
    str or None
        The description text, or None when ``soup`` is None or the page has
        no matching <div>.
    """
    try:
        block = soup.find('div', class_='show-more--content--isg5c show-more--with-gradient--2abmN')
        return block.text
    except AttributeError:
        # Either `soup` is None or find() returned None (no such <div>).
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [7]:
def AverageRating(soup):
    """
    Extract the average-rating fragment from the reviews component of a course page.

    The function reads the 'data-component-args' attribute of the <div> with
    class 'ud-component--course-landing-page-udlite--reviews' and scans that
    text with a regular expression.

    Parameters
    ----------
    soup : BeautifulSoup-like object
        Parsed page exposing ``find(name, class_=...)``. May be None.

    Returns
    -------
    list of str or None
        Every match of the form '"averageRating":<number>' (key included),
        e.g. ['"averageRating":4.44057']. The list is empty when the attribute
        text holds no such key. None when the reviews <div> or its
        'data-component-args' attribute cannot be read.
    """
    try:
        reviews = soup.find('div', class_='ud-component--course-landing-page-udlite--reviews')
        component_args = reviews['data-component-args']
        # The decimal point is escaped so only a literal '.' matches, and the
        # fractional part is optional so whole-number averages are captured too.
        return re.findall(r'"averageRating":\d+(?:\.\d+)?', component_args)
    except (AttributeError, KeyError, TypeError):
        # soup is None / <div> missing / attribute missing or not text.
        return None

In [8]:
def RequirementsExtraction(soup):
    """
    Extract the requirements section of a course page as a single string.

    Searches the page for the <div> with class
    'ud-component--course-landing-page-udlite--requirements' and returns all
    of its text content (the section heading and the individual items are
    returned together, exactly as ``.text`` yields them).

    Parameters
    ----------
    soup : BeautifulSoup-like object
        Parsed page exposing ``find(name, class_=...)``. May be None.

    Returns
    -------
    str or None
        Text of the requirements block, or None when ``soup`` is None or the
        page has no such <div>.
    """
    try:
        section = soup.find('div', class_='ud-component--course-landing-page-udlite--requirements')
        return section.text
    except AttributeError:
        # Either `soup` is None or find() returned None (no such <div>).
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [9]:
def RatingExtraction(soup):
    """
    Extract the rating distribution from the reviews component of a course page.

    The function reads the 'data-component-args' attribute of the <div> with
    class 'ud-component--course-landing-page-udlite--reviews', collects every
    '"count":<digits>' and every '"rating":<digits>' occurrence in that text,
    and pairs them positionally (i-th count with i-th rating).

    Parameters
    ----------
    soup : BeautifulSoup-like object
        Parsed page exposing ``find(name, class_=...)``. May be None.

    Returns
    -------
    set of (str, str) or None
        (count, rating) pairs, both kept as digit strings, e.g.
        {('1305', '5'), ('35', '1')}. An empty set when the attribute text
        holds no such keys. None when the reviews <div> or its
        'data-component-args' attribute cannot be read.
    """
    try:
        reviews = soup.find('div', class_='ud-component--course-landing-page-udlite--reviews')
        component_args = reviews['data-component-args']
        # Capture groups return only the digits, so no slicing by key length is needed.
        counts = re.findall(r'"count":(\d+)', component_args)
        ratings = re.findall(r'"rating":(\d+)', component_args)
        return set(zip(counts, ratings))
    except (AttributeError, KeyError, TypeError):
        # soup is None / <div> missing / attribute missing or not text.
        return None

In [10]:
def webScraping(url, timeout=30):
    """
    Download one course page and run every extractor on it.

    The page is requested with the module-level ``headers``, parsed with
    BeautifulSoup's 'html.parser', and handed to description(),
    AverageRating(), RequirementsExtraction() and RatingExtraction().

    Parameters
    ----------
    url : str
        Address of the course page.
    timeout : float, optional
        Seconds to wait on blocking network operations before giving up
        (default 30), so a stalled connection cannot hang a scraping loop.

    Returns
    -------
    tuple
        ``(description, average_rating, requirements, rating_distribution)``
        in that order, each as returned by its extractor. When the download
        or parsing fails, ``(None, None, None, None)``.
    """
    req = Request(url, headers=headers)
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(req, timeout=timeout) as response:
            webpage = response.read()
        soup_page = soup(webpage, 'html.parser')

        descript = description(soup_page)
        avgRating = AverageRating(soup_page)
        requirements = RequirementsExtraction(soup_page)
        rating_dist = RatingExtraction(soup_page)

        return descript, avgRating, requirements, rating_dist
    except Exception:
        # Deliberate best effort: one unreachable or malformed page must not
        # abort a long scraping run. Catching Exception (not everything) lets
        # KeyboardInterrupt / SystemExit stop the run.
        return None, None, None, None

Test with a sample:

In [11]:
# Smoke test of the full pipeline on one course: the row labelled 2.
url = df_courses.loc[2, 'url']
webScraping(url)

("Do you want to build a chatbot, so a bot that can talk? Yes, a bot that can talk to your friends or customers or fans while you sleep or do something else. You can make one for your customer that keep on asking the same questions. Or if you have a community for your fans and followers that want to know your details. Use your imagination, any time you have to reply the same thing over and over again, someone else like a bot can do it for you.In the first part of the course, we'll make a chatbot without programming skills. We'll build a ChatBot that can answer frequently asked questions, and I'll show you how to teach your bot to have any other dialogs. We'll learn this by teaching our ChatBot to make job interviews.We'll use DialogFlow to process natural language. DialogFlow will help the bot to understand what users want.The chatbot will communicate to its customers via Facebook Messenger.And in the second part, we'll use NodeJS to upgrade the bot. So the basic knowledge of javascrip

In [12]:
# Fetch and parse the sample page once, so each extractor can be exercised
# on the same parsed document in the cells that follow.
req = Request(url, headers=headers)
with urlopen(req) as response:  # closes the connection once the body is read
    webpage = response.read()
page_soup = soup(webpage, 'html.parser')

In [13]:
# The parser always returns an object, so an identity check alone can never
# fail; also require that at least one node was parsed from the response.
assert page_soup is not None and page_soup.contents, "Soup is empty"

In [14]:
# Run the average-rating extractor on the sample page and display its raw
# return value (the list of regex matches, key included).
avgRating = AverageRating(page_soup)
avgRating

['"averageRating":4.44057']

In [15]:
# Run the rating-distribution extractor on the sample page and display the
# resulting set of (count, rating) string pairs.
rating_dist = RatingExtraction(page_soup)
rating_dist

{('1305', '5'), ('263', '3'), ('35', '1'), ('41', '2'), ('704', '4')}

In [16]:
# Run the description extractor on the sample page and display the text it returns.
descript = description(page_soup)
descript

"Do you want to build a chatbot, so a bot that can talk? Yes, a bot that can talk to your friends or customers or fans while you sleep or do something else. You can make one for your customer that keep on asking the same questions. Or if you have a community for your fans and followers that want to know your details. Use your imagination, any time you have to reply the same thing over and over again, someone else like a bot can do it for you.In the first part of the course, we'll make a chatbot without programming skills. We'll build a ChatBot that can answer frequently asked questions, and I'll show you how to teach your bot to have any other dialogs. We'll learn this by teaching our ChatBot to make job interviews.We'll use DialogFlow to process natural language. DialogFlow will help the bot to understand what users want.The chatbot will communicate to its customers via Facebook Messenger.And in the second part, we'll use NodeJS to upgrade the bot. So the basic knowledge of javascript

In [17]:
# Run the requirements extractor on the sample page and display the text it
# returns (heading and items come back concatenated).
requirements = RequirementsExtraction(page_soup) 
requirements

'Requirementsyou need to know what chatbots arefor the second part of the course you need to know the basics of JavaScript and Node.js'

In [43]:
# def chunker(seq, size):
#     scraping_cols = []
#     for pos in range(0, len(seq), size):
#         df = seq[pos:pos + size]
#         for url in df.url:
#             scraping_cols.append(webScraping(url))
#             time.sleep(15)
            
#     unzip_scraping_cols = list(zip(*scraping_cols)) 
#     time.sleep(10)
        
#     return unzip_scraping_cols

# unzip_scraping_cols = chunker(df, 100)
# df_courses.loc[1000:2000, 'review'] = unzip_scraping_cols[0]
# df_courses.loc[1000:2000:, 'avgRating'] = unzip_scraping_cols[1]
# df_courses.loc[1000:2000:, 'requirements'] = unzip_scraping_cols[2]

Select a random sample of 50 courses from each category:

In [21]:
# Draw 50 random courses per category. The fixed seed keeps the sample (and
# therefore the list of URLs that gets scraped) identical across
# Restart Kernel -> Run All. The index is rebuilt as 0..N-1 afterwards, so the
# labels of df_samples no longer correspond to the labels of df_courses.
df_samples = (
    df_courses
    .groupby('category')
    .apply(lambda group: group.sample(n=50, random_state=42))
    .reset_index(drop=True)
)

In [22]:
# First chunk to scrape. `.loc` slicing is label-based and includes both
# endpoints, so labels 0 through 99 are selected.
df_chunk = df_samples.loc[0: 99]

In [None]:
# Scrape every course of the chunk, one request at a time. Each entry appended
# to `scraping_cols` is the 4-tuple returned by webScraping for that URL, in
# the same order as the rows of df_chunk.
scraping_cols = []

for url in df_chunk['url']:
    scraped_fields = webScraping(url)
    scraping_cols.append(scraped_fields)
    # Pause 15 seconds before the next request.
    time.sleep(15)

In [None]:
# Attach the scraped fields to the rows they were scraped from.
# `scraping_cols` holds one 4-tuple per URL of df_chunk, in row order, so the
# result frame reuses df_chunk's index and is joined onto it. The values are
# NOT written into df_courses by row label: df_samples was re-indexed after
# sampling, so its labels do not identify rows of df_courses. To bring the
# fields back into df_courses, merge on a course key such as `id` or `url`.
scraped_columns = ['description', 'avgRating', 'requirements', 'rating_distribution']
df_scraped_fields = pd.DataFrame(scraping_cols, columns=scraped_columns, index=df_chunk.index)
df_chunk_scraped = df_chunk.join(df_scraped_fields)