# Web Scraping to complete the dataset

In [1]:
import re
import time
import pandas as pd
import seaborn as sns
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

In [2]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

In [3]:
df_courses = pd.read_csv('../Data/interim/Courses.csv')

In [4]:
df_courses.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75.0,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,WebDevelopment,4.0,2013-01-03,All Levels,True
1,19603,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,True,50.0,47886,285,125,All Levels,12.5 hours,2012-06-18T16:52:34Z,WebDevelopment,12.5,2012-06-18,All Levels,True
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,https://www.udemy.com/chatbots/,True,50.0,2577,529,64,All Levels,4.5 hours,2016-06-30T16:57:08Z,WebDevelopment,4.5,2016-06-30,All Levels,True
3,197836,Projects in HTML5,https://www.udemy.com/projects-in-html5/,True,60.0,8777,206,75,Intermediate Level,15.5 hours,2014-06-17T05:43:50Z,WebDevelopment,15.5,2014-06-17,Intermediate Level,True
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,https://www.udemy.com/html-css-more/,True,20.0,23764,490,58,Beginner Level,5.5 hours,2015-10-17T04:52:25Z,WebDevelopment,5.5,2015-10-17,Beginner Level,True


In [5]:
global headers

headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        }

In [7]:
req = Request(df_courses.url[0], headers=headers) 
webpage = urlopen(req).read()

In [8]:
page_soup = soup(webpage, 'html.parser')

In [49]:
# page_soup

# Finding relevant features 

Using this `page_soup` we proceed to extract some features before to create the functions that will replicate the same for all the courses in the catalog of the dataset.

### Featured Review

In [10]:
review = page_soup.find('div', class_ ='ud-component--clp--featured-review-content')
review

<div class="ud-component--clp--featured-review-content" data-component-args='{"courseId":28295,"reviewId":14453048,"reviewContent":"Great course on learning how to code a website from a graphic mock-up, it was cool to learn how to layout sections in CSS with some nice little touches for making elements display better on screen etc, definitely worth checking out if you are new to web coding &amp; want to get up to speed with creating a site"}'>
Great course on learning how to code a website from a graphic mock-up, it was cool to learn how to layout sections in CSS with some nice little touches for making elements display better on screen etc, definitely worth checking out if you are new to web coding &amp; want to get up to speed with creating a site
</div>

### Ranking Distribution 

In [12]:
dist_rating = page_soup.find('div', class_ ='ud-component--clp--course-reviews-display')
dist_rating

<div class="ud-component--clp--course-reviews-display" data-component-args='{"showUserAvatar":true,"averageRating":4.5292,"courseId":28295,"isFreeSEOExp":false,"page":"clp","ratingDistribution":[{"rating":1,"count":19},{"rating":2,"count":28},{"rating":3,"count":110},{"rating":4,"count":272},{"rating":5,"count":490}],"topReviewAttributes":[]}'>
</div>

In [13]:
dist_rating_str = dist_rating['data-component-args']
dist_rating_str

'{"showUserAvatar":true,"averageRating":4.5292,"courseId":28295,"isFreeSEOExp":false,"page":"clp","ratingDistribution":[{"rating":1,"count":19},{"rating":2,"count":28},{"rating":3,"count":110},{"rating":4,"count":272},{"rating":5,"count":490}],"topReviewAttributes":[]}'

In [14]:
counts = re.findall(r'"count":\d+', dist_rating_str)
rating = re.findall(r'"rating":\d+', dist_rating_str)

In [15]:
count, rank = [], []
for c in counts:
    count.append(c[8:])
    
for rt in rating:
    rank.append(c[9:])
    
rating_dist = set(zip(count, rank))
print(rating_dist)

{('28', '90'), ('490', '90'), ('110', '90'), ('272', '90'), ('19', '90')}


### CourseID

In [16]:
courseId = re.findall(r'"courseId":\d+', dist_rating_str)
print(courseId)

['"courseId":28295']


### Average Rating

In [17]:
averageRating = re.findall(r'"averageRating":\d.\d+', dist_rating_str)
print(averageRating)

['"averageRating":4.5292']


### Requirements

In [18]:
requirements = page_soup.find_all('li', class_ ='requirements__item')
for requirement in requirements:
    print(requirement.text)

Adobe Photoshop CS3 or higher
Very basic knowledge of HTML & CSS
Text Editor (TextWranger, Espresso, or Coda recommended)


## Automatization of web scraping

In [20]:
def reviewExtraction(soup):
    """
    Extraction of featured review using the BeautifulSoup object as input to find the text into the class called
    ud-component--clp--featured-review-content. Return the featured review of every course if it exists. Otherwise,
    return None.
    """
    try:
        review = soup.find('div', class_ ='ud-component--clp--featured-review-content')
        review = review.text
    except:
        review = None
        
    return review

In [21]:
def RatingExtraction(soup):
    """
    Extraction of distribution of rating, it means, a tuple with ratings and number of counts. Return None 
    if the distribution doesn't exist.
    """
    try:
        dist_rating = soup.find('div', class_ ='ud-component--clp--course-reviews-display')
        dist_rating_str = dist_rating['data-component-args']
        counts = re.findall(r'"count":\d+', dist_rating_str)
        rating = re.findall(r'"rating":\d+', dist_rating_str)
        count, rank = [], []
        for c in counts:
            count.append(c[8:])

        for rt in rating:
            rank.append(c[9:])

        rating_dist = set(zip(count, rank))
    except:
        rating_dist = None
    
    return rating_dist

In [22]:
def AverageRating(soup):
    """
    Use of soup object as input to extract the average rating of every course. Return the average score 
    as string if it exists. Otherwise, the method return None.
    """
    try:
        dist_rating = soup.find('div', class_ ='ud-component--clp--course-reviews-display')
        dist_rating_str = dist_rating['data-component-args']
        averageRating = re.findall(r'"averageRating":\d.\d+', dist_rating_str)
    except:
        averageRating = None
        
    return averageRating

In [None]:
ud-component--clp--course-reviews-display

In [23]:
def CourseId(soup):
    """
    Use of BeatifulSoup object as input to extract the course ID. Return ID as string if it exists; 
    otherwise, return None.
    """
    try:
        dist_rating = soup.find('div', class_ ='ud-component--clp--course-reviews-display')
        dist_rating_str = dist_rating['data-component-args']
        courseId = re.findall(r'"courseId":\d+', dist_rating_str)
    except:
        courseId = None
        
    return courseId

In [24]:
def RequirementsExtraction(soup):
    """
    Extraction of requirements of every course, finding the tag <li> into the class requirements__item. Return 
    a string with the list of requirements. Otherwise, return None.
    """
    try:
        requirements = soup.find_all('li', class_ ='requirements__item')
        req = []
        for requirement in requirements:
            req.append(requirement.text)
    except:
        req = None
        
    return req

In [82]:
def webScraping(url):
    """
    This method receives the url of the web page and it creates the BeautifulSoup object to find classes and 
    key tags to extract description, review, rating, ID and requirements of every course calling the methods above.
    """
    
    req = Request(url, headers=headers) 
    try:
        webpage = urlopen(req).read()
    except HTTPError as err:
        if err.code == 404:
            review = None
            rating = None
            avgRating = None
            courseId = None
            requirements = None 
            
            return review, rating, avgRating, courseId, requirements
        
        else:
            raise
            
    soup_page = soup(webpage, 'html.parser')

    review = reviewExtraction(soup_page)
    rating = RatingExtraction(soup_page)
    avgRating = AverageRating(soup_page)
    courseId = CourseId(soup_page)
    requirements = RequirementsExtraction(soup_page)     
    
    return review, rating, avgRating, courseId, requirements

In [26]:
df_chunk1 =  df_courses.loc[0:114, :]

In [48]:
# scraping_cols = []

# for url in df_chunk1.url:
#     scraping_cols.append(webScraping(url))
#     time.sleep(15)

In [30]:
unzip_scraping_cols = list(zip(*scraping_cols)) 

In [44]:
df_chunk1.loc[0:114, 'review'] = unzip_scraping_cols[0]
df_chunk1.loc[0:114, 'distRating'] = unzip_scraping_cols[1]
df_chunk1.loc[0:114, 'avgRating'] = unzip_scraping_cols[2]
df_chunk1.loc[0:114, 'courseId'] = unzip_scraping_cols[3]
df_chunk1.loc[0:114, 'requirements'] = unzip_scraping_cols[4]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [50]:
df_chunk1.head(2)

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,...,category,timeSpent,publishDate,level,paidBool,review,distRating,avgRating,courseId,requirements
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75.0,43285,525,24,All Levels,4 hours,...,WebDevelopment,4.0,2013-01-03,All Levels,True,\nGreat course on learning how to code a websi...,"{(28, 90), (490, 90), (110, 90), (272, 90), (1...","[""averageRating"":4.5292]","[""courseId"":28295]","[Adobe Photoshop CS3 or higher, Very basic kno..."
1,19603,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,True,50.0,47886,285,125,All Levels,12.5 hours,...,WebDevelopment,12.5,2012-06-18,All Levels,True,,,,,[]


In [47]:
#df_chunk1.loc[0:114, :].to_csv('../Data/interim/df_chunk.csv', index=False)

In [51]:
df_chunk2 =  df_courses.loc[115:220, :]

In [55]:
# scraping_cols1 = []

# for url in df_chunk2.url:
#     scraping_cols1.append(webScraping(url))
#     time.sleep(15)

In [57]:
unzip_scraping_cols1 = list(zip(*scraping_cols1)) 

In [59]:
df_chunk2.loc[115:220, 'review'] = unzip_scraping_cols1[0]
df_chunk2.loc[115:220, 'distRating'] = unzip_scraping_cols1[1]
df_chunk2.loc[115:220, 'avgRating'] = unzip_scraping_cols1[2]
df_chunk2.loc[115:220, 'courseId'] = unzip_scraping_cols1[3]
df_chunk2.loc[115:220, 'requirements'] = unzip_scraping_cols1[4]

In [60]:
df_chunk2.head(2)

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,...,category,timeSpent,publishDate,level,paidBool,review,distRating,avgRating,courseId,requirements
115,783748,Byte-Sized-Chunks: Cascading Style Sheets (CSS...,https://www.udemy.com/css-with-html/,True,20.0,2160,16,28,All Levels,4 hours,...,WebDevelopment,4.0,2016-03-10,All Levels,True,,"{(12, 1), (0, 1), (11, 1), (1, 1)}","[""averageRating"":4.42346]","[""courseId"":783748]",[Any modern browser and a simple text editor a...
116,161986,Learn Building Your Own Website Without Coding,https://www.udemy.com/learn-building-your-own-...,True,50.0,7660,7,19,All Levels,2 hours,...,WebDevelopment,2.0,2014-03-21,All Levels,True,,,,,[]


In [61]:
#df_chunk2.loc[115:220, :].to_csv('../Data/interim/df_chunk2.csv', index=False)

Now, iteratively:

In [79]:
def chunk_function(a, b):
    df_chunk =  df_courses.loc[a: b, :]
    scraping_cols = []

    for url in df_chunk.url:
        scraping_cols.append(webScraping(url))
        time.sleep(15)
        
    df_chunk.loc[a: b, 'review'] = unzip_scraping_cols[0]
    df_chunk.loc[a: b, 'distRating'] = unzip_scraping_cols[1]
    df_chunk.loc[a: b, 'avgRating'] = unzip_scraping_cols[2]
    df_chunk.loc[a: b, 'courseId'] = unzip_scraping_cols[3]
    df_chunk.loc[a: b, 'requirements'] = unzip_scraping_cols[4]

    return df_chunk

In [83]:
df_chunk3 = chunk_function(221, 320)
sleep(120)
df_chunk4 = chunk_function(321, 420)
sleep(120)
df_chunk5 = chunk_function(421, 520)

ValueError: Must have equal len keys and value when setting with an iterable

In [64]:
df_chunk3 =  df_courses.loc[221:320, :]

In [67]:
# scraping_cols2 = []

# for url in df_chunk3.url:
#     scraping_cols2.append(webScraping(url))
#     time.sleep(15)

In [69]:
#df_chunk3.url

In [66]:
scraping_cols2

[(None, None, None, None, []),
 (None,
  {('1707', '707'), ('313', '707'), ('798', '707'), ('91', '707')},
  ['"averageRating":4.20888'],
  ['"courseId":11174'],
  ['Internet, Windows/MAC/Linux', 'Be prepared for an awesome course!']),
 (None, None, None, None, []),
 (None, None, None, None, []),
 (None,
  {('118', '77'), ('17', '77'), ('276', '77'), ('32', '77'), ('377', '77')},
  ['"averageRating":4.27904'],
  ['"courseId":1252630'],
  ['Students who have some JavaScript experience will be most comfortable.']),
 (None,
  {('10', '71'), ('179', '71'), ('371', '71'), ('50', '71'), ('7', '71')},
  ['"averageRating":4.54328'],
  ['"courseId":1000010'],
  ['No prior experience necessary - enthusiasm and willingness to learn is all you need!',
   'A modern web browser and internet connection',
   'Software used in the course is free',
   'Some web basics like HTML/CSS will be helpful but not necessary']),
 (None,
  {('130', '24'), ('18', '24'), ('224', '24'), ('59', '24'), ('7', '24')},
  