# Web Scraping to complete the dataset

In [27]:
import re
import pandas as pd
import seaborn as sns
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

In [10]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

In [2]:
df_courses = pd.read_csv('../Data/interim/Courses.csv')

In [3]:
df_courses.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75.0,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,WebDevelopment,4.0,2013-01-03,All Levels,True
1,19603,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,True,50.0,47886,285,125,All Levels,12.5 hours,2012-06-18T16:52:34Z,WebDevelopment,12.5,2012-06-18,All Levels,True
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,https://www.udemy.com/chatbots/,True,50.0,2577,529,64,All Levels,4.5 hours,2016-06-30T16:57:08Z,WebDevelopment,4.5,2016-06-30,All Levels,True
3,197836,Projects in HTML5,https://www.udemy.com/projects-in-html5/,True,60.0,8777,206,75,Intermediate Level,15.5 hours,2014-06-17T05:43:50Z,WebDevelopment,15.5,2014-06-17,Intermediate Level,True
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,https://www.udemy.com/html-css-more/,True,20.0,23764,490,58,Beginner Level,5.5 hours,2015-10-17T04:52:25Z,WebDevelopment,5.5,2015-10-17,Beginner Level,True


In [29]:
headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        }

In [170]:
req = Request(df_courses.url[1], headers=headers) 
webpage = urlopen(req).read()

In [171]:
page_soup = soup(webpage, 'html.parser')

In [179]:
#page_soup

# Finding relevant features 

Using this `page_soup` we proceed to extract some features before to create the functions that will replicate the same for all the courses in the catalog of the dataset.

### First Review:

In [173]:
review = page_soup.find('div', class_ ='ud-component--clp--featured-review-content')
review

### Ranking Distribution 

In [112]:
dist_rating = page_soup.find('div', class_ ='ud-component--clp--course-reviews-display')
dist_rating

<div class="ud-component--clp--course-reviews-display" data-component-args='{"ratingDistribution":[{"count":19,"rating":1},{"count":27,"rating":2},{"count":110,"rating":3},{"count":270,"rating":4},{"count":490,"rating":5}],"topReviewAttributes":[],"averageRating":4.5738,"showUserAvatar":true,"isFreeSEOExp":false,"courseId":28295,"page":"clp"}'>
</div>

In [114]:
dist_rating_str = dist_rating['data-component-args']
dist_rating_str

'{"ratingDistribution":[{"count":19,"rating":1},{"count":27,"rating":2},{"count":110,"rating":3},{"count":270,"rating":4},{"count":490,"rating":5}],"topReviewAttributes":[],"averageRating":4.5738,"showUserAvatar":true,"isFreeSEOExp":false,"courseId":28295,"page":"clp"}'

In [129]:
counts = re.findall(r'"count":\d+', dist_rating_str)
rating = re.findall(r'"rating":\d+', dist_rating_str)

In [139]:
count, rank = [], []
for c in counts:
    count.append(c[8:])
    
for rt in rating:
    rank.append(c[9:])
    
rating_dist = set(zip(count, rank))
print(rating_dist)

{('490', '90'), ('110', '90'), ('270', '90'), ('27', '90'), ('19', '90')}


### CourseID

In [120]:
courseId = re.findall(r'"courseId":\d+', dist_rating_str)
print(courseId)

['"courseId":28295']


### Average Rating

In [128]:
averageRating = re.findall(r'"averageRating":\d.\d+', dist_rating_str)
print(averageRating)

['"averageRating":4.5738']


### Requirements

In [126]:
requirements = page_soup.find_all('li', class_ ='requirements__item')
for requirement in requirements:
    print(requirement.text)

Adobe Photoshop CS3 or higher
Very basic knowledge of HTML & CSS
Text Editor (TextWranger, Espresso, or Coda recommended)


In [178]:
#page_soup.find_all('span')

In [177]:
#page_soup.find_all('li')

In [176]:
#page_soup.find_all('div', class_ ='clp-component-render')

## Automatization of web scraping

In [169]:
def reviewExtraction(soup):
    """
    Method description
    """
    review = soup.find('div', class_ ='ud-component--clp--featured-review-content')
    try:
        review_txt = review.text
    except:
        review_txt = review
    return review_txt

In [147]:
def RatingExtraction(soup):
    """
    Method description
    """
    dist_rating = soup.find('div', class_ ='ud-component--clp--course-reviews-display')
    dist_rating_str = dist_rating['data-component-args']
    counts = re.findall(r'"count":\d+', dist_rating_str)
    rating = re.findall(r'"rating":\d+', dist_rating_str)
    count, rank = [], []
    for c in counts:
        count.append(c[8:])
    
    for rt in rating:
        rank.append(c[9:])
    
    rating_dist = set(zip(count, rank))
    
    return rating_dist

In [148]:
def CourseId(soup):
    """
    Method description
    """
    try:
        dist_rating = soup.find('div', class_ ='ud-component--clp--course-reviews-display')
        dist_rating_str = dist_rating['data-component-args']
        courseId = re.findall(r'"courseId":\d+', dist_rating_str)
    except:
    return courseId

In [149]:
def RequirementsExtraction(soup):
    """
    Method description
    """
    requirements = soup.find_all('li', class_ ='requirements__item')
    req = []
    for requirement in requirements:
        req.append(requirement.text)
    return req

In [165]:
def webScraping(url):
    """
    Method description
    """
    req = Request(url, headers=headers) 
    webpage = urlopen(req).read()
    soup_page = soup(webpage, 'html.parser')
    
    review = reviewExtraction(soup_page)
    rating = RatingExtraction(soup_page)
    courseId = CourseId(soup_page)
    requirements = RequirementsExtraction(soup_page)
    
    return review, rating, courseId, requirements

In [166]:
len_df = df_courses.shape[0]

In [167]:
df_chunk1 = df_courses[0:100]

In [175]:
#df_courses['webScrap'] = df_chunk1.url.apply(webScraping)

In [None]:
df.to_csv('..\Data\interim\df_chunk1.csv', index = False, header=True) 