# Web Scraping to complete the dataset

In [1]:
import re
import time
import pandas as pd
import seaborn as sns

Web Scraping packages:

In [2]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup as soup
from urllib.error import URLError

In [3]:
df_courses = pd.read_csv('../Data/interim/Courses.csv')

In [4]:
df_courses.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75.0,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,WebDevelopment,4.0,2013-01-03,All Levels,True
1,19603,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,True,50.0,47886,285,125,All Levels,12.5 hours,2012-06-18T16:52:34Z,WebDevelopment,12.5,2012-06-18,All Levels,True
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,https://www.udemy.com/chatbots/,True,50.0,2577,529,64,All Levels,4.5 hours,2016-06-30T16:57:08Z,WebDevelopment,4.5,2016-06-30,All Levels,True
3,197836,Projects in HTML5,https://www.udemy.com/projects-in-html5/,True,60.0,8777,206,75,Intermediate Level,15.5 hours,2014-06-17T05:43:50Z,WebDevelopment,15.5,2014-06-17,Intermediate Level,True
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,https://www.udemy.com/html-css-more/,True,20.0,23764,490,58,Beginner Level,5.5 hours,2015-10-17T04:52:25Z,WebDevelopment,5.5,2015-10-17,Beginner Level,True


In [5]:
global headers

headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        }

In [30]:
def soupScraping(url):
    """
    This method receives the url of the web page and it creates the BeautifulSoup object to find classes and 
    key tags to extract description, review, rating, ID and requirements of every course calling the methods above.
    """
    
    req = Request(url, headers=headers) 
    try:
        
        webpage = urlopen(req).read()
        soup_page = soup(webpage, 'html.parser')
    
        return soup_page.text
    
    except:
        return None

In [7]:
def description(soup):
    """
    Extraction of featured review using the BeautifulSoup object as input to find the text into the class called
    ud-component--clp--featured-review-content. Return the featured review of every course if it exists. Otherwise,
    return None.
    """
    try:
        description = soup.find('div', class_ ='show-more--content--isg5c show-more--with-gradient--2abmN')
        description = description.text
    except:
        description = None
        
    return description

In [8]:
def AverageRating(soup):
    """
    Use of soup object as input to extract the average rating of every course. Return the average score 
    as string if it exists. Otherwise, the method return None.
    """
    try:
        dist_rating = soup.find('div', class_ ='ud-component--course-landing-page-udlite--reviews')
        dist_rating_str = dist_rating['data-component-args']
        averageRating = re.findall(r'"averageRating":\d.\d+', dist_rating_str)
    except:
        averageRating = None
        
    return averageRating

In [9]:
def RequirementsExtraction(soup):
    """
    Extraction of requirements of every course, finding the tag <li> into the class requirements__item. Return 
    a string with the list of requirements. Otherwise, return None.
    """
    try:
        requirements = soup.find('div', class_ ='ud-component--course-landing-page-udlite--requirements')
        req = requirements.text
    except:
        req = None
        
    return req

In [10]:
def RatingExtraction(soup):
    """
    Extraction of distribution of rating, it means, a tuple with ratings and number of counts. Return None 
    if the distribution doesn't exist.
    """
    try:
        dist_rating = soup.find('div', class_ ='ud-component--course-landing-page-udlite--reviews')
        dist_rating_str = dist_rating['data-component-args']
        counts = re.findall(r'"count":\d+', dist_rating_str)
        rating = re.findall(r'"rating":\d+', dist_rating_str)
        count, rank = [], []
        for c in counts:
            count.append(c[8:])

        for rt in rating:
            rank.append(rt[9:])

        rating_dist = set(zip(count, rank))
    except:
        rating_dist = None
    
    return rating_dist

In [11]:
def webScraping(url):
    """
    This method receives the url of the web page and it creates the BeautifulSoup object to find classes and 
    key tags to extract description, review, rating, ID and requirements of every course calling the methods above.
    """
    
    req = Request(url, headers=headers) 
    try:
        
        webpage = urlopen(req).read()
        soup_page = soup(webpage, 'html.parser')

        descript = description(soup_page)
        avgRating = AverageRating(soup_page)
        requirements = RequirementsExtraction(soup_page) 
        rating_dist = RatingExtraction(soup_page)
    
        return descript, avgRating, requirements, rating_dist
    except:
        return None, None, None, None

Group courses by category to determine how many samples we must select randomly:

In [12]:
df_courses.groupby('category').count()

Unnamed: 0_level_0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,timeSpent,publishDate,level,paidBool
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BussinessFinance,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096
GraphicDesign,523,523,523,523,523,523,523,523,523,523,523,523,523,523,523
MusicInstrument,608,608,608,608,608,608,608,608,608,608,608,608,608,608,608
WebDevelopment,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124


To guarantee a balanced dataset, we select 500 courses by category. This new dataframe is stored as `df_samples`:

In [13]:
df_samples = df_courses.groupby('category').apply(lambda x: x.sample(n=500, random_state=42))
df_samples.reset_index(drop=True, inplace=True)

In [14]:
df_samples.groupby('category').count()

Unnamed: 0_level_0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,timeSpent,publishDate,level,paidBool
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BussinessFinance,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500
GraphicDesign,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500
MusicInstrument,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500
WebDevelopment,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500


In [31]:
def chunker(seq, size):
    k=0
    for pos in range(0, len(seq), size):
        df = seq[pos:pos + size]
        for url in df.url:
            soupScraper = soupScraping(url)
            with open('../Data/interim/scraping/course'+str(k)+'.html', "w") as f:
                f.write(soupScraper)
            k+=1
            time.sleep(15)
            
        time.sleep(60)
        
    return 

In [15]:
df_chunk = df_samples.loc[0: 199]

In [22]:
#chunker(df_chunk , 100)

In [26]:
df_chunk1 = df_samples.loc[90: 199]

In [27]:
df_chunk1.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
90,734280,Aprende a llevar la contabilidad de forma info...,https://www.udemy.com/contabilidad-informatizada/,True,25.0,6,1,27,Beginner Level,4.5 hours,2016-02-08T16:55:55Z,BussinessFinance,4.5,2016-02-08,Beginner Level,True
91,934574,Basics of Commerce A Complete Study,https://www.udemy.com/basics-of-commerce-a-com...,True,150.0,2197,4,105,All Levels,5.5 hours,2016-12-21T16:30:33Z,BussinessFinance,5.5,2016-12-21,All Levels,True
92,231256,FOREX Currency Trading For Beginners,https://www.udemy.com/forex-currency-trading-f...,True,25.0,35,2,28,Beginner Level,4 hours,2014-06-17T22:18:21Z,BussinessFinance,4.0,2014-06-17,Beginner Level,True
93,479688,Cost Accounting Labour Costing (Professional C...,https://www.udemy.com/labour-costing/,True,20.0,1542,1,24,All Levels,2 hours,2015-04-20T18:33:40Z,BussinessFinance,2.0,2015-04-20,All Levels,True
94,985922,Excel Crash Course: Master Excel for Financial...,https://www.udemy.com/excel-crash-course-maste...,True,105.0,8121,689,40,All Levels,3.5 hours,2016-10-18T00:51:59Z,BussinessFinance,3.5,2016-10-18,All Levels,True


In [32]:
df_chunk2 = df_samples.loc[124: 199]

In [34]:
k = 124
for url in df_chunk2.url:
    soupScraper = soupScraping(url)
    with open('../Data/interim/scraping/course'+str(k)+'.html', "w") as f:
        f.write(soupScraper)
    k+=1
    time.sleep(15)

In [35]:
df_chunk3 = df_samples.loc[200: 299]

In [38]:
df_chunk4 = df_samples.loc[277: 299]

In [39]:
k = 277
for url in df_chunk4.url:
    soupScraper = soupScraping(url)
    if soupScraper is None:
        print('Process stops in course #:'+str(k))
        break
    else:
        with open('../Data/interim/scraping/course'+str(k)+'.html', "w") as f:
            f.write(soupScraper)
    k+=1
    time.sleep(15)

In [40]:
df_chunk5 = df_samples.loc[300: 399]

In [42]:
k = 300
for url in df_chunk5.url:
    soupScraper = soupScraping(url)
    if soupScraper is None:
        print('Process stops in course #:'+str(k))
        break
    else:
        with open('../Data/interim/scraping/course'+str(k)+'.html', "w") as f:
            f.write(soupScraper)
    k+=1
    time.sleep(15)

In [43]:
df_chunk6 = df_samples.loc[400: 499]

In [44]:
k = 400
for url in df_chunk6.url:
    soupScraper = soupScraping(url)
    if soupScraper is None:
        print('Process stops in course #:'+str(k))
        break
    else:
        with open('../Data/interim/scraping/course'+str(k)+'.html', "w") as f:
            f.write(soupScraper)
    k+=1
    time.sleep(15)

In [None]:
df_chunk7 = df_samples.loc[500: 599]

In [None]:
k = 500
for url in df_chunk7.url:
    soupScraper = soupScraping(url)
    if soupScraper is None:
        print('Process stops in course #:'+str(k))
        break
    else:
        with open('../Data/interim/scraping/course'+str(k)+'.html', "w") as f:
            f.write(soupScraper)
    k+=1
    time.sleep(15)

In [14]:
soups = []

for 

for url in df_chunk.url:
    soups.append(soupScraping(url))
    time.sleep(15)

In [18]:
df_samples.shape

(200, 16)

In [19]:
df_chunk1 = df_samples.loc[100:]

In [20]:
soups1 = []

for url in df_chunk1.url:
    soups1.append(soupScraping(url))
    time.sleep(15)

Saving the soups:

In [48]:
soups[0].content

In [33]:
df_chunk['soup'] = soups

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
df_chunk1['soup'] = soups1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [37]:
df_chunkSamples = pd.concat([df_chunk, df_chunk1], axis=0)

In [38]:
df_chunkSamples.to_csv('../Data/interim/dataframe_scraping.csv', index=False)

In [49]:
k=0
for s in soups:
    with open('../Data/interim/scraping/'+str(k)+'.html', "w") as f:
        f.write(s.text)
        k+=1

In [51]:
with open('../Data/interim/scraping/0.html') as f:
    n = soup(f)

In [57]:
RequirementsExtraction(n) 

Load the dataframe:

In [39]:
df_scraping = pd.read_csv('../Data/interim/dataframe_scraping.csv')

In [42]:
type(df_scraping.url[0])

str

In [43]:
url = df_courses.url[0]
webScraping(url)

(None, None, None, None)

In [50]:
url = 'https://www.udemy.com/trading-penny-stocks-advanced-strategies/'

In [51]:
req = Request(url, headers=headers) 
webpage = urlopen(req).read()
page_soup = soup(webpage, 'html.parser')

In [52]:
assert page_soup is not None, "Soup is empty"

In [62]:
dist_rating = page_soup.find('div', class_ ='ud-component--clp--course-reviews-display')
dist_rating_str = dist_rating['data-component-args']
re.findall(r'"averageRating":\d.\d+', dist_rating_str)

['"averageRating":3.84848']

In [60]:
dist_rating

In [55]:
avgRating = AverageRating(page_soup)
avgRating

In [47]:
rating_dist = RatingExtraction(page_soup)
rating_dist

{('111', '3'), ('19', '1'), ('274', '4'), ('28', '2'), ('491', '5')}

In [48]:
descript = description(page_soup)
descript

"** Course Updated November 26th, 2014! ** New lecture added: Code a CSS Sticky Footer  - - -  The Best Way to Learn HTML5 & CSS3  PSD to HTML5/CSS3 is a simple course that will teach you to take a Photoshop Mockup Design and hand-code it into a beautiful, semantic, valid HTML5 & CSS3 website.  Start Speaking the Language Right Away  Just like learning a human language, the best way of learning is by speaking from day 1—this course is similar in the sense that we'll begin speaking HTML5 & CSS3 right away!   I believe this is the best way to learn HTML5 and CSS3.  It's simple. It's easy. You can do it!  If you’ve always wanted to know how to build your own website, or have wanted a simple and comprehensive way to dive into PSD to HTML5 & CSS3, this course is definitely for you.  Real World Example  One of my students, who had no previous web design experience, took my course and ran with the skills he acquired. He built a brand new website for his Barbershop in Vancouver!  So, what are 

In [49]:
requirements = RequirementsExtraction(page_soup) 
requirements

'RequirementsAdobe Photoshop CS3 or higherVery basic knowledge of HTML & CSSText Editor (TextWranger, Espresso, or Coda recommended)'

In [43]:
# def chunker(seq, size):
#     scraping_cols = []
#     for pos in range(0, len(seq), size):
#         df = seq[pos:pos + size]
#         for url in df.url:
#             scraping_cols.append(webScraping(url))
#             time.sleep(15)
            
#     unzip_scraping_cols = list(zip(*scraping_cols)) 
#     time.sleep(10)
        
#     return unzip_scraping_cols

# unzip_scraping_cols = chunker(df, 100)
# df_courses.loc[1000:2000, 'review'] = unzip_scraping_cols[0]
# df_courses.loc[1000:2000:, 'avgRating'] = unzip_scraping_cols[1]
# df_courses.loc[1000:2000:, 'requirements'] = unzip_scraping_cols[2]