# Web Scraping to complete the dataset

In [1]:
import re
import time
import pandas as pd
import seaborn as sns

Web Scraping packages:

In [2]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup as soup
from urllib.error import URLError

In [3]:
# Load the interim course table; its `url` column is what the scraping cells iterate over.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column - presumably the CSV was
# written with its index included; confirm before relying on that column.
df_courses = pd.read_csv('../Data/interim/Courses.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df_courses.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
0,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75.0,43285,525,24,All Levels,4 hours,2013-01-03T00:55:31Z,WebDevelopment,4.0,2013-01-03,All Levels,True
1,19603,Learning Dynamic Website Design - PHP MySQL an...,https://www.udemy.com/learning-dynamic-website...,True,50.0,47886,285,125,All Levels,12.5 hours,2012-06-18T16:52:34Z,WebDevelopment,12.5,2012-06-18,All Levels,True
2,889438,ChatBots: Messenger ChatBot with API.AI and No...,https://www.udemy.com/chatbots/,True,50.0,2577,529,64,All Levels,4.5 hours,2016-06-30T16:57:08Z,WebDevelopment,4.5,2016-06-30,All Levels,True
3,197836,Projects in HTML5,https://www.udemy.com/projects-in-html5/,True,60.0,8777,206,75,Intermediate Level,15.5 hours,2014-06-17T05:43:50Z,WebDevelopment,15.5,2014-06-17,Intermediate Level,True
4,505208,Programming Foundations: HTML5 + CSS3 for Entr...,https://www.udemy.com/html-css-more/,True,20.0,23764,490,58,Beginner Level,5.5 hours,2015-10-17T04:52:25Z,WebDevelopment,5.5,2015-10-17,Beginner Level,True


In [5]:
# Request headers passed to urllib's Request by soupScraping; only a browser-style
# user-agent string is set.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'

headers = {'user-agent': USER_AGENT}

In [30]:
def soupScraping(url, timeout=30):
    """
    Download a web page and return its text content.

    The page is fetched with the module-level `headers`, parsed with
    BeautifulSoup's 'html.parser', and the parsed document's `.text` is
    returned (the page text, not the raw HTML markup).

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait on blocking network operations before giving up,
        so a stalled connection cannot hang a scraping loop forever.

    Returns
    -------
    str or None
        The text of the page, or None when the request could not be built,
        the download failed, or the page could not be parsed. Callers should
        check for None before writing the result.
    """
    try:
        # Building the Request is inside the try so a malformed URL is reported
        # as a failed download (None) instead of raising.
        req = Request(url, headers=headers)
        # The context manager closes the connection even if read() fails.
        with urlopen(req, timeout=timeout) as response:
            webpage = response.read()
        soup_page = soup(webpage, 'html.parser')
        return soup_page.text
    except Exception:
        # Best effort: any ordinary failure yields None. Unlike a bare `except:`,
        # KeyboardInterrupt/SystemExit still propagate, so a long run can be stopped.
        return None

Count the courses in each category to see how many are available before drawing a random sample from each:

In [12]:
# Non-null value count of every column, per category.
df_courses.groupby('category').count()

Unnamed: 0_level_0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,timeSpent,publishDate,level,paidBool
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BussinessFinance,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096,1096
GraphicDesign,523,523,523,523,523,523,523,523,523,523,523,523,523,523,523
MusicInstrument,608,608,608,608,608,608,608,608,608,608,608,608,608,608,608
WebDevelopment,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124,1124


To guarantee a balanced dataset, we randomly sample 500 courses from each category, using a fixed random seed so the selection is reproducible. The new dataframe is stored as `df_samples`:

In [13]:
# Draw 500 courses from every category, seeding each draw with 42 so the selection is
# reproducible. Groups are visited in sorted category order and stacked into one frame
# with a fresh 0..N-1 index; the scraping cells use that index to number the saved files.
# Iterating the groups (rather than groupby.apply) keeps the 'category' column on any
# pandas version - recent releases deprecate handing grouping columns to apply.
df_samples = pd.concat(
    [group.sample(n=500, random_state=42) for _, group in df_courses.groupby('category')],
    ignore_index=True,
)

In [14]:
# Check the sample: non-null value count of every column, per category.
df_samples.groupby('category').count()

Unnamed: 0_level_0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,timeSpent,publishDate,level,paidBool
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BussinessFinance,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500
GraphicDesign,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500
MusicInstrument,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500
WebDevelopment,500,500,500,500,500,500,500,500,500,500,500,500,500,500,500


In [31]:
def chunker(seq, size, delay=15, chunk_delay=60, start=0, out_dir='../Data/interim/scraping'):
    """
    Scrape every URL in `seq` in batches and save each page's text to a file.

    Rows are processed `size` at a time. Each page is fetched with
    `soupScraping` and written to `<out_dir>/course<k>.html`, where `k` counts
    up from `start`. The function sleeps `delay` seconds after every page and
    `chunk_delay` seconds after every batch.

    If a download fails (`soupScraping` returns None) the position is printed
    and the function stops, so the run can be resumed from that course instead
    of crashing on the write.

    Parameters
    ----------
    seq : pandas.DataFrame
        Frame with a `url` column; sliced positionally into batches.
    size : int
        Number of rows per batch.
    delay : float, optional
        Seconds to sleep after each saved page.
    chunk_delay : float, optional
        Seconds to sleep after each batch.
    start : int, optional
        Number used for the first output file.
    out_dir : str, optional
        Directory the files are written to; it must already exist.

    Returns
    -------
    None
    """
    k = start
    for pos in range(0, len(seq), size):
        chunk = seq[pos:pos + size]
        for url in chunk.url:
            page_text = soupScraping(url)
            if page_text is None:
                # Nothing to write; report where the run stopped and bail out.
                print('Process stops in course #:'+str(k))
                return
            path = '{}/course{}.html'.format(out_dir, k)
            # Explicit encoding: scraped text may contain non-ASCII characters and the
            # platform default encoding is not guaranteed to handle them.
            with open(path, "w", encoding='utf-8') as f:
                f.write(page_text)
            k += 1
            time.sleep(delay)

        time.sleep(chunk_delay)

    return

In [15]:
# Rows labelled 0 through 199 of the sample (.loc label slices include both endpoints).
df_chunk = df_samples.loc[0: 199]

In [22]:
# Disabled call that would scrape df_chunk in batches of 100. Presumably left commented
# out so the slow scrape is not re-run by accident - uncomment to execute.
#chunker(df_chunk , 100)

In [26]:
# Rows labelled 90 through 199 (inclusive).
# NOTE(review): presumably a restart point after an earlier run stopped near course 90 - confirm.
df_chunk1 = df_samples.loc[90: 199]

In [27]:
# Preview the first rows of the slice to check where it starts.
df_chunk1.head()

Unnamed: 0,id,title,url,isPaid,price,numSubscribers,numReviews,numPublishedLectures,instructionalLevel,contentInfo,publishedTime,category,timeSpent,publishDate,level,paidBool
90,734280,Aprende a llevar la contabilidad de forma info...,https://www.udemy.com/contabilidad-informatizada/,True,25.0,6,1,27,Beginner Level,4.5 hours,2016-02-08T16:55:55Z,BussinessFinance,4.5,2016-02-08,Beginner Level,True
91,934574,Basics of Commerce A Complete Study,https://www.udemy.com/basics-of-commerce-a-com...,True,150.0,2197,4,105,All Levels,5.5 hours,2016-12-21T16:30:33Z,BussinessFinance,5.5,2016-12-21,All Levels,True
92,231256,FOREX Currency Trading For Beginners,https://www.udemy.com/forex-currency-trading-f...,True,25.0,35,2,28,Beginner Level,4 hours,2014-06-17T22:18:21Z,BussinessFinance,4.0,2014-06-17,Beginner Level,True
93,479688,Cost Accounting Labour Costing (Professional C...,https://www.udemy.com/labour-costing/,True,20.0,1542,1,24,All Levels,2 hours,2015-04-20T18:33:40Z,BussinessFinance,2.0,2015-04-20,All Levels,True
94,985922,Excel Crash Course: Master Excel for Financial...,https://www.udemy.com/excel-crash-course-maste...,True,105.0,8121,689,40,All Levels,3.5 hours,2016-10-18T00:51:59Z,BussinessFinance,3.5,2016-10-18,All Levels,True


In [32]:
# Rows labelled 124 through 199 (inclusive).
# NOTE(review): presumably resumes an earlier run that stopped at course 124 - confirm.
df_chunk2 = df_samples.loc[124: 199]

In [34]:
# Scrape every course in df_chunk2 and save one file per course. The file number k is the
# row's index label, so it always matches the row's position in df_samples (whose index
# was reset to 0..N-1) and cannot drift from the slice bounds.
for k, url in df_chunk2.url.items():
    page_text = soupScraping(url)
    if page_text is None:
        # soupScraping returns None when the download fails; stop and report the course
        # number instead of crashing on f.write(None), so the run can be resumed from k.
        print('Process stops in course #:'+str(k))
        break
    # Explicit encoding: scraped text may contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open('../Data/interim/scraping/course'+str(k)+'.html', "w", encoding='utf-8') as f:
        f.write(page_text)
    # Fixed 15 s pause between requests (presumably to avoid being rate limited).
    time.sleep(15)

In [35]:
# Rows labelled 200 through 299 (inclusive).
# NOTE(review): no visible cell iterates over df_chunk3; df_chunk4 covers 277-299 - confirm
# how rows 200-276 were scraped.
df_chunk3 = df_samples.loc[200: 299]

In [38]:
# Rows labelled 277 through 299 (inclusive).
# NOTE(review): presumably resumes an earlier run that stopped at course 277 - confirm.
df_chunk4 = df_samples.loc[277: 299]

In [39]:
# Scrape every course in df_chunk4 and save one file per course. The file number k is the
# row's index label, so it always matches the row's position in df_samples (whose index
# was reset to 0..N-1) and cannot drift from the slice bounds.
for k, url in df_chunk4.url.items():
    page_text = soupScraping(url)
    if page_text is None:
        # Download failed: report the course number so the run can be resumed from it.
        print('Process stops in course #:'+str(k))
        break
    # Explicit encoding: scraped text may contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open('../Data/interim/scraping/course'+str(k)+'.html', "w", encoding='utf-8') as f:
        f.write(page_text)
    # Fixed 15 s pause between requests (presumably to avoid being rate limited).
    time.sleep(15)

In [40]:
# Rows labelled 300 through 399 of the sample (.loc label slices include both endpoints).
df_chunk5 = df_samples.loc[300: 399]

In [42]:
# Scrape every course in df_chunk5 and save one file per course. The file number k is the
# row's index label, so it always matches the row's position in df_samples (whose index
# was reset to 0..N-1) and cannot drift from the slice bounds.
for k, url in df_chunk5.url.items():
    page_text = soupScraping(url)
    if page_text is None:
        # Download failed: report the course number so the run can be resumed from it.
        print('Process stops in course #:'+str(k))
        break
    # Explicit encoding: scraped text may contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open('../Data/interim/scraping/course'+str(k)+'.html', "w", encoding='utf-8') as f:
        f.write(page_text)
    # Fixed 15 s pause between requests (presumably to avoid being rate limited).
    time.sleep(15)

In [43]:
# Rows labelled 400 through 499 of the sample (.loc label slices include both endpoints).
df_chunk6 = df_samples.loc[400: 499]

In [44]:
# Scrape every course in df_chunk6 and save one file per course. The file number k is the
# row's index label, so it always matches the row's position in df_samples (whose index
# was reset to 0..N-1) and cannot drift from the slice bounds.
for k, url in df_chunk6.url.items():
    page_text = soupScraping(url)
    if page_text is None:
        # Download failed: report the course number so the run can be resumed from it.
        print('Process stops in course #:'+str(k))
        break
    # Explicit encoding: scraped text may contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open('../Data/interim/scraping/course'+str(k)+'.html', "w", encoding='utf-8') as f:
        f.write(page_text)
    # Fixed 15 s pause between requests (presumably to avoid being rate limited).
    time.sleep(15)

In [46]:
# Rows labelled 500 through 599 of the sample (.loc label slices include both endpoints).
df_chunk7 = df_samples.loc[500: 599]

In [47]:
# Scrape every course in df_chunk7 and save one file per course. The file number k is the
# row's index label, so it always matches the row's position in df_samples (whose index
# was reset to 0..N-1) and cannot drift from the slice bounds.
for k, url in df_chunk7.url.items():
    page_text = soupScraping(url)
    if page_text is None:
        # Download failed: report the course number so the run can be resumed from it.
        print('Process stops in course #:'+str(k))
        break
    # Explicit encoding: scraped text may contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open('../Data/interim/scraping/course'+str(k)+'.html', "w", encoding='utf-8') as f:
        f.write(page_text)
    # Fixed 15 s pause between requests (presumably to avoid being rate limited).
    time.sleep(15)

In [48]:
# Rows labelled 600 through 699 of the sample (.loc label slices include both endpoints).
df_chunk8 = df_samples.loc[600: 699]

In [None]:
# Scrape every course in df_chunk8 and save one file per course. The file number k is the
# row's index label, so it always matches the row's position in df_samples (whose index
# was reset to 0..N-1) and cannot drift from the slice bounds.
for k, url in df_chunk8.url.items():
    page_text = soupScraping(url)
    if page_text is None:
        # Download failed: report the course number so the run can be resumed from it.
        print('Process stops in course #:'+str(k))
        break
    # Explicit encoding: scraped text may contain non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open('../Data/interim/scraping/course'+str(k)+'.html', "w", encoding='utf-8') as f:
        f.write(page_text)
    # Fixed 15 s pause between requests (presumably to avoid being rate limited).
    time.sleep(15)