# Extracting descriptions, reviews, ratings, target audience and more useful features from html files of courses

In [1]:
import re
import pandas as pd

Web Scraping packages:

In [2]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup as soup

In [3]:
def description(soup_str):
    """
    Return the description of every course if it exists. Otherwise,
    return None.
    """
    try:
        description = soup_str[soup_str.find('Description'): soup_str.find('Who this course is for:')]
        description = description.replace('\n', '').replace('\xa0', '').replace('Description', '')
    except:
        description = None
        
    return description

In [4]:
def target_audience(soup_str):
    try:
        target = soup_str[soup_str.find('Who this course is for:'):soup_str.find('Featured review')]
        target = target.replace('\n', '').replace('\xa0', '').replace('Who this course is for:', '')
    except:
        target = None
        
    return target

In [5]:
def featured_review(soup_str):
    try:
        review = soup_str[soup_str.find('Featured review'): soup_str.find('•')]
        review = review.replace('\n', '').replace('\xa0', '').replace('Featured review', '')
    except:
        review = None
        
    return review

In [6]:
def ratingValue(soup_str):
    try:
        rating_str = re.search(r'"ratingValue":"\d.\d+"', soup_str).group()
        rating = rating_str[15:18]
        rating_float = float(rating)
    except:
        rating_float = None
        
    return rating_float

In [7]:
def audienceType(soup_str):
    audience = soup_str[soup_str.find('"audienceType'): soup_str.find('"@type":"Audience"')]
    audience = audience.replace('"audienceType":', '').replace('"', '').replace('[', '').replace('],', '')
    audience_list = audience.split(',')
    
    return audience_list

In [8]:
def requirements(soup_str):
    requirements = soup_str[soup_str.find('Requirements'): soup_str.find('Who this course is for:')]
    requirements = requirements.replace('Requirements', '').replace('"', '').replace('\n', '').replace('\xa0', '')
    
    return requirements

In [9]:
def courseId(soup_str):
    try:
        course = re.search(r'"course:\d+"', soup_str).group()
        course_int = int(course.replace('"', '')[7:])
    except:
        course_int = None
    return course_int

In [10]:
def ratingCount(soup_str):
    try:
        ratingCount = re.search(r'"ratingCount":\d+', soup_str).group() 
        rating = int(ratingCount[14:])
    except:
        rating = None
    return rating

Example:

In [11]:
with open('../Data/interim/scraping/course0.html') as f:
    s = soup(f, 'html')
    soup_str = s.prettify()
    descript = description(soup_str)
    review = featured_review(soup_str)
    rating = ratingValue(soup_str)
    audience = audienceType(soup_str)
    counter = ratingCount(soup_str)
    course = courseId(soup_str)
    req = requirements(soup_str)

In [12]:
descript

"Accounting is one of the most important skills for people pursuing a career in Finance.It helps you understand whether a business is profitable.It gives you an idea of a company’s size.It helps you use the past in order to take action in the present and change the future.However, it’s essential that you understand it well. If you want to become…a Financial Analystan Accountantan Auditora Business Analysta Financial Controllera Financial Managera CFOa CEOan Investment Bankeran Equity Research Analystan Investor an Entrepreneur Someone who is involved with a business and would like to be successfulThen you simply have to learn Accounting and Financial Statement Analysis. There is no way around it.But how can you do that if you have very limited time and no prior training? And how can you be sure that you are not missing an important piece of the puzzle?Accounting &amp; Financial Statement Analysis: Complete Training is here for you. One of the best Finance courses available on Udemy, it

In [13]:
review

'Hartadi Anggoro Pamungkas(1 course,1 review)a year agoIt was really really easy to understand, really efficient in terms of time, and very effective. You don\'t need much effort to understand what the instructor teaching, not to mention I can practice it easily, thanks to the material that was arranged very well with a balance proportion between theory and practice material. Awesome! Love your work!Course contentExpand all54 lectures03:17:52+–Introduction to Accounting - Why Accounting Is Important?5 lectures11:59What Does The Course Cover?Preview01:14What Is Accounting and Why Do We Need It?Preview05:27Why do we need Accounting?1 questionThe Importance of Bookkeeping - Good Accounting Records Are EssentialPreview02:34The importance of Bookkeeping1 questionFinancial Accounting - Who Needs Financial Reports?Preview02:44Financial Accounting1 questionCourse Notes - Download1 page+–The Three Main Statements in Financial Accounting7 lectures26:15The Three Main Financial Statements: P&amp;L

In [14]:
rating

4.6

In [15]:
audience

['Aspiring Accountants and Financial Analysts',
 'Aspiring Investment Bankers',
 'Aspiring Financial Controllers',
 'Business Owners',
 'Accounting Students',
 "Anyone wishing to understand a company's financials",
 'Anyone wishing to be successful in the world of Business \\u0026 Finance']

In [16]:
counter

3663

In [17]:
course

640100

In [18]:
req

"No prior knowledge of accounting is assumed or neededThere is nothing to buy or pay for - everything is included for freeYou'll need a notebook and a penDescriptionAccounting is one of the most important skills for people pursuing a career in Finance.It helps you understand whether a business is profitable.It gives you an idea of a company’s size.It helps you use the past in order to take action in the present and change the future.However, it’s essential that you understand it well. If you want to become…a Financial Analystan Accountantan Auditora Business Analysta Financial Controllera Financial Managera CFOa CEOan Investment Bankeran Equity Research Analystan Investor an Entrepreneur Someone who is involved with a business and would like to be successfulThen you simply have to learn Accounting and Financial Statement Analysis. There is no way around it.But how can you do that if you have very limited time and no prior training? And how can you be sure that you are not missing an im