# Extracting descriptions, reviews, ratings, target audiences and other useful features from the HTML files of courses

In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup as soup

In [2]:
def description(soup_str):
    """
    Return the description of a course if it exists. Otherwise,
    return None.

    The description is the text that follows the first 'Description'
    heading, up to the next 'Who this course is for:' heading. If that
    closing heading is absent, the description runs to the end of the
    text. Newlines and non-breaking spaces are removed from the result.
    """
    start_marker = 'Description'
    end_marker = 'Who this course is for:'

    start = soup_str.find(start_marker)
    if start == -1:
        # str.find reports "not found" with -1 rather than raising, so the
        # missing-section case has to be checked explicitly.
        return None

    # Skip the heading itself instead of deleting every occurrence of the
    # word, which would also strip it from the description body.
    body_start = start + len(start_marker)
    end = soup_str.find(end_marker, body_start)
    if end == -1:
        end = len(soup_str)

    return soup_str[body_start:end].replace('\n', '').replace('\xa0', '')

In [3]:
def target_audience(soup_str):
    """
    Return the 'Who this course is for:' section of a course if it
    exists. Otherwise, return None.

    The section is the text that follows the first
    'Who this course is for:' heading, up to the next 'Featured review'
    heading. If that closing heading is absent, the section runs to the
    end of the text. Newlines and non-breaking spaces are removed from
    the result.
    """
    start_marker = 'Who this course is for:'
    end_marker = 'Featured review'

    start = soup_str.find(start_marker)
    if start == -1:
        # str.find reports "not found" with -1 rather than raising, so the
        # missing-section case has to be checked explicitly.
        return None

    # Skip the heading itself rather than deleting every occurrence of it.
    body_start = start + len(start_marker)
    end = soup_str.find(end_marker, body_start)
    if end == -1:
        end = len(soup_str)

    return soup_str[body_start:end].replace('\n', '').replace('\xa0', '')

In [4]:
def featured_review(soup_str):
    """
    Return the featured review of a course if it exists. Otherwise,
    return None.

    The review is the text that follows the first 'Featured review'
    heading, up to the next '•' character after that heading. If no
    bullet follows, the review runs to the end of the text. Newlines and
    non-breaking spaces are removed from the result.
    """
    start_marker = 'Featured review'
    end_marker = '•'

    start = soup_str.find(start_marker)
    if start == -1:
        # str.find reports "not found" with -1 rather than raising, so the
        # missing-section case has to be checked explicitly.
        return None

    body_start = start + len(start_marker)
    # Look for the bullet only after the heading: a bullet appearing earlier
    # in the page would otherwise produce an empty slice.
    end = soup_str.find(end_marker, body_start)
    if end == -1:
        end = len(soup_str)

    return soup_str[body_start:end].replace('\n', '').replace('\xa0', '')

In [5]:
def ratingValue(soup_str):
    """
    Return the value of the first '"ratingValue":"<number>"' entry in the
    page text as a float, or None when no such entry is present.

    The whole quoted number is parsed, so whole-number ratings and
    ratings with more than one decimal are returned without truncation.
    """
    # The decimal point is escaped and the digits are captured as a group,
    # which avoids slicing the match at hard-coded character offsets.
    match = re.search(r'"ratingValue":"(\d+(?:\.\d+)?)"', soup_str)
    if match is None:
        return None
    return float(match.group(1))

In [6]:
def audienceType(soup_str):
    """
    Return the entries listed under the '"audienceType"' key of the page
    text as a list of strings.

    Two layouts are tried in turn:

    1. The text between '"audienceType' and '"@type":"Audience"'.
    2. If the first attempt leaves nothing, the text between
       '"audienceType":' and the first '."]'.

    In both cases the key, double quotes and brackets are stripped and the
    remainder is split on commas, so an entry that itself contains a comma
    is split into several items. When neither layout yields text, the
    result is [''].
    """
    key = '"audienceType":'

    # First layout: the list is immediately followed by the "@type" entry.
    begin = soup_str.find('"audienceType')
    stop = soup_str.find('"@type":"Audience"')
    raw = soup_str[begin:stop]
    for token in (key, '"', '[', '],'):
        raw = raw.replace(token, '')

    if raw == '':
        # Second layout: cut at the closing '."]' of the list instead.
        raw = soup_str[soup_str.find(key):soup_str.find('."]')]
        for token in (key, '"', '['):
            raw = raw.replace(token, '')

    return raw.split(',')

In [7]:
def requirements(soup_str):
    """
    Return the text of the 'Requirements' section of a course.

    The section starts after the first 'Requirements' heading and ends at
    whichever of the 'Description' or 'Who this course is for:' headings
    comes first after it. If neither follows, the section runs to the end
    of the text. Double quotes, newlines and non-breaking spaces are
    removed from the result.

    Returns an empty string when there is no 'Requirements' heading.
    """
    heading = 'Requirements'

    start = soup_str.find(heading)
    if start == -1:
        # str.find reports "not found" with -1; using that value as a slice
        # bound would silently select the wrong text.
        return ''

    body_start = start + len(heading)
    candidates = (
        soup_str.find('Description', body_start),
        soup_str.find('Who this course is for:', body_start),
    )
    found = [pos for pos in candidates if pos != -1]
    end = min(found) if found else len(soup_str)

    section = soup_str[body_start:end]
    return section.replace('"', '').replace('\n', '').replace('\xa0', '')

In [8]:
def courseId(soup_str):
    """
    Return the numeric id from the first '"course:<digits>"' token in the
    page text as an int, or None when no such token is present.
    """
    # Capturing the digits avoids slicing the match at a hard-coded offset,
    # and the explicit None check replaces a catch-all except clause.
    match = re.search(r'"course:(\d+)"', soup_str)
    if match is None:
        return None
    return int(match.group(1))

In [9]:
def ratingCount(soup_str):
    """
    Return the number from the first '"ratingCount":<digits>' entry in the
    page text as an int, or None when no such entry is present.
    """
    # Capturing the digits avoids slicing the match at a hard-coded offset,
    # and the explicit None check replaces a catch-all except clause.
    match = re.search(r'"ratingCount":(\d+)', soup_str)
    if match is None:
        return None
    return int(match.group(1))

In [10]:
def language(soup_str):
    """
    Return the value of the first '"language_simple_english_title":"..."'
    entry in the page text, or None when no such entry (or only an empty
    one) is present.

    Any non-empty run of characters up to the closing double quote is
    accepted, so titles containing spaces or punctuation are returned too.
    """
    match = re.search(r'"language_simple_english_title":"([^"]+)"', soup_str)
    if match is None:
        return None
    return match.group(1)

Example:

In [11]:
# Parse one sample page; the file only needs to stay open while
# BeautifulSoup reads it.
with open('../Data/interim/scraping/course0.html') as f:
    s = soup(f, 'html')

# Run each extractor on the prettified markup of that page.
soup_str = s.prettify()
descript = description(soup_str)
rating = ratingValue(soup_str)
audience = audienceType(soup_str)
counter = ratingCount(soup_str)
course = courseId(soup_str)
req = requirements(soup_str)
lang = language(soup_str)

In [12]:
# Description extracted from course0.html.
descript

"Accounting is one of the most important skills for people pursuing a career in Finance.It helps you understand whether a business is profitable.It gives you an idea of a company’s size.It helps you use the past in order to take action in the present and change the future.However, it’s essential that you understand it well. If you want to become…a Financial Analystan Accountantan Auditora Business Analysta Financial Controllera Financial Managera CFOa CEOan Investment Bankeran Equity Research Analystan Investor an Entrepreneur Someone who is involved with a business and would like to be successfulThen you simply have to learn Accounting and Financial Statement Analysis. There is no way around it.But how can you do that if you have very limited time and no prior training? And how can you be sure that you are not missing an important piece of the puzzle?Accounting &amp; Financial Statement Analysis: Complete Training is here for you. One of the best Finance courses available on Udemy, it

In [13]:
# Rating value extracted from course0.html.
rating

4.6

In [14]:
# Audience list extracted from course0.html.
audience

['Aspiring Accountants and Financial Analysts',
 'Aspiring Investment Bankers',
 'Aspiring Financial Controllers',
 'Business Owners',
 'Accounting Students',
 "Anyone wishing to understand a company's financials",
 'Anyone wishing to be successful in the world of Business \\u0026 Finance']

In [15]:
# Rating count extracted from course0.html.
counter

3663

In [16]:
# Course id extracted from course0.html.
course

640100

In [17]:
# Requirements text extracted from course0.html.
req

"No prior knowledge of accounting is assumed or neededThere is nothing to buy or pay for - everything is included for freeYou'll need a notebook and a pen"

In [18]:
# Language extracted from course0.html.
lang

'English'

In [19]:
# One accumulator per extracted feature; each gets one entry per page.
descriptionList, ratingList, audienceList = [], [], []
counterList, courseIdList = [], []
requirementsList, languageList = [], []

In [20]:
# Process the pages course0.html .. course999.html in order.
for i in range(1000):
    page_path = '../Data/interim/scraping/course{}.html'.format(i)
    with open(page_path) as f:
        s = soup(f, 'html')
    soup_str = s.prettify()

    # Store each extracted feature in its own list, so index i of every
    # list refers to the same page.
    descriptionList.append(description(soup_str))
    ratingList.append(ratingValue(soup_str))
    audienceList.append(audienceType(soup_str))
    counterList.append(ratingCount(soup_str))
    courseIdList.append(courseId(soup_str))
    requirementsList.append(requirements(soup_str))
    languageList.append(language(soup_str))

In [21]:
# Map each output column name to its list of per-page values.
d = dict(
    description=descriptionList,
    rating=ratingList,
    audience=audienceList,
    counter=counterList,
    course=courseIdList,
    requirements=requirementsList,
    language=languageList,
)

In [22]:
# Sanity check: every column must hold as many entries as 'description'.
expected_len = len(d['description'])
for column in ('rating', 'audience', 'counter', 'course', 'requirements', 'language'):
    assert expected_len == len(d[column])

In [23]:
# Build the table: one column per key of d, one row per page.
df_Data = pd.DataFrame(d)

In [24]:
# Preview the first rows of the assembled table.
df_Data.head()

Unnamed: 0,description,rating,audience,counter,course,requirements,language
0,Accounting is one of the most important skills...,4.6,"[Aspiring Accountants and Financial Analysts, ...",3663.0,640100.0,No prior knowledge of accounting is assumed or...,English
1,This course is an introduction to the financia...,3.6,"[Students in business and Finance, Auditors, I...",33.0,385604.0,some knowledge of accounting,English
2,*Course Fully Updated for May 2019*The don’t c...,4.7,[Anyone interested in earning an extra income ...,300.0,834836.0,You will need some basic knowledge of stock an...,English
3,This Mortgage Acceleration course will teach y...,3.7,[This Mortgage Acceleration course is designed...,7.0,504620.0,Students will need a reliable computer and int...,English
4,"This course is for bookkeepers, accountants an...",3.9,[Individuals / Directors who want to submit th...,10.0,359926.0,It would be helpful if you understood accounti...,English


In [25]:
# Print the column dtypes and non-null counts of the table.
df_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
description     1000 non-null object
rating          757 non-null float64
audience        1000 non-null object
counter         757 non-null float64
course          757 non-null float64
requirements    1000 non-null object
language        998 non-null object
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


Save the scraped data in the `interim` folder:

In [27]:
# Write the table to CSV; index=False leaves the row index out of the file.
df_Data.to_csv('../Data/interim/df_scrapedData.csv', index=False)