In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import requests
import json
import math
import re
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Data Scraping 

In [2]:
from webdriver_manager.microsoft import EdgeChromiumDriverManager
driver = webdriver.Edge(EdgeChromiumDriverManager().install())

[WDM] - Current edge version is 98.0.1108
[WDM] - Get LATEST edgedriver version for 98.0.1108 Edge
[WDM] - Trying to download new driver from https://msedgedriver.azureedge.net/98.0.1108.62/edgedriver_mac64.zip
[WDM] - Driver has been saved in cache [/Users/yiutongchiu/.wdm/drivers/edgedriver/mac64/98.0.1108.62]
  driver = webdriver.Edge(EdgeChromiumDriverManager().install())


In [8]:
url_eng = 'https://www.classcentral.com/subject/cs?lang=english'
url_cert = 'https://www.classcentral.com/subject/cs?lang=english&certificate=true'
url_beginner = 'https://www.classcentral.com/subject/cs?lang=english&level=beginner'
url_interim = 'https://www.classcentral.com/subject/cs?lang=english&level=intermediate'
url_advanced = 'https://www.classcentral.com/subject/cs?lang=english&level=advanced'

url = {'English' : [url_eng, False],
       'Beginner' : [url_beginner, True], 
       'Intermediate' : [url_interim, True], 
       'Advanced' : [url_advanced, True], 
       'With Cert' : [url_cert, True]}

# Define the Function Counting How Many Pages to Scrape
def total_page(url):
    driver.get(url)
    content = driver.page_source
    soup = BeautifulSoup(content)
    x_path = '//*[@id="page-subject"]/div[1]/div[3]/div[3]/div/span[2]/span/span[contains(@class, "weight-bold")]'
    num_course = int(driver.find_element(By.XPATH, x_path).text.split()[0].replace(',', ''))
    pages = int(math.ceil(num_course / 15))
    return pages

## Scrape All of the CS Courses in English Version

## Define the Scraper Function

In [11]:
def df_course(url, url_type, is_type = True):
    
    courses = []
    institutions = []
    num_page = total_page(url)
    with_type = []
    providers = [] # Nominal
    workload_dur = [] # Workload + Duration Numeric
    ratings = [] # Ordinal
    pricing = [] # Ordinal
    reviews = [] # Numeric
    
    if is_type == True:
        for num in range(1, num_page + 1):
            driver.get(url + '&page=%s' % str(num))
            content = driver.page_source
            soup = BeautifulSoup(content)
        # Extract the Institution, Course Name 
            for course in soup.find_all('p', {'class' : 'text-2 margin-bottom-xsmall'}):
                data_dict = json.loads(course.a.attrs['data-track-props'])['clickMetadata']
                institutions.append(data_dict['institution'])
                courses.append(data_dict['course'])
                with_type.append('Yes')
        
        df = pd.DataFrame({'Course' : courses, 
                           'Institution' : institutions, 
                           url_type : with_type})
    
    else:
        for num in range(1, num_page + 1):
            driver.get(url + '&page=%s' % str(num))
            content = driver.page_source
            soup = BeautifulSoup(content)
        # Extract the Institution, Course Name and Provider
            for course in soup.find_all('p', {'class' : 'text-2 margin-bottom-xsmall'}):
                data_dict = json.loads(course.a.attrs['data-track-props'])['clickMetadata']
                institutions.append(data_dict['institution'])
                courses.append(data_dict['course'])
                providers.append(data_dict['provider'])
        # Extract the Rating    
            for rate in soup.find_all('span', {'class' : 'cmpt-rating-medium'}):
                ratings.append(rate['aria-label'])
        # Extract the Pricing      
            for price in soup.find_all('span', {'aria-label' : 'Pricing'}):
                pricing.append(price.text.strip())
        # Extract the Workload and Duration
            for ul in soup.find_all('ul', {'class' : 'margin-top-small'}):
                if ul.find('span', {'aria-label' : 'Workload and duration'}) is not None:
                    workload_dur.append(ul.find('span', {'aria-label' : 'Workload and duration'}).text.strip())
                else:
                    workload_dur.append(np.nan)
        # Extract the Number of Reviews
            for a in soup.find_all('a', {'class' : 'hover-no-underline margin-bottom-xxsmall row vert-align-middle'}):
                if a.find('span', {'class': 'text-3 color-gray margin-left-xxsmall'}) is not None:
                     reviews.append(int(a.find('span', {'class': 'text-3 color-gray margin-left-xxsmall'}).text.strip().split()[0]))
                else:
                     reviews.append(np.nan)
        
        df = pd.DataFrame({'Course' : courses, 
                           'Institution' : institutions, 
                           'Provider' : providers, 
                           'Workload_Duration' : workload_dur, 
                           'Pricing' : pricing, 
                           'Review' : reviews, 
                           'Rating' : ratings})
    return df

## Scrape Dataframes of All Courses, Beginner Level Courses, Intermediate Level Courses, Advanced Level Courses and Courses with Cert.

In [12]:
# df_eng = df_course(url_eng, 'English', False) 

# df_beginner = df_course(url_beginner, 'Beginner', True)

# df_interim = df_course(url_interim, 'Intermediate', True)

# df_advanced = df_course(url_advanced, 'Advanced', True)

# df_cert = df_course(url_cert, 'Cert', True)

In [13]:
df_eng, df_beginner, df_interim, df_advanced, df_cert = [df_course(value[0], key, value[1]) for key, value in url.items()]

## Remove the Duplicated Rows in Each Dataframe

In [None]:
# Group the row of each Dataframes by ['Course', 'Institution'] to remove duplicate rows
df_eng = df_eng.groupby(['Course', 'Institution']).size().reset_index(name = 'Freq')

In [None]:
df_beginner = df_beginner.groupby(['Course', 'Institution']).size().reset_index(name = 'Freq')

In [None]:
df_interim = df_interim.groupby(['Course', 'Institution']).size().reset_index(name = 'Freq')

In [None]:
df_advanced = df_advanced.groupby(['Course', 'Institution']).size().reset_index(name = 'Freq')

In [None]:
df_cert = df_cert.groupby(['Course', 'Institution']).size().reset_index(name = 'Freq')

In [None]:
df = pd.merge(df_eng, df_beginner, how = 'left', on = ['Course', 'Institution'])

In [None]:
for df_i in [df_interim, df_advanced, df_cert]:
    df.merge(df_i,  how = 'left', on = ['Course', 'Institution'])

In [None]:
df

# Convert the Data Type of Each Columns

## Remove Rows without Review

In [None]:
df1 = df.copy()
df1 = df1.dropna(subset = ['Review'])

In [None]:
df1.info()

## Convert Rating to Numeric Data

In [None]:
rating_list = df1['Rating'].unique().tolist()
rating_num = [4.5, 5, 4, 3, 3.5, 2, 0, 1, 1.5]
rating_dict = dict(zip(rating_list, rating_num))

df1['Rating'] = df1['Rating'].map(rating_dict)

## Convert Pricing into Categories Free Trail, Free and Paid

In [None]:
pricing_list = df1['Pricing'].unique().tolist()

def pricing_to_cat(p):
    if 'Free Trial' in p:
        return 'Free Trial'
    elif 'Free Online' in p:
        return 'Free'
    else:
        return 'Paid'

pricing_cat = [pricing_to_cat(p) for p in pricing_list]

pricing_cat_dict = dict(zip(pricing_list, pricing_cat))

df1['Pricing'] = df1['Pricing'].map(pricing_cat_dict)

In [None]:
df1.info()

### Split Work_Duration Column into Workload and Duration Columns

In [None]:
# Use Regex to Split 
work_list = []
for i in df1['Workload_Duration']:
    if i is np.nan:
        i = 'None'
        
    if re.search("hours?", i) and re.search("weeks? long", i):
        work_list.append(i.split(','))
    elif re.search("hours?",i):
        work_list.append([i, 'None'])
    elif re.search("weeks?", i):
        work_list.append(['None', i])
    else:
        work_list.append(['None', 'None'])
    
# def find_work_dur(x):
#     if (:
#         return x.split(',')
#     elif ('hours' or 'hour') and ('weeks' or 'weeks')in x:
#         return ['None' , x]

In [None]:
workload = []
duration = []
for i in work_list:
    workload.append(i[0])
    duration.append(i[1])
#     duration.append(i[1])

### Drop Workload_Duration Column and Add Workload and Duration Columns

In [None]:
df1['Workload'] = workload
df1['Duration'] = duration

In [None]:
df1 = df1.drop( columns = 'Workload_Duration')

### Strip the string of Workload and Duration

In [None]:
df1['Workload']= df1['Workload'].apply(lambda x : x.strip().split()[0])

In [None]:
df1['Duration'] = df1['Duration'].apply(lambda x : x.strip().split()[0])

### Convert the Workload value of a range into the median (2 - 4 hours -> 3 hours)

In [None]:
def range_med(x):
    if '-' in x:
        x = (float(x.split('-')[0] )+ float(x.split('-')[1])) / 2
    if x == 'None' or x == 'Less':
        x = np.nan
    return float(x)

In [None]:
df1['Workload'] = df1['Workload'].apply(lambda x : range_med(x))

In [None]:
df1['Duration'] = df1['Duration'].apply(lambda x : range_med(x))

### Convert Duration Column to Float and 'None' to numpy.nan

In [None]:
df1['Duration'].unique()

In [None]:
df1['Duration'] = df1['Duration'].apply(lambda x : np.nan if x == 'None' else float(x))

## Split the data

In [None]:
df1.info()

In [None]:
df1.describe()

In [None]:
df1

## Combine the Same Course with Different Languages 

In [None]:
df1['Provider'].value_counts()

In [None]:
pd.set_option('display.max_rows', None)
df1['Institution'].value_counts()

## Deal with Categorical Variables

### Label Encoding of Provider, Institution and Pricing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
oe = OrdinalEncoder()
df

## Machine Learning


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
df1