Stats 141XP

Bruinwalk Scraper

Dr. Sugano and Dr. Zhang Group 1

Author: Andrew Liu

Purpose: Scrape course data and reviews from bruinwalk.com. Perform sentiment analysis on review text.

In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import math

###
# Function to get all courses for a department, gets all courses if no department specified
# @param department code (optional)
# @return list of course codes
###
def get_courses(dept_code = None):
    """Collect course codes from the Bruinwalk class search listing.

    Args:
        dept_code: Optional department code appended to the search URL as
            ``&dept=<code>``. When omitted, the search covers every
            department.

    Returns:
        A list of course-code strings taken from the ``/classes/<code>/``
        links on each result page, de-duplicated across all pages and kept
        in first-seen order.

    Raises:
        requests.RequestException: if a page cannot be fetched (network
            error, timeout, or HTTP error status).
    """
    # base url for scraping classes
    # if no department code then scrapes all classes across all departments otherwise specify department
    base_url = 'https://www.bruinwalk.com/search/?category=classes'
    if dept_code is not None:
        base_url += f'&dept={dept_code}'
    
    # ping the first page; a timeout keeps a stalled connection from hanging the scrape
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    
    # the second span of the paginator reads like "1 of N"; take the last number in it
    # NOTE(review): a listing with no paginator is treated as a single page — confirm against the site
    num_pages = 1
    paginator = soup.find("div", class_="paginator")
    if paginator is not None:
        spans = paginator.find_all("span")
        if len(spans) > 1:
            numbers = re.findall(r'\d+', spans[1].get_text())
            if numbers:
                num_pages = int(numbers[-1])
    
    # iterate through all pages, collecting course links in first-seen order
    prefix = '/classes/'
    seen = {}
    for page in range(1, num_pages + 1):
        
        # page 1 was already fetched above (presumably identical to &page=1), so only fetch later pages
        if page > 1:
            response = requests.get(f"{base_url}&page={page}", timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
        
        for link in soup.select('a[href^="/classes/"]'):
            # drop only the leading "/classes/" and surrounding slashes to get the course code
            code = link.get('href')[len(prefix):].strip('/')
            if code:
                seen.setdefault(code, None)
    
    return list(seen)

###
# Function to get all professors for a course
# @param course code
# @return list of professor page links (hrefs starting with /professors/)
###
def get_professors(course_code):
    """Collect the professor page links shown on a course's Bruinwalk page.

    Args:
        course_code: Course code as used in the site URL; it is inserted
            verbatim into ``/classes/<course_code>``.

    Returns:
        A list of href strings beginning with ``/professors/``,
        de-duplicated across all pages and kept in first-seen order.

    Raises:
        requests.RequestException: if a page cannot be fetched (network
            error, timeout, or HTTP error status).
    """
    # base url for scraping and append course code
    base_url = "https://www.bruinwalk.com"
    course_url = f"{base_url}/classes/{course_code}"
    
    # ping the first page; a timeout keeps a stalled connection from hanging the scrape
    response = requests.get(course_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    
    # the second span of the paginator reads like "1 of N"; take the last number in it
    # NOTE(review): a page with no paginator is treated as a single page — confirm against the site
    num_pages = 1
    paginator = soup.find("div", class_="paginator")
    if paginator is not None:
        spans = paginator.find_all("span")
        if len(spans) > 1:
            numbers = re.findall(r'\d+', spans[1].get_text())
            if numbers:
                num_pages = int(numbers[-1])
    
    # iterate through all pages, collecting professor links in first-seen order
    seen = {}
    for page in range(1, num_pages + 1):
        
        # page 1 was already fetched above (presumably identical to ?page=1), so only fetch later pages
        if page > 1:
            response = requests.get(f"{course_url}?page={page}", timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
        
        for link in soup.select('a[href^="/professors/"]'):
            seen.setdefault(link.get('href'), None)
    
    return list(seen)

###
# Function to get all reviews for a course
# @param course code
# @return dataframe of course data and reviews
###
def _remove_tokens(text, tokens):
    # delete every token from text, in the order given (order matters when tokens overlap)
    for token in tokens:
        text = text.replace(token, '')
    return text


def _format_review_date(raw_date):
    # normalize a scraped review date to MM/DD/YYYY; unparseable text is returned cleaned but otherwise as-is
    cleaned = _remove_tokens(raw_date, ['\n', ' '])
    if '.' in cleaned:
        # dotted month abbreviations are cut to three letters so that %b can parse them
        dot = cleaned.index('.')
        cleaned = cleaned[:dot][:3] + cleaned[dot:]
    for fmt in ("%b.%d,%Y", "%B%d,%Y"):
        try:
            return datetime.strptime(cleaned, fmt).strftime("%m/%d/%Y")
        except ValueError:
            continue
    return cleaned


def scrape_reviews(course_code):
    """Scrape every review of a course, across all of its professors.

    Args:
        course_code: Course code as used in the site URL (the notebook
            calls this with ``'stats-112'``). It is passed to
            ``get_professors`` and is also used to locate the professor
            name inside each ``/professors/<name>/<course_code>/`` link.

    Returns:
        A pandas DataFrame with one row per review and the columns
        'Course Code', 'Course Name', 'Department', 'Professor',
        'Course Ratings' (the ratings dict rendered as a string),
        'Quarter', 'Year', 'Grade', 'Review Date' (MM/DD/YYYY when the
        date could be parsed), 'Review Text', 'Review Upvote' and
        'Review Downvote'. Exact duplicate rows are dropped.

    Raises:
        requests.RequestException: if a page cannot be fetched (network
            error, timeout, or HTTP error status).
    """
    col_names = ['Course Code', 'Course Name', 'Department', 'Professor', 'Course Ratings', 'Quarter', 'Year', 'Grade', 'Review Date', 'Review Text', 'Review Upvote', 'Review Downvote']
    
    # rows are collected in a list and turned into a dataframe once at the end,
    # instead of growing the dataframe one row at a time
    rows = []
    
    # base url for scraping
    base_url = "https://www.bruinwalk.com"
    
    # iterate through all professors
    for prof_path in get_professors(course_code):
        
        # get professor name from /professors/<name>/<course_code>/
        start = '/professors/'
        end = f'/{course_code}/'
        if start not in prof_path or end not in prof_path:
            # link does not have the per-course shape, so there is no name to extract; skip it
            continue
        name_slug = prof_path[prof_path.index(start) + len(start):prof_path.index(end)]
        prof = name_slug.replace('-', ' ').title()
        
        # ping professor page and get response, pass to soup
        prof_url = f"{base_url}{prof_path}"
        response = requests.get(prof_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        # extract department name
        dep = soup.find("div", class_="department-name").get_text().strip().replace('Department of ', '')
        
        # extract course code
        course_c = soup.find("span", class_="aggregate-type-badge").get_text()
        
        # extract course name
        course_n = soup.find("div", class_="aggregate-header content-row").find('h2').get_text()
        
        # extract overall score and number of users
        overall_score = soup.find("div", class_="overall-score").get_text().replace(' ', '').strip()
        overall_users = _remove_tokens(
            soup.find("div", class_="overall-text").get_text(),
            ['\n', '\t', ' ', 'OverallRating', 'Basedon', 'Users', 'User'],
        )
        
        # extract specific ratings; anything missing or 'N/A' stays NaN
        options = ['Easiness', 'Clarity', 'Workload', 'Helpfulness']
        course_ratings = {'Overall' : math.nan, 'Users': math.nan}
        if overall_score != 'N/A':
            course_ratings['Overall'] = float(overall_score)
        if overall_users != '':
            course_ratings['Users'] = float(overall_users)
        for option in options:
            course_ratings[option] = math.nan
        for rating in soup.find_all("div", class_="ind-rating")[:4]:
            # ' 5 ' is removed first so the denominator of "x / 5" goes before the whitespace does
            val = _remove_tokens(rating.find("span", class_="value").get_text(), [' 5 ', '\n', ' ', '\t', '/'])
            rating_text = rating.get_text()
            for option in options:
                if option in rating_text and val != 'N/A':
                    course_ratings[option] = float(val)
        # stored as a string so every dataframe cell is hashable for drop_duplicates
        course_ratings = str(course_ratings)
        
        # the second span of the paginator reads like "1 of N"; take the last number in it
        # NOTE(review): a page with no paginator is treated as a single page — confirm against the site
        num_pages = 1
        paginator = soup.find("div", class_="paginator")
        if paginator is not None:
            spans = paginator.find_all("span")
            if len(spans) > 1:
                numbers = re.findall(r'\d+', spans[1].get_text())
                if numbers:
                    num_pages = int(numbers[-1])
        
        # iterate through all pages
        for page in range(1, num_pages + 1):
            
            # page 1 was already fetched above (presumably identical to ?page=1), so only fetch later pages
            if page > 1:
                response = requests.get(f"{prof_url}?page={page}", timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
            
            # iterate through all reviews
            for review in soup.find_all("div", class_="review reviewcard"):
                
                # extract quarter and grade element
                quarter_and_grade = review.select('div[class^="row collapse"]')[0]
                cells = quarter_and_grade.select('div')
                
                # extract quarter and year
                quarter_year = _remove_tokens(cells[0].get_text(), ['\n', ' ', 'Quarter:'])
                if quarter_year == 'N/A':
                    quarter = 'N/A'
                    year = 'N/A'
                else:
                    # insert a space at the letter/digit boundary, e.g. "Fall2018" -> "Fall 2018"
                    parts = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', quarter_year).split(' ')
                    quarter = parts[0]
                    # a quarter with no year part gets 'N/A' instead of raising IndexError
                    year = parts[1] if len(parts) > 1 else 'N/A'
                
                # extract grade
                grade = _remove_tokens(cells[1].get_text(), ['\n', ' ', 'Grade:'])
                
                # extract review date and standardize formatting
                review_date = _format_review_date(review.select('span[class^="date"]')[0].get_text())
                
                # extract review text
                review_text = review.find("div", class_="expand-area review-paragraph").get_text().replace('\n', '')
                
                # extract review upvote and downvote values
                review_upvote = int(review.find("span", class_="upvote-value").get_text())
                review_downvote = int(review.find("span", class_="downvote-value").get_text())
                
                rows.append([course_c, course_n, dep, prof, course_ratings, quarter, year, grade, review_date, review_text, review_upvote, review_downvote])
    
    # build the dataframe once and drop duplicates
    df = pd.DataFrame(rows, columns = col_names)
    df = df.drop_duplicates(keep = 'first').reset_index(drop = True)
    return df

###
# Function to scrape all courses within a department, all courses if no department specified
# @param department code (optional)
# @return dataframe of course data and reviews
###
def scrape_courses(dept_code = None):
    """Scrape reviews for every course in a department, or for all courses.

    Args:
        dept_code: Optional department code forwarded to ``get_courses``.
            When omitted, ``get_courses`` is called with ``None`` and
            lists courses across all departments.

    Returns:
        A pandas DataFrame combining the ``scrape_reviews`` result of each
        course, with exact duplicate rows dropped and a fresh 0..n-1
        index. When nothing was scraped, an empty DataFrame with the
        review columns is returned.
    """
    col_names = ['Course Code', 'Course Name', 'Department', 'Professor', 'Course Ratings', 'Quarter', 'Year', 'Grade', 'Review Date', 'Review Text', 'Review Upvote', 'Review Downvote']
    
    # scrape each course, keeping the per-course frames in a list so they are concatenated only once
    frames = [scrape_reviews(course) for course in get_courses(dept_code)]
    
    # empty frames add no rows, and pd.concat raises on an empty list, so handle both up front
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns = col_names)
    
    # combine and drop duplicates
    df = pd.concat(frames, ignore_index = True)
    df = df.drop_duplicates(keep = 'first').reset_index(drop = True)
    return df

In [31]:
# scrape all reviews for a single course: stats 112 (course code 'stats-112')
df = scrape_reviews('stats-112')
# bare expression so the notebook displays the resulting dataframe
df

Unnamed: 0,Course Code,Course Name,Department,Professor,Course Ratings,Quarter,Year,Grade,Review Date,Review Text,Review Upvote,Review Downvote
0,STATS 112,Statistical Methods for Social Sciences,Statistics,Robert L Gould,"{'Overall': 2.8, 'Users': 8.0, 'Easiness': 1.5...",,,,11/22/2008,"Ok, those giving him harsh reviews need to cal...",0,0
1,STATS 112,Statistical Methods for Social Sciences,Statistics,Robert L Gould,"{'Overall': 2.8, 'Users': 8.0, 'Easiness': 1.5...",,,,11/30/2007,Professor Gould generally appears not to know ...,1,0
2,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",Fall,2018.0,,12/14/2018,Grading breakdown:(15%) Four group homeworks &...,2,0
3,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",Fall,2020.0,B+,01/22/2021,this class is very divided between two groups ...,0,0
4,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",Fall,2018.0,A,12/15/2018,I agree with most part of the comment below ex...,1,0
5,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",,,,12/14/2015,"Stat112simply, the worst professor in UCLA.too...",0,0
6,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",,,,06/05/2013,she spell Z-score to Z-sport on the exam. that...,1,0
7,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",,,,06/23/2012,"I took her stat 10 class in fall 2011, got B+....",0,0
8,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",,,,06/13/2012,"Nice Professor, if you go to the lecture, and ...",0,0
9,STATS 112,Statistical Methods for Social Sciences,Statistics,Mahtash M Esfandiari,"{'Overall': 3.0, 'Users': 24.0, 'Easiness': 3....",,,,12/13/2010,She was really nice but the lectures were bori...,0,0


In [32]:
# scrape every course in the stats department; '176' is the department code passed to the search
# note: this overwrites the single-course df from the previous cell
df = scrape_courses('176')
# bare expression so the notebook displays the resulting dataframe
df

Unnamed: 0,Course Code,Course Name,Department,Professor,Course Ratings,Quarter,Year,Grade,Review Date,Review Text,Review Upvote,Review Downvote
0,STATS 19,Fiat Lux Freshman Seminars: Statistics and Por...,Statistics,Nicolas Christou,"{'Overall': 5.0, 'Users': 1.0, 'Easiness': 5.0...",Fall,2021,P,12/13/2021,"This class, taught in fall 2021, requires stud...",1,0
1,STATS 199,Directed Research in Statistics,Statistics,Michael Tsiang,"{'Overall': 5.0, 'Users': 1.0, 'Easiness': 4.0...",Winter,2021,A+,03/29/2021,"Note: This review is for Winter 2021, a quarte...",0,0
2,STATS 21,Python and Other Technologies for Data Science,Statistics,Miles Satori Chen,"{'Overall': 4.8, 'Users': 8.0, 'Easiness': 4.4...",Winter,2022,B+,03/11/2022,** The quarter I took this class was partially...,1,0
3,STATS 21,Python and Other Technologies for Data Science,Statistics,Miles Satori Chen,"{'Overall': 4.8, 'Users': 8.0, 'Easiness': 4.4...",Fall,2022,NR,12/07/2022,The best Stats professor at UCLA.,0,0
4,STATS 21,Python and Other Technologies for Data Science,Statistics,Miles Satori Chen,"{'Overall': 4.8, 'Users': 8.0, 'Easiness': 4.4...",Fall,2022,,12/03/2022,Legend. Best Professor I had so far.,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1443,STATS 100B,Introduction to Mathematical Statistics,Statistics,Juana Sanchez,"{'Overall': 2.5, 'Users': 16.0, 'Easiness': 2....",,,,03/27/2006,Sanchez is not an effective professor. She mak...,0,0
1444,STATS 100B,Introduction to Mathematical Statistics,Statistics,Hongquan Xu,"{'Overall': 3.1, 'Users': 11.0, 'Easiness': 2....",Winter,2019,A,07/24/2019,"This class was a rollercoaster tbh. Of course,...",0,0
1445,STATS 100B,Introduction to Mathematical Statistics,Statistics,Hongquan Xu,"{'Overall': 3.1, 'Users': 11.0, 'Easiness': 2....",Winter,2018,A,03/27/2018,I took Stats 100B with Professor Xu and it was...,1,0
1446,STATS 100B,Introduction to Mathematical Statistics,Statistics,Hongquan Xu,"{'Overall': 3.1, 'Users': 11.0, 'Easiness': 2....",,,,12/03/2015,"Since he used to be in pure math major,he trie...",0,1


In [34]:
from transformers import pipeline
sentiment_analysis = pipeline("sentiment-analysis")

# sentiment analysis on review text, label and score using hugging face
# reviews longer than the character limit are truncated down to the max;
# slicing leaves shorter strings untouched, so no length check is needed
max_chars = 512
texts = [text[:max_chars] for text in df['Review Text']]

# run the pipeline once over the whole list; each result is a dict with "label" and "score"
sentiment_results = sentiment_analysis(texts) if texts else []

# assign whole columns at once instead of writing cell-by-cell, which avoids
# depending on the index being 0..n-1 and creating the new columns one scalar at a time
df['Review Sentiment Label'] = [result["label"] for result in sentiment_results]
df['Review Sentiment Score'] = [result["score"] for result in sentiment_results]

In [33]:
# export dataframe to csv; index = False leaves the row index out of the file
df.to_csv('bruinwalk_stats.csv', index = False)

In [None]:
# scrape for all departments and courses: with no department code the search is not filtered by department
# note: this overwrites the df produced by the earlier cells
df = scrape_courses()
# bare expression so the notebook displays the resulting dataframe
df