# I. Data Collection

In [16]:
import requests
from bs4 import BeautifulSoup

# converts profile page to list of line strings
def scrape_profile(profile_html):
    
    teacher = requests.get(profile_html)
    teacher_soup = BeautifulSoup(teacher.content, "html.parser")
    return teacher_soup.find_all(text = True)

In [17]:
# returns a list of classes taught
# by the teacher in the profile
def get_classes(data):
    
    # check for hidden/removed profiles
    if(data[110] == 
       "\nYou have requested an invalid professor evaluation; this could be because:\n"):
        return (None, 0)
    
    # check for N/A total scores (can not be used for machine learning model)
    # this also changes the offset values
    if(data[118][0:3] == 'N/A'):
        return (None, 0)
    
    classes = []
    i= 133; 
    
    first_class = data[133]
    classes.append(first_class)
    next_class = data[137]
    
    while(next_class != '\n'):
        classes.append(next_class)
        i += 4
        next_class = data[i]
    
    if(len(classes) == 1):
        return (classes, i + 7)
    
    return(classes, i + 3)

In [18]:
# collect the review information here
# returns a DataFrame of the data
import pandas as pd

def get_reviews(data, class_list, review_index):
    
    standings = ["Freshman", "Sophomore", "Junior", "Senior", "5th Year Senior", "5th/6th Year Senior", "Graduate Student"]
    
    rank = { "Freshman": 0, "Sophomore": 1, "Junior": 2, "Senior": 3, "5th Year Senior": 4, "5th/6th Year Senior": 4, "Graduate Student": 5}
    
    grades = { 'A': 4, 'B': 3, 'C': 2, 'D': 1, 'F': 0, 'N/A': None, 'Credit': None, 'No Credit': None, 'Withdrawn': None }
    
    reviews = { 'class': [], 'class_abrv': [], 'class_number': [], 
               'teacher_name': [], 'teacher_field': [], 'teacher_rating': [], 'teacher_difficulties': [], 'teacher_presentaion': [],
               'student_standing': [], 'student_rank': [], 'student_grade': [], 'student_gpa': [], 'student_major': [], 
               'review_month': [], 'review_year': [], 'review_content': [] }
    
    while(review_index < len(data)):
        
        # class information
        if(data[review_index] in class_list):
            reviews["class"].append(data[review_index])
            review_index += 3
        else:
            reviews["class"].append(reviews["class"][-1])
        
        student_class = reviews["class"][-1].split()
        reviews["class_abrv"].append(student_class[0])
        reviews["class_number"].append(int(student_class[1]))
        
        # teacher information
        reviews["teacher_name"].append(data[111])
        reviews["teacher_field"].append(data[117])
        reviews["teacher_rating"].append(float(data[118].split()[-1][0:4]))
        reviews["teacher_difficulties"].append(float(data[120].split()[-1])) # recognizes student difficulties
        reviews["teacher_presentaion"].append(float(data[121].split()[-1]))  
        
        # student information
        standing = data[review_index].replace("\n", '')
        reviews["student_standing"].append(standing)
        reviews["student_rank"].append(rank[standing])
        grade_letter = data[review_index + 1].replace("\n", '')
        reviews["student_grade"].append(grade_letter)
        reviews["student_gpa"].append(grades[grade_letter])
        reviews["student_major"].append(data[review_index + 2].replace("\n", ''))
        
        # review information
        time = data[review_index + 3].replace("\n", '').split()
        reviews["review_month"].append(time[0])
        reviews["review_year"].append(int(time[1]))
        
        review_index += 6 # offset to the beginning of the review

        review = ""       # default review is blank to account for empty reviews
        
        review_append = data[review_index].replace("\n",'').replace('\r', '')

        while((review_append not in class_list) and (review_append not in standings)):    #reviews are deliminated by class name
            
            if(review_append != ""):
                review = review + review_append
                
            review_index += 1
            
            if(review_index == (len(data) - 1)):
                reviews["review_content"].append(review)
                return pd.DataFrame(data = reviews)
            
            review_append = data[review_index].replace("\n",'').replace('\r', '')

        reviews["review_content"].append(review)

In [19]:
# returns a dataframe of a Polyrating's profile page
def get_profile_df(profile_html):

    teacher_data = scrape_profile(profile_html)
    (class_list, next_index) = get_classes(teacher_data)
    if(next_index == 0):  # handles hidden pages
        return None
    return get_reviews(teacher_data, class_list, next_index)

## Example Profile Pages: Severus Snape

In [20]:
snape = get_profile_df("http://polyratings.com/eval.php?profid=3485")

In [21]:
snape

Unnamed: 0,class,class_abrv,class_number,review_content,review_month,review_year,student_gpa,student_grade,student_major,student_rank,student_standing,teacher_difficulties,teacher_field,teacher_name,teacher_presentaion,teacher_rating
0,CHEM 101,CHEM,101,"Snape, Snape, Severus Snape.",May,2014,3.0,B,General Ed,0,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
1,CHEM 110,CHEM,110,This professor is the most amazing teacher in ...,Mar,2014,4.0,A,Elective,0,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
2,CHEM 124,CHEM,124,I tried really hard in his class but he seemed...,May,2015,1.0,D,Required (Major),0,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
3,CHEM 124,CHEM,124,"Very cramped handwriting, difficult to read on...",Jan,2018,2.0,C,Required (Support),0,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
4,CHEM 125,CHEM,125,Coming in to the class I had heard some things...,Jan,2015,2.0,C,Elective,1,Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
5,CHEM 125,CHEM,125,I really wanted to get him for defense against...,May,2016,3.0,B,Required (Major),1,Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
6,CHEM 129,CHEM,129,"Prof. Snape knows his chemistry, but seems per...",Jun,2015,2.0,C,Required (Support),0,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
7,CHEM 202,CHEM,202,Professor Snape is a fantastic professor. Bare...,Jan,2018,4.0,A,General Ed,1,Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
8,CHEM 211,CHEM,211,"He can teach you how to bewitch the mind, and ...",Mar,2014,3.0,B,General Ed,0,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
9,CHEM 216,CHEM,216,I miss him. Always.,Jan,2016,3.0,B,Required (Major),1,Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17


## Edge Case Example: One Review

In [22]:
get_profile_df("http://polyratings.com/eval.php?profid=468")

Unnamed: 0,class,class_abrv,class_number,review_content,review_month,review_year,student_gpa,student_grade,student_major,student_rank,student_standing,teacher_difficulties,teacher_field,teacher_name,teacher_presentaion,teacher_rating
0,MATE 128,MATE,128,What the guy/gal below said. Amen. I too took ...,Nov,2016,4,A,Elective,3,Senior,0.0,Mechanical Engineering,"Mussulman, Ronald",1.0,2.0


## Large Profile (106 Reviews)

In [8]:
get_profile_df("http://polyratings.com/eval.php?profid=482")[-10: -1]

Unnamed: 0,class,class_abrv,class_number,review_content,review_month,review_year,student_gpa,student_grade,student_major,student_rank,student_standing,teacher_difficulties,teacher_field,teacher_name,teacher_presentaion,teacher_rating
96,SPC 145,SPC,145,I learned absolutely NOTHING in this class. E...,Mar,2009,3.0,B,General Ed,1,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
97,SPC 145,SPC,145,Nolan's class was a joke. If you are Catholic ...,Mar,2009,3.0,B,General Ed,2,Junior,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
98,SPC 145,SPC,145,"Nolan teaches his ""argumentation"" class as if ...",Mar,2009,3.0,B,General Ed,1,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
99,SPC 145,SPC,145,I thought he was a very nice guy with interest...,Apr,2009,4.0,A,General Ed,0,Freshman,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
100,SPC 145,SPC,145,I thought COMS 145 was really easy. It is a mu...,Dec,2009,4.0,A,General Ed,1,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
101,SPC 145,SPC,145,Mr Nolan is the man. This is class was really ...,Feb,2010,3.0,B,General Ed,1,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
102,SPC 145,SPC,145,It is hard not to think a lot of what Nolan sa...,Dec,2010,,,General Ed,1,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
103,SPC 145,SPC,145,Nolan might be the laziest teacher I have ever...,May,2011,,,General Ed,1,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
104,SPC 201,SPC,201,"Besides always talking about his ""Life Partner...",Aug,2001,4.0,A,General Ed,0,Freshman,2.84,Communication Studies,"Nolan, Frank",2.92,3.01


## Collecting Profile URLs

In [23]:
from bs4 import BeautifulSoup
import bs4
import requests
import time

def find_href(li):
    try:
        start = li.index("href=\"") + len("href=\"")
        end = li.index("\"", start )
        return li[start:end]
    except ValueError:
        return 0

# returns a list of all the profile URLs on Polyratings
def get_list(soup):
    
    profile_urls = []
    href = 0;
    
    profile_headers = soup.find_all("a", class_="no-link-highlight text-muted filterable")

    for header in profile_headers:
        href = find_href(str(header))
        if(href != 0):
            profile_urls.append(href.replace('"', ''))
    
    return profile_urls

In [24]:
# request the directory page
page = requests.get("http://polyratings.com/list.php") #directory of PolyRatings
soup = BeautifulSoup(page.text, "html.parser")

In [25]:
profile_urls = get_list(soup)

In [26]:
len(profile_urls)

2478

## Final Collection

In [None]:
# beware! ~30 minutes
import pickle
import time
import sys

sys.setrecursionlimit(1000000000)

page = 0

# evaluating one page at a time to handle errors if they arise
for url in profile_urls:
    prof = get_profile_df(url)
    if(prof is not None):
        name = prof.iloc[0]["teacher_name"].replace(', ', '_').lower()
        prof.to_csv("polyratings_profiles/" + name + ".csv")
    page += 1
    time.sleep(0.1)

## One Large DataFrame

In [28]:
import os
import pandas as pd

teacher_df = []

for teacher in os.listdir('polyratings_profiles'):
    path = "polyratings_profiles/" + teacher
    if(path.endswith(".csv")):
        teacher_df.append(pd.read_csv(path))

In [29]:
reviews = pd.concat(teacher_df)

In [30]:
reviews["student_gpa"] = reviews.student_gpa.astype("float")

In [31]:
reviews.dtypes

Unnamed: 0                int64
class                    object
class_abrv               object
class_number              int64
review_content           object
review_month             object
review_year               int64
student_gpa             float64
student_grade            object
student_major            object
student_rank              int64
student_standing         object
teacher_difficulties    float64
teacher_field            object
teacher_name             object
teacher_presentaion     float64
teacher_rating          float64
dtype: object

In [32]:
reviews.to_csv("polyratings_reviews.csv")

## Check

In [43]:
test = pd.read_csv("polyratings_reviews.csv")

In [49]:
test.drop(test[["Unnamed: 0", "Unnamed: 0.1"]], axis = 1)

Unnamed: 0,class,class_abrv,class_number,review_content,review_month,review_year,student_gpa,student_grade,student_major,student_rank,student_standing,teacher_difficulties,teacher_field,teacher_name,teacher_presentaion,teacher_rating
0,CPE 215,CPE,215,"he knows what he's talking about, he knows how...",Aug,2001,4.0,A,Required (Major),1,Sophomore,3.64,Computer Science,"Connely, John",3.45,3.41
1,CPE 215,CPE,215,"In two years being here at Cal Poly, Dr Connel...",Aug,2001,4.0,A,Required (Major),1,Sophomore,3.64,Computer Science,"Connely, John",3.45,3.41
2,CPE 215,CPE,215,Connely's an excellent professor. I've had hi...,Aug,2001,3.0,B,Required (Major),2,Junior,3.64,Computer Science,"Connely, John",3.45,3.41
3,CPE 215,CPE,215,"This guy knows what he is talking about, and h...",Aug,2001,4.0,A,Required (Major),1,Sophomore,3.64,Computer Science,"Connely, John",3.45,3.41
4,CPE 215,CPE,215,I could go on and on about how great of a teac...,Dec,1999,4.0,A,Required (Support),2,Junior,3.64,Computer Science,"Connely, John",3.45,3.41
5,CPE 215,CPE,215,Dr. Connely is a great teacher and he conveys ...,Feb,2000,3.0,B,Required (Support),4,5th Year Senior,3.64,Computer Science,"Connely, John",3.45,3.41
6,CPE 215,CPE,215,Dr.Connely is the best teacher I've had for al...,Feb,2001,4.0,A,Required (Support),1,Sophomore,3.64,Computer Science,"Connely, John",3.45,3.41
7,CSC 317,CSC,317,you learn C and some unix stuff. he has a nic...,Dec,2002,4.0,A,Required (Major),2,Junior,3.64,Computer Science,"Connely, John",3.45,3.41
8,CSC 317,CSC,317,Dr. Connely is one of the best professors I've...,Jan,2003,4.0,A,Elective,3,Senior,3.64,Computer Science,"Connely, John",3.45,3.41
9,CSC 317,CSC,317,CPE x317 was a worthwhile introduction to the ...,Mar,2003,3.0,B,Required (Support),2,Junior,3.64,Computer Science,"Connely, John",3.45,3.41
