**Data Collection**

In [67]:
import requests
from bs4 import BeautifulSoup

def scrape_profile(profile_html, timeout=30):
    """Download a profile page and return all of its text nodes.

    Parameters
    ----------
    profile_html : str
        URL of the profile page (despite the name, this is the address
        passed to ``requests.get``, not raw HTML).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive host.

    Returns
    -------
    The result of ``find_all(text=True)`` on the parsed page: every text
    node of the document, in document order.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or a 4xx/5xx response.
    """
    teacher = requests.get(profile_html, timeout=timeout)
    # Fail loudly on an error page; the downstream parsers index the text
    # nodes by fixed position and would otherwise read garbage.
    teacher.raise_for_status()
    teacher_soup = BeautifulSoup(teacher.content, "html.parser")
    return teacher_soup.find_all(text=True)

In [65]:
def get_classes(data):
    """Return the classes listed on a profile page and where the reviews start.

    Parameters
    ----------
    data : list of str
        Text nodes of the profile page. The first class name is read from
        index 133 and further class names follow every 4 entries until a
        bare ``'\\n'`` entry is found.

    Returns
    -------
    tuple
        ``(classes, index)`` where ``classes`` is the list of class names
        (each listed once) and ``index`` is the position three entries past
        the terminating ``'\\n'``.

    Raises
    ------
    IndexError
        If ``data`` ends before the terminating ``'\\n'`` entry is reached.
    """
    first_class_index = 133  # position of the first class name in the page text
    stride = 4               # distance between consecutive class names

    # The first entry is always taken, exactly as before.
    classes = [data[first_class_index]]

    # Walk the remaining entries once each. The previous loop re-read
    # data[137] after the first increment and appended that class twice.
    i = first_class_index + stride
    while data[i] != '\n':
        classes.append(data[i])
        i += stride

    # i now sits on the terminating '\n'. The offset of 3 matches what the
    # old code returned for both the single-class and multi-class cases.
    return (classes, i + 3)

In [76]:
# collect the review information here
import pandas as pd

def get_reviews(data, class_list, review_index):
    """Parse the reviews of a profile page into a DataFrame.

    Parameters
    ----------
    data : list of str
        Text nodes of the profile page. Teacher-level fields are read from
        fixed positions (111, 117, 118, 120, 121).
    class_list : list of str
        Class names taught by the teacher. An entry equal to one of these
        starts a new group of reviews.
    review_index : int
        Index into ``data`` of the first review entry.

    Returns
    -------
    pandas.DataFrame
        One row per review, with the columns named by the keys of the
        ``reviews`` dict below. Empty (but with all columns) when
        ``review_index`` is already past the end of ``data``.

    Raises
    ------
    IndexError
        If the very first entry is not a class name (there is no previous
        class to inherit), or if ``data`` ends in the middle of a review's
        header fields.
    ValueError
        If a rating or year field cannot be converted to a number.
    """
    standings = ["Freshman", "Sophomore", "Junior", "Senior", "5th/6th Year Senior", "Grad Student"]

    reviews = { 'teacher_name': [], 'teacher_field': [], 'teacher_rating': [], 'teacher_difficulties': [], 'teacher_presentaion': [],
               'student_class': [], 'student_standing': [], 'student_grade': [], 'student_major': [],
               'review_month': [], 'review_year': [], 'review_content': [] }

    # Nothing to parse: return an empty frame rather than falling through
    # and implicitly returning None.
    if review_index >= len(data):
        return pd.DataFrame(data = reviews)

    # Teacher-level fields are the same for every row, so parse them once.
    teacher_name = data[111]
    teacher_field = data[117]
    teacher_rating = float(data[118].split()[-1][0:4])
    teacher_difficulties = float(data[120].split()[-1])  # recognizes student difficulties
    teacher_presentation = float(data[121].split()[-1])

    # The final entry of the page is never read as review text.
    last_index = len(data) - 1

    while(review_index < len(data)):

        # class information: a class name opens a new group; otherwise the
        # review belongs to the same class as the previous one
        if(data[review_index] in class_list):
            reviews["student_class"].append(data[review_index])
            review_index += 3
        else:
            reviews["student_class"].append(reviews["student_class"][-1])

        # teacher information
        reviews["teacher_name"].append(teacher_name)
        reviews["teacher_field"].append(teacher_field)
        reviews["teacher_rating"].append(teacher_rating)
        reviews["teacher_difficulties"].append(teacher_difficulties)
        reviews["teacher_presentaion"].append(teacher_presentation)

        # student information
        reviews["student_standing"].append(data[review_index].replace("\n", ''))
        reviews["student_grade"].append(data[review_index + 1].replace("\n", ''))
        reviews["student_major"].append(data[review_index + 2].replace("\n", ''))

        # review date, e.g. "May 2014" -> month "May", year 2014
        date_parts = data[review_index + 3].replace("\n", '').split()
        reviews["review_month"].append(date_parts[0])
        reviews["review_year"].append(int(date_parts[1]))

        review_index += 6 # offset to the beginning of the review

        # Collect text fragments until the next class name or standing,
        # which delimit reviews, or until the end of the page.
        fragments = []    # stays empty for blank reviews
        hit_delimiter = False
        while(review_index < last_index):
            fragment = data[review_index].replace("\n", '')
            if(fragment in class_list or fragment in standings):
                hit_delimiter = True
                break
            if(fragment != ""):
                fragments.append(fragment)
            review_index += 1

        reviews["review_content"].append("".join(fragments))

        # Ran out of page without meeting another review: this was the last one.
        if(not hit_delimiter):
            break

    return pd.DataFrame(data = reviews)

In [73]:
def get_profile_df(profile_html):
    """Return a DataFrame of the reviews on one Polyratings profile page.

    ``profile_html`` is the URL of the profile. The page is scraped into
    text nodes, the class list is read from it, and the reviews are parsed
    starting at the index reported by ``get_classes``.
    """
    page_text = scrape_profile(profile_html)
    taught_classes, first_review_index = get_classes(page_text)
    reviews_df = get_reviews(page_text, taught_classes, first_review_index)
    return reviews_df

**Example Profile Page: Severus Snape**

In [78]:
# Scrape the profile with profid=3485 into a DataFrame of reviews.
snape = get_profile_df("http://polyratings.com/eval.php?profid=3485")

In [79]:
# Display the scraped DataFrame.
snape

Unnamed: 0,review_content,review_month,review_year,student_class,student_grade,student_major,student_standing,teacher_difficulties,teacher_field,teacher_name,teacher_presentaion,teacher_rating
0,"Snape, Snape, Severus Snape.",May,2014,CHEM 101,B,General Ed,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
1,This professor is the most amazing teacher in ...,Mar,2014,CHEM 110,A,Elective,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
2,I tried really hard in his class but he seemed...,May,2015,CHEM 124,D,Required (Major),Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
3,"Very cramped handwriting, difficult to read on...",Jan,2018,CHEM 124,C,Required (Support),Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
4,Coming in to the class I had heard some things...,Jan,2015,CHEM 125,C,Elective,Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
5,I really wanted to get him for defense against...,May,2016,CHEM 125,B,Required (Major),Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
6,"Prof. Snape knows his chemistry, but seems per...",Jun,2015,CHEM 129,C,Required (Support),Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
7,Professor Snape is a fantastic professor. Bare...,Jan,2018,CHEM 202,A,General Ed,Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
8,"He can teach you how to bewitch the mind, and ...",Mar,2014,CHEM 211,B,General Ed,Freshman,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17
9,I miss him. Always.,Jan,2016,CHEM 216,B,Required (Major),Sophomore,2.33,Chemistry and Biochemistry,"Snape, Severus",3.33,3.17


**Edge Case Example: One Review**

In [74]:
# Edge case: a profile page that yields a single review row.
get_profile_df("http://polyratings.com/eval.php?profid=468")

Unnamed: 0,review_content,review_month,review_year,student_class,student_grade,student_major,student_standing,teacher_difficulties,teacher_field,teacher_name,teacher_presentaion,teacher_rating
0,What the guy/gal below said. Amen. I too took ...,Nov,2016.0,MATE 128,A,Elective,Senior,0.0,Mechanical Engineering,"Mussulman, Ronald",1.0,2.0


**Large Profile (106 Reviews)**

In [77]:
# NOTE(review): the slice [-10: -1] stops before the final row, so this shows
# nine rows rather than the last ten; confirm whether .tail(10) was intended.
get_profile_df("http://polyratings.com/eval.php?profid=482")[-10: -1]

Unnamed: 0,review_content,review_month,review_year,student_class,student_grade,student_major,student_standing,teacher_difficulties,teacher_field,teacher_name,teacher_presentaion,teacher_rating
95,I learned absolutely NOTHING in this class. E...,Mar,2009,SPC 145,B,General Ed,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
96,Nolan's class was a joke. If you are Catholic ...,Mar,2009,SPC 145,B,General Ed,Junior,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
97,"Nolan teaches his ""argumentation"" class as if ...",Mar,2009,SPC 145,B,General Ed,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
98,I thought he was a very nice guy with interest...,Apr,2009,SPC 145,A,General Ed,Freshman,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
99,I thought COMS 145 was really easy. It is a mu...,Dec,2009,SPC 145,A,General Ed,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
100,Mr Nolan is the man. This is class was really ...,Feb,2010,SPC 145,B,General Ed,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
101,It is hard not to think a lot of what Nolan sa...,Dec,2010,SPC 145,,General Ed,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
102,Nolan might be the laziest teacher I have ever...,May,2011,SPC 145,,General Ed,Sophomore,2.84,Communication Studies,"Nolan, Frank",2.92,3.01
103,"Besides always talking about his ""Life Partner...",Aug,2001,SPC 201,A,General Ed,Freshman,2.84,Communication Studies,"Nolan, Frank",2.92,3.01


**Now to collect them all!**

In [82]:
from bs4 import BeautifulSoup
import bs4
import requests
import time

def find_teacher_url(li):
    """Extract the href of the teacher link from one list item's HTML.

    Parameters
    ----------
    li : str
        HTML text of a single list entry.

    Returns
    -------
    str or int
        The value of the ``href`` attribute of the first anchor carrying
        the class ``no-link-highlight text-muted filterable``, or ``0``
        when the entry contains no such anchor (callers test ``!= 0``).
    """
    marker = '<a class="no-link-highlight text-muted filterable" href="'
    try:
        # Start right after the opening quote of the href value. The old
        # marker began with a stray '"' and the offset pointed at the marker
        # itself, so an href value could never be returned.
        start = li.index(marker) + len(marker)
        # The attribute value runs up to its closing quote.
        end = li.index('"', start)
        return li[start:end]
    except ValueError:
        return 0

def get_list(directory_URL):
    """Return the profile URLs found on a Polyratings directory page.

    Parameters
    ----------
    directory_URL : str
        Address of the directory page to download.

    Returns
    -------
    list of str
        One entry per list item for which ``find_teacher_url`` found a
        link, with any double quotes stripped. Empty when the page has no
        ``<ul>`` with the expected class.
    """
    profile_urls = []

    # Fetch the page named by the argument. The previous code read an
    # undefined global ``URL`` and ignored its parameter.
    page = requests.get(directory_URL)
    html = BeautifulSoup(page.text, "html.parser")
    teacher_list = html.find("ul", class_="no-link-highlight text-muted filterable")

    # find() returns None when nothing matches; there is nothing to iterate.
    if teacher_list is None:
        return profile_urls

    for li in teacher_list:
        href = find_teacher_url(str(li))
        if(href != 0):
            profile_urls.append(href.replace('"', ''))

    return profile_urls

In [None]:
# Run get_list against the Polyratings directory page.
get_list("http://polyratings.com/list.php")

In [88]:
# Download the directory page; the raw response is kept in `directory`.
directory_url = "http://polyratings.com/list.php"
directory = requests.get(directory_url)

In [None]:
from bs4 import BeautifulSoup
import bs4
import requests
import time

# Script version of the directory scrape: collects every teacher profile
# link on the directory page into `profile_urls`.
profile_urls = []
href = 0
directory_URL = "http://polyratings.com/list.php"  # was an unquoted literal (SyntaxError)

def find_teacher_url(li):
    """Return the href of the teacher link in one list item's HTML, or 0.

    ``li`` is the HTML text of a single list entry. ``0`` is returned when
    the entry has no anchor with the class
    ``no-link-highlight text-muted filterable``.
    """
    marker = '<a class="no-link-highlight text-muted filterable" href="'
    try:
        # Skip the whole marker so `start` lands on the first character of
        # the href value, then read up to its closing quote.
        start = li.index(marker) + len(marker)
        end = li.index('"', start)
        return li[start:end]
    except ValueError:
        return 0

# Fetch and parse the directory page named above (previously an undefined `URL`).
page = requests.get(directory_URL)
html = BeautifulSoup(page.text, "html.parser")
teacher_list = html.find("ul", class_="no-link-highlight text-muted filterable")

# find() returns None when no matching <ul> exists; skip the loop in that case.
if teacher_list is not None:
    for li in teacher_list:
        href = find_teacher_url(str(li))
        if(href != 0):
            profile_urls.append(href.replace('"', ''))