In [41]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

import time
import json
import requests
import re

import pandas as pd

In [93]:
# XPath locators used by QBScraper.

# Sixth <ul class="subsubCategoryItems"> on the index page; each <li> inside it
# is expected to hold an <a> linking to one college-partner page.
# NOTE(review): the positional [6] picks exactly one list. The recorded run
# printed only 13 links (Colorado College .. Macalester College) -- confirm
# whether other lists on the page hold the remaining partners.
COLLEGE_UL = '(//ul[@class="subsubCategoryItems"])[6]'
# Tab links on a college page, matched by their exact anchor text.
APP_REQS = '//a[text()="Application Requirements"]'
FINAID = '//a[text()="Financial Aid"]'
ACADEMICS = '//a[text()="Academics"]'
# Section on the Academics tab whose first <ul> lists the academic highlights.
# The class attribute must match this string exactly.
ACADEMIC_HIGHLIGHTS = '//section[@class="lcol col-sm-12 col-md-7"]'

class QBScraper:
    """Scrape QuestBridge college-partner pages with Selenium and dump them to JSON.

    Usage: construct with the partner index URL, call ``create_driver()`` to
    open the browser, then ``scrape_colleges()`` to visit every linked college
    page and write ``questbridge_data.json``. Call ``close()`` when finished
    to shut the browser down.
    """

    # Substrings searched for (in this order) in each line of a "codes"
    # paragraph, paired with the dictionary key the digits are stored under.
    _CODE_LABELS = (
        ('ACT', 'ACT Code'),
        ('SAT', 'SAT Code'),
        ('CSS', 'CSS Code'),
        ('FAFSA', 'FAFSA Code'),
    )

    def __init__(self, url):
        """Remember the index URL; no browser is started until create_driver()."""
        self.driver = None
        self.wait = None
        self.url = url

    def create_driver(self):
        """Launch Chrome, load ``self.url``, maximize the window and create a 10 s wait."""
        self.driver = webdriver.Chrome()
        self.driver.get(self.url)
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 10)

    def close(self):
        """Quit the browser if one was started; safe to call more than once."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None
            self.wait = None

    # ------------------------------------------------------------------
    # Pure text helpers (no browser needed)
    # ------------------------------------------------------------------
    @staticmethod
    def _clean_text(text):
        """Replace non-breaking spaces and newlines with spaces, then strip."""
        return text.replace('\xa0', ' ').replace('\n', ' ').strip()

    @staticmethod
    def _parse_deadline(header_text):
        """Return the text after the last ':' on the last line of the table header."""
        return header_text.split('\n')[-1].strip().split(':')[-1].strip()

    @staticmethod
    def _parse_codes(text):
        """Map each recognised test/aid label found in ``text`` to its digits.

        Every line is matched against ``_CODE_LABELS`` in order; the first
        label contained in the line wins and all digit runs on that line are
        concatenated. Lines without a known label are ignored.
        """
        codes = {}
        for line in text.split('\n'):
            line = line.strip()
            for label, key in QBScraper._CODE_LABELS:
                if label in line:
                    codes[key] = ''.join(re.findall(r'\d+', line))
                    break
        return codes

    # ------------------------------------------------------------------
    # Small Selenium helpers
    # ------------------------------------------------------------------
    def _open_tab(self, xpath):
        """Wait until the tab link at ``xpath`` is clickable, then click it."""
        self.wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

    def _page_has_table(self):
        """Return True when the current page contains at least one <table>.

        NOTE(review): this checks immediately after the tab click, without
        waiting for the new content -- confirm the site renders synchronously.
        """
        return bool(self.driver.find_elements(By.TAG_NAME, 'table'))

    @staticmethod
    def _first_paragraph_text(cell):
        """innerText of the first <p> inside ``cell``, or of the cell itself if it has none."""
        paragraphs = cell.find_elements(By.TAG_NAME, 'p')
        source = paragraphs[0] if paragraphs else cell
        return source.get_attribute('innerText')

    @staticmethod
    def _label_value_rows(table):
        """Turn a table's body rows into ``{first <p> text: second <td> text}``.

        The first row is treated as a header and skipped.
        """
        result = {}
        for row in table.find_elements(By.TAG_NAME, 'tr')[1:]:
            key = row.find_element(By.TAG_NAME, 'p').get_attribute('innerText')
            value = row.find_elements(By.TAG_NAME, 'td')[1].get_attribute('innerText')
            result[key] = value
        return result

    # ------------------------------------------------------------------
    # Scraping entry points
    # ------------------------------------------------------------------
    def scrape_colleges(self):
        """Visit every college linked from the index page and write the results.

        Prints the list of collected links, scrapes each college page, and
        writes ``{college name: info}`` to ``questbridge_data.json``.
        Requires ``create_driver()`` to have been called first.
        """
        college_ul = self.wait.until(EC.presence_of_element_located((By.XPATH, COLLEGE_UL)))
        # Collect every href up front: the <li> elements go stale as soon as
        # the driver navigates away from the index page.
        links = [
            item.find_element(By.TAG_NAME, 'a').get_attribute('href')
            for item in college_ul.find_elements(By.TAG_NAME, 'li')
        ]
        print(links)

        college_json = {}
        for link in links:
            self.driver.get(link)
            college_name = self.wait.until(
                EC.presence_of_element_located((By.TAG_NAME, 'h1'))
            ).get_attribute('innerText')
            college_json[college_name] = self._scrape_college_page()

        with open('questbridge_data.json', 'w', encoding='utf-8') as file:
            json.dump(college_json, file, indent=4)

    def _scrape_college_page(self):
        """Scrape the three tabs of the college page currently loaded in the driver."""
        college_info = {}

        self._open_tab(APP_REQS)
        # A tab without any table is treated as blank and skipped.
        if self._page_has_table():
            college_info['Application Info'] = {
                'Match Requirements': self.find_application_requirements(),
                'Post-Match Options': self.find_post_match_options(),
            }

        self._open_tab(FINAID)
        if self._page_has_table():
            college_info['Financial Aid Data'] = self.scrape_aid_data()

        self._open_tab(ACADEMICS)
        college_info['Academic Highlights'] = self.scrape_academic_highlights()
        return college_info

    def find_application_requirements(self):
        """Read the first table on the page into a requirements dictionary.

        Returns a dict with a ``'Deadline'`` entry taken from the header row,
        one entry per requirement row (second column -> third column), and
        ``'ACT Code'`` / ``'SAT Code'`` / ``'CSS Code'`` / ``'FAFSA Code'``
        entries when a row's second column carries a codes paragraph.
        Rows that do not have the three-cell layout are skipped.
        """
        app_reqs_dict = {}

        match_reqs_table = self.wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
        rows = match_reqs_table.find_elements(By.TAG_NAME, 'tr')
        if not rows:
            return app_reqs_dict

        header_text = rows[0].find_element(By.TAG_NAME, 'th').get_attribute('innerText')
        app_reqs_dict['Deadline'] = self._parse_deadline(header_text)

        for row in rows[1:]:
            cells = row.find_elements(By.TAG_NAME, 'td')
            # Skip rows without data cells, rows too short to hold a
            # key/value pair, and the full-width (colspan=3) row.
            # TODO: handle the colspan == 3 row so portal activation data is captured
            if len(cells) < 3 or cells[0].get_attribute('colspan') == '3':
                continue

            second_col = cells[1]
            paragraphs = second_col.find_elements(By.TAG_NAME, 'p')
            if len(paragraphs) > 1:
                # Second paragraph lists the codes; the first one is the row label.
                app_reqs_dict.update(self._parse_codes(paragraphs[1].get_attribute('innerText')))
                key_text = paragraphs[0].get_attribute('innerText')
            else:
                key_text = second_col.get_attribute('innerText')

            key = self._clean_text(key_text)
            value = self._clean_text(self._first_paragraph_text(cells[2]))
            app_reqs_dict[key] = value
        return app_reqs_dict

    def find_post_match_options(self):
        """Read the second table on the page into ``{option: description}``.

        Returns an empty dict when the page has fewer than two tables.
        """
        tables = self.driver.find_elements(By.TAG_NAME, 'table')
        if len(tables) < 2:
            return {}

        post_match_dict = {}
        for row in tables[1].find_elements(By.TAG_NAME, 'tr')[1:]:
            key = row.find_element(By.TAG_NAME, 'p').get_attribute('innerText')
            value = (
                row.find_elements(By.TAG_NAME, 'td')[1]
                .find_element(By.TAG_NAME, 'p')
                .get_attribute('innerText')
            )
            post_match_dict[key] = value
        return post_match_dict

    def scrape_aid_data(self):
        """Read the first two tables of the Financial Aid tab.

        Returns ``{'Costs of Attendance': {...}, 'How Costs are Covered': {...}}``;
        the second dict is empty when the page has only one table.
        """
        cost_table = self.wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
        cost_dict = self._label_value_rows(cost_table)

        tables = self.driver.find_elements(By.TAG_NAME, 'table')
        cover_dict = self._label_value_rows(tables[1]) if len(tables) > 1 else {}

        return {
            'Costs of Attendance': cost_dict,
            'How Costs are Covered': cover_dict,
        }

    def scrape_academic_highlights(self):
        """Return the text of each <li> in the first <ul> of the highlights section."""
        section = self.wait.until(EC.presence_of_element_located((By.XPATH, ACADEMIC_HIGHLIGHTS)))
        ul = section.find_element(By.TAG_NAME, 'ul')
        return [
            item.find_element(By.TAG_NAME, 'p').get_attribute('innerText').replace('\xa0', ' ')
            for item in ul.find_elements(By.TAG_NAME, 'li')
        ]

In [94]:
# Run the scrape end to end, and always shut the browser down afterwards so a
# failed run does not leave a Chrome process behind.
scraper = QBScraper('https://www.questbridge.org/college-partners')
try:
    scraper.create_driver()
    scraper.scrape_colleges()
finally:
    # driver stays None if Chrome itself failed to start
    if scraper.driver is not None:
        scraper.driver.quit()

['https://www.questbridge.org/college-partners/colorado-college', 'https://www.questbridge.org/college-partners/columbia-university', 'https://www.questbridge.org/college-partners/cornell-university', 'https://www.questbridge.org/college-partners/dartmouth-college', 'https://www.questbridge.org/college-partners/davidson-college', 'https://www.questbridge.org/college-partners/denison-university', 'https://www.questbridge.org/college-partners/duke-university', 'https://www.questbridge.org/college-partners/emory-university', 'https://www.questbridge.org/college-partners/grinnell-college', 'https://www.questbridge.org/college-partners/hamilton-college', 'https://www.questbridge.org/college-partners/haverford-college', 'https://www.questbridge.org/college-partners/johns-hopkins-university', 'https://www.questbridge.org/college-partners/macalester-college']
