In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

import time
import json

In [51]:
# Relative path to the ChromeDriver executable handed to webdriver.Chrome.
DRIVER_PATH = "../driver/chromedriver"

# Accumulated scrape results, keyed by college name
# (filled in by get_general_college_data).
collegedata_general = {}

# Labels for the "X" columns of the "Selection of students" table, in column
# order: an "X" in the first column after the metric name maps to the first
# label, and so on.
importance = [
    "Very Important",
    "Important",
    "Considered",
    "Not Considered",
]

## Scraping General Function

In [100]:
"""
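scrape function:
1. access each college page using the link taken from college_links.json
2. wait `sleep` seconds (default 0.5) for the page to load
3. scrape each sub-page (opened directly by URL: /admission, /money-matters,
   /academics, /campus-life, /students)
    - wait `sleep` seconds after each page load
4. wrap up in json format and close the browser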
"""

def _load_page_body(driver, url, sleep):
    """Open `url` in `driver`, pause `sleep` seconds, and return the <body> element.

    `sleep` is also used as the WebDriverWait timeout, so the wait raises
    (selenium's TimeoutException) if <body> is not present within that time.
    """
    driver.get(url)
    time.sleep(sleep)
    return WebDriverWait(driver, sleep).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )


def get_general_college_data(college_name, college_link, sleep=0.5):
    """Visit each profile sub-page of one college and register its data.

    Opens a fresh Chrome session, loads the admission, money-matters,
    academics, campus-life and students sub-pages of `college_link` in that
    order, and finally stores the collected dict in the module-level
    `collegedata_general` under `college_name`.

    Args:
        college_name: Key used in `collegedata_general`.
        college_link: Base URL of the college profile (no trailing slash);
            the sub-page suffixes are appended to it.
        sleep: Seconds to pause after each page load; also the timeout of the
            wait for the page's <body>.

    Returns:
        None. The result is written to `collegedata_general`.

    Raises:
        Whatever the driver, the wait or a page scraper raises. The browser is
        closed in every case, and `collegedata_general` is left untouched when
        an error occurs.
    """
    driver = webdriver.Chrome(executable_path=DRIVER_PATH)
    cur_college_data = {}

    # try/finally guarantees the Chrome process is shut down even when a page
    # fails to load or a scraper raises; previously it was leaked.
    try:
        # Admission page
        admission_body = _load_page_body(driver, college_link + "/admission", sleep)
        # cur_college_data["admission"] = get_admission(admission_body, 0.5)

        # Financials page
        financials_body = _load_page_body(driver, college_link + "/money-matters", sleep)
        # NOTE(review): the financials are only printed, not stored in
        # cur_college_data, so the entry saved below is still empty.
        print(get_financials(financials_body, 0.5))

        # Academics, Campus Life and Students pages: loaded but not scraped yet.
        for suffix in ("/academics", "/campus-life", "/students"):
            _load_page_body(driver, college_link + suffix, sleep)
    finally:
        driver.quit()

    collegedata_general[college_name] = cur_college_data


## Admission page function

In [98]:
def get_admission(admission_body, sleep=0.5):
    admission_info = {}

    all_info = admission_body.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_expandableChildren__3bIKY")

    # Freshman admission requirements

    admission_info["high_school_req_rec_data"] = {}

    high_school_req_rec = admission_body.find_element(By.CSS_SELECTOR, value=".Table_unlined__29Tmx > tbody")
    high_school_req_rec_table = high_school_req_rec.find_elements_by_css_selector("tr")
    for unit in high_school_req_rec_table:
        elements = unit.find_elements_by_css_selector("td")
        admission_info["high_school_req_rec_data"][elements[0].text] = {
            "required": int(elements[1].text) if elements[1].text != '' else None,
            "recommended": int(elements[2].text) if elements[2].text != '' else None
        }

    time.sleep(sleep)

    # Applying for admission

    apply_adm = all_info[1]
    adm_contents = apply_adm.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl")

    for adm_content in adm_contents:
        try:
            title = adm_content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            info = adm_content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
            admission_info[title] = info
        except:
            continue

    # Selection of students

    admission_info["selection_of_students"] = {}
    
    selection_of_students_table = all_info[2].find_element(By.TAG_NAME, value="tbody")
    metrics = selection_of_students_table.find_elements(By.TAG_NAME, value="tr")

    for metric in metrics:
        elements = metric.find_elements(By.TAG_NAME, value="td")
        index = 0
        metric_name = ""
        for element in elements:
            if index == 0:
                metric_name = element.text
            else:
                if element.text == "X":
                    admission_info["selection_of_students"][metric_name] = importance[index - 1]
                    break
            index += 1
    
    # Profile of Fall Application

    fall_app_prof = all_info[3]
    prof_contents = fall_app_prof.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl")

    for prof_content in prof_contents:
        try:
            text_title = prof_content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            text_info = prof_content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
            admission_info[text_title] = text_info
        except:
            pass

        try:
            histogram = prof_content.find_element(By.CLASS_NAME, value="Histogram_container__12lhQ")
        except:
            continue

        try:
            histogram_title = prof_content.find_element(By.CLASS_NAME, value="CollegeProfileContent_barGraphTitle__1uzxw").text
            print(histogram_title)
            histogram_title_list = histogram_title.split(": ")
            test_title = ""
            overall_intro = ""
            if len(histogram_title_list) == 1:
                test_title = "ACT"
                overall_intro = histogram_title_list[0]
            else:
                [test_title, overall_intro] = histogram_title_list
            admission_info[test_title] = {
                "overall": overall_intro,
                "data": histogram_scrape(prof_content)
            }
            # print(admission_info)
        except:
            admission_info["GPA"] = histogram_scrape(prof_content)

    return admission_info

# histogram process function
def histogram_scrape(prof_content):
    """Return the histogram inside `prof_content` as a {label: value} dict.

    Labels and values are read from the first histogram container found and
    paired positionally; if the two lists differ in length the extra items
    are ignored.
    """
    container = prof_content.find_element(By.CLASS_NAME, value="Histogram_container__12lhQ")
    labels = container.find_elements(By.CLASS_NAME, value="Histogram_label__1j_Ku")
    values = container.find_elements(By.CLASS_NAME, value="Histogram_value__3cCAB")
    return {label.text: value.text for label, value in zip(labels, values)}

## Financials page function

In [148]:
def get_financials(financials_body, sleep=0.5):
    """Scrape the money-matters page of a college profile into a dict.

    Args:
        financials_body: Element (the page <body>) to search within.
        sleep: Accepted for signature parity with the other page scrapers;
            not used by this function.

    Returns:
        dict containing
        - one entry per title found in the first expandable section (the link
          href when the block has a link, otherwise the value text),
        - "forms required": {form: cost} when a forms table is present,
        - "freshman_19_20_profile" / "undergraduate_19_20_profile": filled by
          profile_financial_aid_scrape from content blocks 0-7 and 8-15 of the
          second section,
        - one entry per title/value pair among the remaining blocks of the
          second section,
        - the last two title/value blocks of the third section
          ("Work-Study Programs" is stored as a list of strings).

    Raises:
        IndexError: if the page has fewer than three expandable sections.
        Any lookup error raised while reading the third section's blocks.
    """
    content_selector = ".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl"
    financials_info = {}

    all_info = financials_body.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_expandableChildren__3bIKY")

    # Applying for financial aid

    for content in all_info[0].find_elements(By.CSS_SELECTOR, value=content_selector):
        # `except Exception` (not a bare except) keeps the best-effort skipping
        # while still letting Ctrl-C / SystemExit stop a long scrape.
        try:
            fin_app_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            try:
                # Prefer the link target when the block carries a link.
                link_info = content.find_element(By.CLASS_NAME, value="TitleValue_link__1veWn")
                financials_info[fin_app_title] = link_info.get_attribute('href')
            except Exception:
                financials_info[fin_app_title] = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            pass

        try:
            fin_app_table = content.find_element(By.CLASS_NAME, value="Table_unlined__29Tmx")
            financials_info["forms required"] = {}
            for table_row in fin_app_table.find_elements(By.CSS_SELECTOR, value="tbody > tr"):
                cells = table_row.find_elements(By.TAG_NAME, value="td")
                financials_info["forms required"][cells[0].text] = cells[1].text
        except Exception:
            continue

    # Profile of 19-20 financial aid

    aid_19_contents = all_info[1].find_elements(By.CSS_SELECTOR, value=content_selector)

    financials_info["freshman_19_20_profile"] = {}
    financials_info["undergraduate_19_20_profile"] = {}

    # Blocks 0-7 go to the freshman profile and 8-15 to the undergraduate one;
    # this relies on the page's fixed block order.
    profile_financial_aid_scrape(aid_19_contents[0:8], financials_info, "freshman_19_20_profile")
    profile_financial_aid_scrape(aid_19_contents[8:16], financials_info, "undergraduate_19_20_profile")

    for content in aid_19_contents[16:]:
        try:
            remain_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            remain_info = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            continue
        financials_info[remain_title] = remain_info

    # Financial aid programs (only need last 2 attributes)

    program_contents = all_info[2].find_elements(By.CSS_SELECTOR, value=content_selector)[-2:]

    for content in program_contents:
        program_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
        if program_title == "Work-Study Programs":
            # This block can hold several values; keep all of them as text.
            financials_info[program_title] = [
                value_element.text
                for value_element in content.find_elements(By.CLASS_NAME, value="TitleValue_value__1JT0d")
            ]
        else:
            financials_info[program_title] = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text

    return financials_info

def profile_financial_aid_scrape(contents, financials_info, profile_name):
    """Copy the title/value blocks in `contents` into financials_info[profile_name].

    For each block the title becomes the key. The stored value is
    - {"overall": <value>, "details": {"Need-Based Gift": ...,
      "Need-Based Self-Help": ...}} when the block has nested containers at
      positions 1 and 2, otherwise
    - the plain value text ("Merit-Based Gift" joins all of its values with
      ", ").
    Blocks without a title or value are skipped.

    Args:
        contents: Iterable of content elements to read.
        financials_info: Result dict; financials_info[profile_name] must
            already be a dict and is mutated in place.
        profile_name: Key of the sub-dict to fill.

    Returns:
        None.
    """
    for content in contents:
        # `except Exception` rather than a bare except: keeps the best-effort
        # skipping but no longer swallows KeyboardInterrupt / SystemExit.
        try:
            aid_19_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            if aid_19_title == "Merit-Based Gift":
                # This block holds several values; flatten them into one string.
                aid_19_info = ", ".join(
                    value_element.text
                    for value_element in content.find_elements(By.CLASS_NAME, value="TitleValue_value__1JT0d")
                )
            else:
                aid_19_info = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text

            try:
                nested_info = content.find_elements(By.CLASS_NAME, value="TitleValue_nestedContainer__1_Fjk")
                # Index 0 is deliberately not read here -- presumably it is the
                # enclosing container; TODO confirm against the page markup.
                # A block with fewer than three containers raises IndexError
                # and falls through to the plain-value branch below.
                nested_dict = {
                    "Need-Based Gift": nested_info[1].find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text,
                    "Need-Based Self-Help": nested_info[2].find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text,
                }
                financials_info[profile_name][aid_19_title] = {
                    "overall": aid_19_info,
                    "details": nested_dict
                }
            except Exception:
                financials_info[profile_name][aid_19_title] = aid_19_info
        except Exception:
            continue

## Test

In [149]:
# Smoke test on a single college: opens its profile pages in Chrome and prints
# the scraped financial-aid dict (see the cell output below).
get_general_college_data("Abilene Christian University", "https://waf.collegedata.com/college-search/Abilene-Christian-University")

  # This is added back by InteractiveShellApp.init_path()


{'Website': 'https://www.acu.edu/admissions-aid/undergraduate/financial-aid.html', 'Net Price Calculator': 'https://www.highered.texas.gov/apps/NPC/?Fice=003537', 'Application Deadline': 'Rolling', 'Award Notification': 'On a rolling basis beginning April 1', 'High School Program': 'College preparatory program is required', 'Methodology For Awarding Institutional Aid': 'Federal Methodology', 'forms required': {'FAFSA Code is 003537': 'Free'}, 'freshman_19_20_profile': {'Financial Aid Applicants': '830 (97.3%) of freshmen', 'Found to Have Financial Need': '669 (80.6%) of applicants', 'Received Financial Aid': '669 (100.0%) of applicants with financial need', 'Need Fully Met': '209 (31.2%) of aid recipients', 'Average Percent of Need Met': '70%', 'Average Award': {'overall': '$26,813', 'details': {'Need-Based Gift': 'Received by 666 (99.6%) of aid recipients, average amount $25,009', 'Need-Based Self-Help': 'Received by 389 (58.1%) of aid recipients, average amount $3,297'}}, 'Merit-Base