In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

import time
import json

In [2]:
# Filesystem locations used by the scraper, relative to the notebook.
DRIVER_PATH = "../driver/chromedriver"  # chromedriver binary passed to webdriver.Chrome
OUTPUT_PATH = "./data/collegedata_general.json"  # target of the final JSON export

# Accumulator filled by get_general_college_data: college name -> scraped pages.
collegedata_general = dict()

# Labels for the columns of the "selection of students" table;
# get_admission picks the label by the column in which an "X" is found.
importance = [
    "Very Important",
    "Important",
    "Considered",
    "Not Considered",
]

## General Information Scrape Function

In [3]:
# Scrape function (called once per college; the links come from college_links.json):
# 1. open each profile page of the college
# 2. scrape each page
# 3. store the result in collegedata_general and quit the browser

def get_general_college_data(college_name, college_link, sleep=0.5):
    """Scrape the five profile pages of one college into ``collegedata_general``.

    A Chrome session is started for this college only. Each page
    (``/admission``, ``/money-matters``, ``/academics``, ``/campus-life``,
    ``/students``) is opened, its ``<body>`` is awaited and handed to the
    matching ``get_*`` scraper. A page whose body never appears, or whose
    scraper fails, keeps an empty dict so one broken page does not discard
    the others; the failure is reported on stdout instead of being dropped.

    Args:
        college_name: Key under which the result is stored in
            ``collegedata_general``.
        college_link: URL prefix of the college profile; the page suffixes
            are appended to it.
        sleep: Seconds to pause after each navigation; also forwarded to the
            page scrapers.

    Returns:
        None. The result is written to ``collegedata_general[college_name]``.

    Raises:
        Exception: errors raised while starting the browser or navigating
            (``driver.get``) propagate unchanged; the browser is shut down
            before they leave this function.
    """
    # (result key, URL suffix, page scraper) in the order the pages are visited.
    pages = (
        ("admission", "/admission", get_admission),
        ("financials", "/money-matters", get_financials),
        ("academics", "/academics", get_academics),
        ("campus_life", "/campus-life", get_campus_life),
        ("students", "/students", get_students),
    )
    # Same timeout for every page (the Students page used to pass `sleep` here).
    body_timeout = 1

    cur_college_data = {}
    for key, _suffix, _scraper in pages:
        cur_college_data[key] = {}

    # NOTE(review): `executable_path` is kept as in the original; newer
    # Selenium releases expect a Service object instead - confirm against the
    # Selenium version this notebook is pinned to.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH)
    try:
        for key, suffix, scrape_page in pages:
            driver.get(college_link + suffix)
            time.sleep(sleep)
            try:
                page_body = WebDriverWait(driver, body_timeout).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                cur_college_data[key] = scrape_page(page_body, sleep)
            except Exception as exc:
                # Best effort: keep the empty dict for this page, but say why.
                # `Exception` (not a bare except) lets KeyboardInterrupt stop
                # a long scrape.
                print("[warn] {}: could not scrape '{}' page ({!r})".format(college_name, key, exc))
    finally:
        # Always release the browser, even when navigation fails.
        driver.quit()

    collegedata_general[college_name] = cur_college_data


## Admission page function

In [4]:
def get_admission(admission_body, sleep=0.5):
    """Scrape the admission page of a college profile.

    Sections are addressed by their position among the expandable blocks of
    the page: 1 = applying for admission, 2 = selection of students,
    3 = profile of fall applicants. A section that is missing is skipped
    instead of aborting the whole page.

    Args:
        admission_body: ``<body>`` element of the admission page.
        sleep: Seconds to pause after the requirement tables were read.

    Returns:
        dict with the fixed keys ``high_school_req_rec_data``,
        ``examinations`` and ``selection_of_students`` plus one entry per
        title/value card found. Histogram cards are stored under the test name
        taken from their title (``"ACT"`` when the title has no ``": "``), or
        under ``"GPA"`` when the card has no title element.

    Raises:
        Exception: only if the initial lookup of the expandable sections fails.
    """
    admission_info = {}

    all_info = admission_body.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_expandableChildren__3bIKY")

    # Freshman admission requirements

    admission_info["high_school_req_rec_data"] = {}
    admission_info["examinations"] = {}

    # Both requirement tables share one selector: the first match is the
    # high-school units table, the second the examinations table.
    try:
        freshman_req_tables = admission_body.find_elements(By.CSS_SELECTOR, value=".Table_unlined__29Tmx > tbody")
    except Exception:
        freshman_req_tables = []

    try:
        high_school_units = {}
        # find_elements(By...) replaces find_elements_by_css_selector, which
        # newer Selenium releases no longer provide.
        for unit in freshman_req_tables[0].find_elements(By.CSS_SELECTOR, value="tr"):
            cells = unit.find_elements(By.CSS_SELECTOR, value="td")
            high_school_units[cells[0].text] = {
                "required": _admission_units(cells[1].text),
                "recommended": _admission_units(cells[2].text),
            }
        admission_info["high_school_req_rec_data"] = high_school_units
    except Exception:
        # All-or-nothing, as before: a malformed table yields an empty dict.
        admission_info["high_school_req_rec_data"] = {}

    try:
        examinations = {}
        for exam in freshman_req_tables[1].find_elements(By.TAG_NAME, value="tr"):
            exam_cells = exam.find_elements(By.TAG_NAME, value="td")
            examinations[exam_cells[0].text] = {
                "Required Units": exam_cells[1].text,
                "Due in Admissions Office": exam_cells[2].text,
            }
        admission_info["examinations"] = examinations
    except Exception:
        admission_info["examinations"] = {}

    time.sleep(sleep)

    # Applying for admission

    for adm_content in _admission_section_contents(all_info, 1):
        try:
            title, info = _admission_title_value(adm_content)
        except Exception:
            continue
        admission_info[title] = info

    # Selection of students

    admission_info["selection_of_students"] = {}
    try:
        selection = {}
        selection_table = all_info[2].find_element(By.TAG_NAME, value="tbody")
        for metric in selection_table.find_elements(By.TAG_NAME, value="tr"):
            cells = metric.find_elements(By.TAG_NAME, value="td")
            if not cells:
                continue
            metric_name = cells[0].text
            # The first cell is the metric name; the column holding the "X"
            # selects the importance label.
            for column, cell in enumerate(cells[1:]):
                if cell.text == "X":
                    selection[metric_name] = importance[column]
                    break
        admission_info["selection_of_students"] = selection
    except Exception:
        admission_info["selection_of_students"] = {}

    # Profile of Fall Application

    for prof_content in _admission_section_contents(all_info, 3):
        try:
            text_title, text_info = _admission_title_value(prof_content)
            admission_info[text_title] = text_info
        except Exception:
            pass

        try:
            prof_content.find_element(By.CLASS_NAME, value="Histogram_container__12lhQ")
        except Exception:
            continue  # this card has no histogram

        try:
            histogram_title = prof_content.find_element(By.CLASS_NAME, value="CollegeProfileContent_barGraphTitle__1uzxw").text
        except Exception:
            histogram_title = None  # an untitled histogram is stored as GPA

        try:
            histogram_data = histogram_scrape(prof_content)
        except Exception:
            continue  # unreadable histogram: skip the card, keep the rest

        if histogram_title is None:
            admission_info["GPA"] = histogram_data
            continue

        # Split on the first ": " only, so a title containing it more than
        # once is no longer mistaken for the GPA histogram.
        test_title, separator, overall_intro = histogram_title.partition(": ")
        if not separator:
            test_title, overall_intro = "ACT", histogram_title
        admission_info[test_title] = {
            "overall": overall_intro,
            "data": histogram_data
        }

    return admission_info


def _admission_units(text):
    """Convert a units cell to int; an empty cell becomes None."""
    return int(text) if text != '' else None


def _admission_title_value(content):
    """Return the (title, value) texts of one title/value card."""
    title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
    value = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
    return title, value


def _admission_section_contents(sections, position):
    """Return the content cards of ``sections[position]``, or [] if that section is absent."""
    if position >= len(sections):
        return []
    return sections[position].find_elements(
        By.CSS_SELECTOR,
        value=".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl",
    )

# histogram process function
def histogram_scrape(prof_content):
    """Read the histogram inside ``prof_content`` into a ``{label: value}`` dict.

    Labels and values are looked up separately inside the histogram container
    and paired by position; surplus entries on either side are ignored.
    Raises whatever ``find_element`` raises when there is no container.
    """
    container = prof_content.find_element(By.CLASS_NAME, value="Histogram_container__12lhQ")
    labels = container.find_elements(By.CLASS_NAME, value="Histogram_label__1j_Ku")
    values = container.find_elements(By.CLASS_NAME, value="Histogram_value__3cCAB")
    return {label.text: value.text for label, value in zip(labels, values)}

## Financials page function

In [5]:
def get_financials(financials_body, sleep=0.5):
    """Scrape the money-matters page of a college profile.

    Sections are addressed by their position among the expandable blocks of
    the page: 0 = applying for financial aid, 1 = profile of 19-20 financial
    aid, 2 = financial aid programs. A missing section, or a card that cannot
    be read, is skipped instead of aborting the whole page.

    Args:
        financials_body: ``<body>`` element of the money-matters page.
        sleep: Unused here; accepted so every page scraper can be called with
            the same arguments.

    Returns:
        dict with ``freshman_19_20_profile`` and
        ``undergraduate_19_20_profile`` sub-dicts, ``forms required`` when a
        forms table is present, and one entry per title/value card found.

    Raises:
        Exception: only if the initial lookup of the expandable sections fails.
    """
    financials_info = {}

    all_info = financials_body.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_expandableChildren__3bIKY")

    # Applying for financial aid

    for content in _financials_section_contents(all_info, 0):
        try:
            fin_app_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            try:
                # Prefer the link target when the card holds a link.
                link_info = content.find_element(By.CLASS_NAME, value="TitleValue_link__1veWn")
                financials_info[fin_app_title] = link_info.get_attribute('href')
            except Exception:
                financials_info[fin_app_title] = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            pass

        try:
            fin_app_table = content.find_element(By.CLASS_NAME, value="Table_unlined__29Tmx")
            financials_info["forms required"] = {}
            for table_element in fin_app_table.find_elements(By.CSS_SELECTOR, value="tbody > tr"):
                table_element_detail = table_element.find_elements(By.TAG_NAME, value="td")
                financials_info["forms required"][table_element_detail[0].text] = table_element_detail[1].text
        except Exception:
            continue

    # Profile of 19-20 financial aid

    aid_19_contents = _financials_section_contents(all_info, 1)

    financials_info["freshman_19_20_profile"] = {}
    financials_info["undergraduate_19_20_profile"] = {}

    # The cards are split by position: the first 8 are filed under the
    # freshman profile, the next 8 under the undergraduate profile, and the
    # rest are plain title/value cards.
    profile_financial_aid_scrape(aid_19_contents[0:8], financials_info, "freshman_19_20_profile")
    profile_financial_aid_scrape(aid_19_contents[8:16], financials_info, "undergraduate_19_20_profile")

    for content in aid_19_contents[16:]:
        try:
            remain_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            remain_info = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
            financials_info[remain_title] = remain_info
        except Exception:
            continue

    # Financial aid programs (only need last 2 attributes)

    for content in _financials_section_contents(all_info, 2)[-2:]:
        # Guarded like every other card loop: one unreadable card must not
        # throw away everything scraped above.
        try:
            program_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            if program_title == "Work-Study Programs":
                program_values = content.find_elements(By.CLASS_NAME, value="TitleValue_value__1JT0d")
                financials_info[program_title] = [value.text for value in program_values]
            else:
                financials_info[program_title] = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            continue

    return financials_info


def _financials_section_contents(sections, position):
    """Return the content cards of ``sections[position]``, or [] if that section is absent."""
    if position >= len(sections):
        return []
    return sections[position].find_elements(
        By.CSS_SELECTOR,
        value=".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl",
    )

def profile_financial_aid_scrape(contents, financials_info, profile_name):
    """Copy a group of financial-aid cards into ``financials_info[profile_name]``.

    For each card the title and its value are read; a "Merit-Based Gift" card
    may hold several values, which are joined with ", ". When the card also
    has nested containers, the entry becomes
    ``{"overall": value, "details": {...}}``; otherwise it is the plain value.
    Cards without a readable title or value are skipped.

    Args:
        contents: Card elements to read.
        financials_info: Result dict, modified in place.
        profile_name: Key of the sub-dict to fill; it is created when absent.

    Returns:
        None.
    """
    profile = financials_info.setdefault(profile_name, {})

    for content in contents:
        try:
            aid_19_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            if aid_19_title == "Merit-Based Gift":
                merit_values = content.find_elements(By.CLASS_NAME, value="TitleValue_value__1JT0d")
                aid_19_info = ", ".join([value.text for value in merit_values])
            else:
                aid_19_info = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            # `Exception` rather than a bare except, so an interrupt still
            # stops the scrape.
            continue

        try:
            nested_info = content.find_elements(By.CLASS_NAME, value="TitleValue_nestedContainer__1_Fjk")
            # NOTE(review): containers 1 and 2 are read and container 0 is
            # skipped, as in the original - presumably container 0 is a
            # wrapper; confirm against the page markup.
            nested_dict = {
                "Need-Based Gift": nested_info[1].find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text,
                "Need-Based Self-Help": nested_info[2].find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text,
            }
        except Exception:
            # No (complete) nested breakdown: store the plain value.
            profile[aid_19_title] = aid_19_info
        else:
            profile[aid_19_title] = {
                "overall": aid_19_info,
                "details": nested_dict
            }

## Academics page function

In [6]:
def get_academics(academics_body, sleep=0.5):
    """Scrape the academics page of a college profile.

    Sections are addressed by their position among the expandable blocks of
    the page: 0 = undergraduate education, 1 = curriculum and graduation
    requirements, 2 = faculty and instruction, 3 = advanced placement,
    4 = academic resources, 5 = academic support services. A missing section
    is skipped instead of aborting the whole page.

    Args:
        academics_body: ``<body>`` element of the academics page.
        sleep: Unused here; accepted so every page scraper can be called with
            the same arguments.

    Returns:
        dict with ``"General Catalogue / Bulletin"`` (link, or ``""`` when it
        cannot be found) plus one entry per card: a string for "Study Abroad"
        and for the cards of sections 1 and 3-5, a list of strings for the
        other undergraduate-education cards, and a ``{label: value}`` dict per
        bar graph of section 2.

    Raises:
        Exception: only if the lookup of the expandable sections fails.
    """
    academics_info = {}
    catalog_link = ""
    try:
        academic_overview = academics_body.find_elements(By.CSS_SELECTOR, value=".AcademicsContent_contentCard__L5fPf > .CollegeProfileContent_content__1hJCl")
        # The catalogue link is taken from the fourth overview card.
        catalog_link = academic_overview[3].find_element(By.TAG_NAME, value="a").get_attribute('href')
    except Exception:
        pass
    academics_info["General Catalogue / Bulletin"] = catalog_link

    all_info = academics_body.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_expandableChildren__3bIKY")

    # Undergraduate Education: "Study Abroad" is a single value, every other
    # card is stored as a list of its values.
    for content in _academics_section_contents(all_info, 0):
        try:
            ug_edu_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            if ug_edu_title == "Study Abroad":
                academics_info[ug_edu_title] = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
            else:
                ug_edu_values = content.find_elements(By.CLASS_NAME, value="TitleValue_value__1JT0d")
                academics_info[ug_edu_title] = [value.text for value in ug_edu_values]
        except Exception:
            continue

    # Curriculum and graduation requirements
    _academics_collect_title_values(_academics_section_contents(all_info, 1), academics_info)

    # Faculty and instruction: one {label: value} dict per bar graph.
    for content in _academics_section_contents(all_info, 2):
        try:
            barplot_title = content.find_element(By.CLASS_NAME, value="CollegeProfileContent_barGraphTitle__1uzxw").text
            barplot_attr = content.find_elements(By.CLASS_NAME, value="Histogram_label__1j_Ku")
            barplot_data = content.find_elements(By.CLASS_NAME, value="Histogram_value__3cCAB")
            academics_info[barplot_title] = {
                each_attr.text: each_data.text
                for each_attr, each_data in zip(barplot_attr, barplot_data)
            }
        except Exception:
            continue

    # Advanced placement, academic resources, academic support services
    for position in (3, 4, 5):
        _academics_collect_title_values(_academics_section_contents(all_info, position), academics_info)

    return academics_info


def _academics_section_contents(sections, position):
    """Return the content cards of ``sections[position]``, or [] if that section is absent."""
    if position >= len(sections):
        return []
    return sections[position].find_elements(
        By.CSS_SELECTOR,
        value=".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl",
    )


def _academics_collect_title_values(contents, target):
    """Store the title -> value text of every readable card of ``contents`` in ``target``."""
    for content in contents:
        try:
            title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            value = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            continue  # cards without a title/value pair are skipped
        target[title] = value


## Campus life page function

In [7]:
def get_campus_life(campus_life_body, sleep=0.5):
    """Scrape the campus-life page of a college profile.

    Sections are addressed by their position among the expandable blocks of
    the page: 0 = location and setting, 1 = housing, 2 = security,
    3 = personal support services, 4 = sports & recreation. Every
    title/value card of sections 0-3 is kept; from section 4 only the
    "Mascot" card is kept. A missing section is skipped instead of aborting
    the whole page.

    Args:
        campus_life_body: ``<body>`` element of the campus-life page.
        sleep: Unused here; accepted so every page scraper can be called with
            the same arguments.

    Returns:
        dict mapping card titles to their value text.

    Raises:
        Exception: only if the lookup of the expandable sections fails.
    """
    campus_life_info = {}

    all_info = campus_life_body.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_expandableChildren__3bIKY")

    # Location and setting, Housing, Security, Personal support services
    for position in range(4):
        for content in _campus_section_contents(all_info, position):
            try:
                title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
                info = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
            except Exception:
                # `Exception` rather than a bare except, so an interrupt
                # still stops the scrape.
                continue
            campus_life_info[title] = info

    # Sports & Recreation: stop at the first readable "Mascot" card.
    for content in _campus_section_contents(all_info, 4):
        try:
            sport_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            if sport_title == "Mascot":
                campus_life_info[sport_title] = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
                break
        except Exception:
            continue

    return campus_life_info


def _campus_section_contents(sections, position):
    """Return the content cards of ``sections[position]``, or [] if that section is absent."""
    if position >= len(sections):
        return []
    return sections[position].find_elements(
        By.CSS_SELECTOR,
        value=".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl",
    )

## Students page function

In [8]:
def get_students(students_body, sleep=0.5):
    """Scrape the students page of a college profile.

    Sections are addressed by their position among the expandable blocks of
    the page: 0 = student activities (only the "Sororities" and
    "Fraternities" cards are kept) and 3 = after graduation (every
    title/value card is kept). A missing section is skipped instead of
    aborting the whole page.

    Args:
        students_body: ``<body>`` element of the students page.
        sleep: Unused here; accepted so every page scraper can be called with
            the same arguments.

    Returns:
        dict mapping card titles to their value text.

    Raises:
        Exception: only if the lookup of the expandable sections fails.
    """
    students_info = {}

    all_info = students_body.find_elements(By.CSS_SELECTOR, value=".CollegeProfileContent_expandableChildren__3bIKY")

    # Student Activities
    for content in _students_section_contents(all_info, 0):
        try:
            activity_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            if activity_title in ("Sororities", "Fraternities"):
                students_info[activity_title] = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            # `Exception` rather than a bare except, so an interrupt still
            # stops the scrape.
            continue

    # After Graduation
    for content in _students_section_contents(all_info, 3):
        try:
            aft_grad_title = content.find_element(By.CLASS_NAME, value="TitleValue_title__2-afK").text
            aft_grad_info = content.find_element(By.CLASS_NAME, value="TitleValue_value__1JT0d").text
        except Exception:
            continue
        students_info[aft_grad_title] = aft_grad_info

    return students_info


def _students_section_contents(sections, position):
    """Return the content cards of ``sections[position]``, or [] if that section is absent."""
    if position >= len(sections):
        return []
    return sections[position].find_elements(
        By.CSS_SELECTOR,
        value=".CollegeProfileContent_content__1hJCl > .CollegeProfileContent_content__1hJCl",
    )

## Test

In [9]:
# get_general_college_data("Abilene Christian University", "https://waf.collegedata.com/college-search/Abilene-Christian-University")

  if __name__ == '__main__':
  if sys.path[0] == '':
  


## Scrape General Information

In [None]:
# Read the link list first so the file is closed before the long scrape
# starts. JSON is decoded as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open("./data/college_links.json", encoding="utf-8") as f:
    links = json.load(f)

# One browser session per college; the printed number is the 1-based count of
# colleges finished so far.
for index, each_college in enumerate(links["colleges"], start=1):
    get_general_college_data(each_college["college_name"], each_college["link"])
    print(index)

## Export as json File

In [185]:
# Write everything collected in collegedata_general to OUTPUT_PATH as
# 4-space-indented JSON.
with open(OUTPUT_PATH, "w") as out_file:
    json.dump(collegedata_general, out_file, indent=4)