# College Tour Information Scraper

Gathers information from the https://www.youvisit.com/collegesearch/ website



### Messing around with the API

There are open API endpoints that we can exploit

Get the institution id of each college; the id gives us the direct link to that college's tour page.

In [1]:
import requests
import json
import math
from tqdm import tqdm

In [2]:
# Probe request: ask for a single record (size=1) so the response stays small.
link = r"https://search.youvisit.com/institution-profiles?size=1&page=0"
# Browser-style User-Agent header sent with the API requests.
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# Without a timeout, requests will wait indefinitely on a stalled connection.
response = requests.get(link, headers=header, timeout=30)
# Fail here on an HTTP error status instead of later while parsing the body.
response.raise_for_status()

In [3]:
# Decode the body of the probe request; the record count sits under "data".
json_response = json.loads(response.text)
data_section = json_response["data"]
total_schools = data_section["totalRecords"]
# assume a default page size of 10 items; round up so a partial last page is counted
records_per_page = 10
total_pages = math.ceil(total_schools / records_per_page)

In [5]:
# Collect the inst_id of every record whose "has_virtual_tour" flag is truthy.
# NOTE: the name "univeristy_ids" is misspelled, but later cells read it under
# this exact name, so it is kept as-is.
univeristy_ids = []
# A single Session reuses the underlying connection across the page requests
# instead of opening a new one for every page.
with requests.Session() as session:
    for page_no in tqdm(range(total_pages)):
        link = r"https://search.youvisit.com/institution-profiles?size=10&page={}".format(page_no)
        # A timeout keeps one stalled page from hanging the whole crawl.
        r = session.get(link, headers=header, timeout=30)
        # Stop on an HTTP error status rather than trying to parse an error body.
        r.raise_for_status()
        json_r = json.loads(r.text)
        records = json_r["data"]["records"]
        for record in records:
            if record["has_virtual_tour"]:
                univeristy_ids.append(record["inst_id"])

100%|████████████████████████████████████████████████████████████████████████████████| 826/826 [01:34<00:00,  8.73it/s]


In [9]:
# Report how many of the listed schools actually have a virtual tour.
summary_template = "out of the total number of schools: {}, only {} had tours."
tour_count = len(univeristy_ids)
print(summary_template.format(total_schools, tour_count))

out of the total number of schools: 8260, only 647 had tours.


In [193]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [11]:
# Path handed to webdriver.Chrome(...) when the browser session is created.
DRIVER_PATH = "./chromedriver"
# Landing page the browser opens first, before the login flow.
WEBSITE_URL = "https://www.youvisit.com/collegesearch/"

In [12]:
def wait_for_element(by_selector, selector, seconds=10):
    """
    Wait for an element to be present on the current page and return it.

    Uses the module-level ``driver``, so a browser session must already exist.

    :param: by_selector The method to use for selecting.
    :param: selector The string selector to use.
    :param: seconds How long to wait until a timeout is thrown.
    :return: The element located by the selector.
    :raises Exception: If the element is not present within ``seconds``.
    """
    # Imported locally so this cell does not depend on the import cell changing.
    from selenium.common.exceptions import TimeoutException

    try:
        return WebDriverWait(driver, seconds).until(
            EC.presence_of_element_located((by_selector, selector)))
    except TimeoutException as err:
        # Only a timeout means "not found". Other WebDriver errors propagate
        # unchanged so they are not misreported as a missing selector.
        raise Exception(
            "Could not find the specified selector '{}' using '{}'".format(selector, by_selector)
        ) from err

In [13]:
def click_on_element_from_selector(by_selector, selector, seconds=10):
    """
    Helper function to wait for an element and click on it.

    :param: by_selector The method to use for selecting.
    :param: selector The string selector to use.
    :param: seconds How long to wait until a timeout is thrown.
    """
    # Forward the caller's timeout; it was previously ignored in favour of a
    # hard-coded 10 seconds.
    element = wait_for_element(by_selector, selector, seconds=seconds)
    element.click()

In [14]:
def send_keys_to_element_from_selector(by_selector, selector, text, seconds=10):
    """
    Helper function to wait for an element and send it key inputs.

    :param: by_selector The method to use for selecting.
    :param: selector The string selector to use.
    :param: text The string to send as input.
    :param: seconds How long to wait until a timeout is thrown.
    """
    # Forward the caller's timeout; it was previously ignored in favour of a
    # hard-coded 10 seconds.
    element = wait_for_element(by_selector, selector, seconds=seconds)
    element.send_keys(text)

### Logging into the website

After digging through the network traffic, I found an open API endpoint that gives direct links to each of the tour pages without having to log in. Logging in is therefore optional, but it still makes things easier.

In [174]:
# Create the Chrome session. `driver` is a module-level global that
# wait_for_element reads, so it must exist before any helper is called.
driver = webdriver.Chrome(DRIVER_PATH)
driver.get(WEBSITE_URL)
# Position-based XPath under the page root element; it will stop matching if
# the page layout changes.
account_button_xpath = r"//*[@id='yv.com-cs-root']/div[1]/div[2]/div[1]/div[2]/button"
click_on_element_from_selector(By.XPATH, account_button_xpath, seconds=10)

In [175]:
# Click the "email" button (presumably the sign-in-with-email option; confirm
# against the live page). Absolute XPath from the document root, so it is
# sensitive to any layout change.
email_button_xpath = r"/html/body/div[2]/div[1]/div/div/div/div/div/div/div/div/div[2]/div[6]/button"
click_on_element_from_selector(By.XPATH, email_button_xpath, seconds=10)

In [176]:
# The email text input is contained within an iframe that we have to switch into to get the content inside.
email_iframe_xpath = r"/html/body/div[13]/div/div/iframe"
email_iframe_element = wait_for_element(By.XPATH, email_iframe_xpath, seconds=10)
driver.switch_to.frame(email_iframe_element)

In [177]:
# Type a placeholder address into the email input. The XPath is presumably
# relative to the iframe switched into beforehand, so that cell must run first.
email_input_xpath = r"/html/body/div[1]/div[2]/div/div[1]/div[2]/div[2]/div[1]/input"
send_keys_to_element_from_selector(By.XPATH, email_input_xpath, "email@gmail.com", seconds=10)

In [178]:
# Submit the email step by clicking the form's button.
email_submit_button_xpath = r"/html/body/div[1]/div[2]/div/div[2]/div/div/button"
click_on_element_from_selector(By.XPATH, email_submit_button_xpath, seconds=10)

In [179]:
# Fill the first-name input with a throwaway value.
first_name_input_xpath = r"/html/body/div[1]/div[2]/div/div[1]/div[2]/div[3]/div[1]/input"
send_keys_to_element_from_selector(By.XPATH, first_name_input_xpath, "asdfwer", seconds=10)

In [180]:
# Fill the last-name input with a throwaway value.
last_name_input_xpath = r"/html/body/div[1]/div[2]/div/div[1]/div[2]/div[4]/div[1]/input"
send_keys_to_element_from_selector(By.XPATH, last_name_input_xpath, "asdfwer", seconds=10)

In [181]:
# Fill the birthdate input with a fixed placeholder date.
birthdate_xpath = r"/html/body/div[1]/div[2]/div/div[1]/div[2]/div[5]/div[1]/input"
send_keys_to_element_from_selector(By.XPATH, birthdate_xpath, "10/10/2000", seconds=10)

In [182]:
# Fill the zip code input; the value is sent as a string, so the leading zero is kept.
zipcode_xpath = r"/html/body/div[1]/div[2]/div/div[1]/div[2]/div[6]/div[1]/input"
send_keys_to_element_from_selector(By.XPATH, zipcode_xpath, "07303", seconds=10)

In [183]:
# Submit the completed form. This XPath is identical to the one used for the
# earlier email submit button.
submit_button_xpath = r"/html/body/div[1]/div[2]/div/div[2]/div/div/button"
click_on_element_from_selector(By.XPATH, submit_button_xpath, seconds=10)

### Getting pages

In [184]:
def get_location_name_list():
    """
    Return the ``li`` elements directly under the ``ul`` inside ``jumpMenu``.

    Waits up to 10 seconds for the ``ul`` to be present on the current page.

    :return: A list of the matching elements (empty if there are none).
    """
    selector = r"//*[@id='jumpMenu']/ul"
    element = wait_for_element(By.XPATH, selector, seconds=10)
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
    # Selenium releases removed; this form works on old and new versions.
    return element.find_elements(By.XPATH, r"./li")

In [185]:
def get_list_of_locations():
    """
    Return the visible text of every entry from get_location_name_list().

    :return: A list of strings, one per entry, in the order they were found.
    """
    names = []
    for entry in get_location_name_list():
        names.append(entry.text)
    return names
        

In [188]:
def check_for_sub_tours():
    """
    Check the current page for the nested element taken to indicate sub tours.

    Waits up to 10 seconds for the container element, then looks for
    ``./div[2]/div[1]`` beneath it and prints a marker line when it exists.

    :return: True if the nested element was found, False otherwise.
    :raises Exception: Propagated from wait_for_element when the container
        element is not present in time.
    """
    selector = r"//*[@id='main']/div/div/div[3]/div[5]"

    element = wait_for_element(By.XPATH, selector, seconds=10)
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
    # Selenium releases removed; this form works on old and new versions.
    experience_element = element.find_elements(By.XPATH, "./div[2]/div[1]")
    if experience_element:
        print("AHHHHHHHHHHHHHHHH")
    # Return the result so callers can act on it; callers that ignore the
    # return value are unaffected.
    return bool(experience_element)

In [195]:
# Selector for the button that opens the accessible version of a tour.
accessibility_link_xpath = r"//*[@id='accessible_version_button']"
# Tour page URL; the placeholder is filled with an institution id.
tour_url_template = r"https://www.youvisit.com/tour/{}?fromSearch=1&&wph=1&skipPrompt=1&fromSearch=1"

# The first three ids are skipped.
for id_no in univeristy_ids[3:]:
    link = tour_url_template.format(id_no)
    driver.get(link)
    try:
        check_for_sub_tours()
    except Exception:
        # Any failure in the sub-tour check is reported and the loop carries on.
        print("No subtour div")
    # This click is not guarded: a page without the button raises and ends the loop.
    click_on_element_from_selector(By.XPATH, accessibility_link_xpath, seconds=15)
    # Not enabled yet: get_list_of_locations()

No subtour div


Exception: Could not find the specified selector '//*[@id='accessible_version_button']' using 'xpath'