# College Tour Information Scraper

Gathers information from the https://www.youvisit.com/collegesearch/ website

Contact:
Ethan Haque (ethanhaque@princeton.edu)

In [32]:
import json
import math
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### Gathering All Institution Ids

We can get the ids for each school by exploiting open API endpoints. These ids give us part of what we need to get the information contained in the tours.

In [33]:
# Browser-like User-Agent sent with every request, and the search endpoint
# queried for institution profiles.
# NOTE(review): the "Chrome/a39" token looks like a typo for "Chrome/39";
# it is kept unchanged here — confirm before altering what the server sees.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/a39.0.2171.95 Safari/537.36'
    ),
}
link = "https://search.youvisit.com/institution-profiles"

In [34]:
# Single request against the institution-profiles endpoint.
# The timeout keeps a stalled connection from hanging the notebook forever
# (requests applies no timeout by default), and raise_for_status() surfaces an
# HTTP error here instead of letting it go unnoticed.
response = requests.get(link, headers=header, timeout=30)
response.raise_for_status()

In [35]:
# Paging parameters for the institution-profiles requests.
# TOTAL_RECORDS is hard-coded on purpose: it may change over time, but it is
# easy to update by hand.
TOTAL_RECORDS = 8264
RECORDS_PER_PAGE = 100
# Ceiling division in pure integer arithmetic: a final, partially filled page
# still counts as a page.
TOTAL_PAGES = -(-TOTAL_RECORDS // RECORDS_PER_PAGE)

In [44]:
# Walk every page of institution profiles and keep the id, name and url of
# each institution that is flagged as having a virtual tour.
data = []
for page_no in range(TOTAL_PAGES):
    institution_profiles = requests.get(
        link,
        # requests encodes these as "?size=<n>&page=<n>", the same query string
        # that was previously assembled by hand.
        params={"size": RECORDS_PER_PAGE, "page": page_no},
        headers=header,
        timeout=30,  # seconds; without it a stalled connection blocks forever
    )
    # Fail on an HTTP error right here rather than on a confusing JSON decoding
    # error or KeyError further down.
    institution_profiles.raise_for_status()
    json_data = institution_profiles.json()
    for record in json_data["data"]["records"]:
        if record["has_virtual_tour"]:
            data.append([record["inst_id"], record["name"], record["url"]])

In [51]:
# Tabulate the collected [id, name, url] rows: one row per institution.
institution_dataframe = pd.DataFrame(
    data=data,
    columns=[
        "institution-id",
        "institution-name",
        "institution-url",
    ],
)

### Gathering Tour Information

Using the previously gathered institution ids, we can exploit another open API endpoint to get the tours (locations) available for each institution.

In [90]:
# `link` is rebound here: from this cell on it points at the institutions API
# instead of the institution-profiles search endpoint.
link = (
    "https://api.youvisit.com"
    "/v1.2/institutions"
)

In [127]:
# getting the tours available for each institution
data = []
# Iterate the three needed columns directly; this avoids building a Series per
# row the way iterrows() does.
institution_columns = zip(
    institution_dataframe["institution-id"],
    institution_dataframe["institution-name"],
    institution_dataframe["institution-url"],
)
for institution_id, institution_name, institution_url in institution_columns:
    tour_info = requests.get(
        "{}/{}".format(link, institution_id),
        headers=header,
        timeout=30,  # seconds; without it a stalled connection blocks forever
    )
    # Surface HTTP errors immediately instead of as a JSON/KeyError below.
    tour_info.raise_for_status()
    json_data = tour_info.json()
    institution_records = json_data["data"]
    if not institution_records:
        # The API returned no entry for this institution; skip it instead of
        # failing with an IndexError and losing everything gathered so far.
        continue
    # Only the first entry of "data" is consulted for its locations.
    for location in institution_records[0]["locations"]:
        data.append([
            location["loc_id"],
            location["name"],
            institution_id,
            institution_name,
            institution_url,
        ])

In [128]:
# One row per tour location, with the owning institution's id, name and url
# carried alongside it.
stops_dataframe = pd.DataFrame(
    data=data,
    columns=[
        "location-id",
        "location-name",
        "institution-id",
        "institution-name",
        "institution-url",
    ],
)

### Gathering Stops on Tours

By tweaking the earlier API call, we can retrieve the stops on each tour from the web server.

In [137]:
# Path-and-query suffix appended to ".../<institution-id>/locations/<location-id>/"
# to request the stops of a tour location.
link_end = "stops?expand=all&allowInProgress=locations,tours&limit=1000&env=www"
# One-off request for a single hard-coded institution/location pair
# (60018 / 80314) — presumably a manual check of the endpoint; confirm whether
# it is still needed. The timeout keeps a stalled connection from hanging the
# notebook; the status is deliberately not enforced so that a stale sample id
# cannot abort a full run.
stops_info = requests.get(
    "{}/{}/locations/{}/{}".format(link, 60018, 80314, link_end),
    headers=header,
    timeout=30,
)

In [145]:
# getting the stops for each individual tour location
data = []
# Only the two id columns are needed to build each request URL.
location_keys = zip(stops_dataframe["institution-id"], stops_dataframe["location-id"])
for institution_id, location_id in location_keys:
    stops_info = requests.get(
        "{}/{}/locations/{}/{}".format(link, institution_id, location_id, link_end),
        headers=header,
        timeout=30,  # seconds; without it a stalled connection blocks forever
    )
    # Surface HTTP errors immediately instead of as a JSON/KeyError below.
    stops_info.raise_for_status()
    stops_data = stops_info.json()
    # Exactly one list of stop titles is appended per row, so `data` stays
    # aligned with the rows of stops_dataframe.
    data.append([stop["title"] for stop in stops_data["data"]])

In [147]:
# Attach the collected stop titles as a "stops" column. `data` holds one list
# per row, appended in the same order the rows of stops_dataframe were iterated.
stops_dataframe["stops"] = data

In [152]:
# Persist the result as a tab-separated, UTF-8 encoded file.
# to_csv() does not create missing directories, so make sure ./data exists
# first; otherwise the write fails only after the whole scrape has finished.
output_path = Path("./data/stops-info.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
stops_dataframe.to_csv(output_path, sep="\t", encoding="utf-8")