# College Tour Information Scraper

Gathers college tour information from the https://www.youvisit.com/collegesearch/ website.

Contact:
Ethan Haque (ethanhaque@princeton.edu)

In [None]:
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import requests
import json
import random
import queue
import pandas as pd

from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

from math import ceil
from time import sleep

from bs4 import BeautifulSoup as bs

### Gathering All Institution Ids

We can get the ids for each school by exploiting open API endpoints. These ids give us part of what we need to get the information contained in the tours.

In [None]:
# Base URL of the institution-profiles search endpoint; the `size` and `page`
# query parameters are appended to it when the pages are requested.
link = "https://search.youvisit.com/institution-profiles"

In [None]:
# List of user agents to cycle through, one per line of the text file.
# The file is read inside a `with` block so the handle is closed right away.
with open('./data/user_agents.txt') as user_agents_file:
    user_agents = user_agents_file.read().splitlines()
user_agent = random.choice(user_agents)

# can use this to automatically set total records
response = requests.get(link, headers={'User-Agent': user_agent})

In [None]:
# The record count is hard coded for now; it may drift over time, but
# updating it here is easy.
TOTAL_RECORDS = 8271
RECORDS_PER_PAGE = 100
# Ceiling division: a final, partially filled page still has to be requested.
TOTAL_PAGES = -(-TOTAL_RECORDS // RECORDS_PER_PAGE)

In [None]:
def get_insitution_data(page_no):
    """Fetch one page of institution profiles and keep those with a virtual tour.

    Args:
        page_no: Value sent as the ``page`` query parameter of the endpoint.

    Returns:
        A list of ``[inst_id, name, url]`` lists, one for every record on the
        page whose ``has_virtual_tour`` field is truthy.
    """
    # A user agent is drawn at random for every request.
    headers = {'User-Agent': random.choice(user_agents)}
    page_url = "{}?size={}&page={}".format(link, RECORDS_PER_PAGE, page_no)
    payload = json.loads(requests.get(page_url, headers=headers).text)
    return [
        [entry["inst_id"], entry["name"], entry["url"]]
        for entry in payload["data"]["records"]
        if entry["has_virtual_tour"]
    ]

In [None]:
# Request every page of institution profiles concurrently and queue up the
# rows that come back.
results = queue.Queue()
with tqdm(range(TOTAL_PAGES)) as pbar, ThreadPoolExecutor(max_workers=16) as executor:
    for page_rows in executor.map(get_insitution_data, range(TOTAL_PAGES)):
        for institution_row in page_rows:
            results.put(institution_row)
        # One tick of the progress bar per finished page.
        pbar.update(1)

In [None]:
# Drain the queue into a plain list, then persist the rows as a CSV.
combined_results = []
while True:
    try:
        combined_results.append(results.get_nowait())
    except queue.Empty:
        break
institution_dataframe = pd.DataFrame(
    combined_results,
    columns=["institution-id", "institution-name", "institution-url"],
)
institution_dataframe.to_csv("./data/inst_ids.csv", index=False)

### Gathering Tour Ids

Using the previously gathered institution ids, we can exploit another open API endpoint to get the tour ids for each individual location.

In [None]:
# URL template for a single institution's record; the placeholder is filled
# with an institution id.
locations_link = "https://api.youvisit.com/v1.2/institutions/{}"

In [None]:
def grab_institution_tour_location_info(institution_id, institution_name):
    """Look up the location ids listed for one institution.

    Args:
        institution_id: Id substituted into the institution endpoint URL.
        institution_name: Name copied unchanged into every returned row.

    Returns:
        A list of ``[institution_id, institution_name, loc_id]`` lists, one per
        entry under ``locations`` of the first ``data`` element. The list is
        empty when the response's ``data`` field is falsy.
    """
    headers = {'User-Agent': random.choice(user_agents)}
    response = requests.get(locations_link.format(institution_id), headers=headers)
    institutions = json.loads(response.text)["data"]
    if not institutions:
        return []
    return [
        [institution_id, institution_name, location["loc_id"]]
        for location in institutions[0]["locations"]
    ]
    
def tour_location_ids_helper(row):
    """Unpack a dataframe row and look up that institution's location ids.

    Args:
        row: Mapping-like row providing ``institution-id`` and
            ``institution-name``.

    Returns:
        Whatever ``grab_institution_tour_location_info`` returns for the row.
    """
    inst_id = row["institution-id"]
    inst_name = row["institution-name"]
    return grab_institution_tour_location_info(inst_id, inst_name)

In [None]:
# Look up the location ids of every institution concurrently and queue up the
# rows that come back.
results = queue.Queue()
with tqdm(range(len(institution_dataframe))) as pbar, ThreadPoolExecutor(max_workers=16) as executor:
    location_batches = executor.map(
        tour_location_ids_helper,
        [row for _, row in institution_dataframe.iterrows()],
    )
    for location_rows in location_batches:
        # An institution without locations yields an empty list: nothing to queue.
        for location_row in location_rows:
            results.put(location_row)
        pbar.update(1)

In [None]:
# Drain the queue into a plain list, then persist the rows as a CSV.
combined_results = []
while True:
    try:
        combined_results.append(results.get_nowait())
    except queue.Empty:
        break
location_dataframe = pd.DataFrame(
    combined_results,
    columns=["institution-id", "institution-name", "location-id"],
)
location_dataframe.to_csv("./data/location_ids.csv", index=False)

### Gathering Stops on Tours

By tweaking the earlier API call, we can retrieve the information about each tour's stops from the web server.

In [None]:
# URL template for the stops of one location: the first placeholder is filled
# with the institution id and the second with the location (tour) id.
tour_data_link = "https://api.youvisit.com/v1.2/institutions/{}/locations/{}/stops?expand=all&allowInProgress=locations,tours&limit=1000&env=www"

In [None]:
def save_json_response(output_path, info):
    """Serialize ``info`` as JSON into the file at ``output_path``.

    Args:
        output_path: Destination file path; an existing file is overwritten.
        info: JSON-serializable object to write.
    """
    with open(output_path, mode='w') as json_file:
        json.dump(info, json_file)
        
def grab_tour_info(institution_id, tour_id):
    """Request the stops of one tour and return the parsed JSON response.

    The response describes the stops on the tour together with the content
    for the media shown at each stop.

    Args:
        institution_id: Id substituted into the first URL placeholder.
        tour_id: Id substituted into the second (location) URL placeholder.

    Returns:
        The decoded JSON body of the response.
    """
    headers = {'User-Agent': random.choice(user_agents)}
    stops_url = tour_data_link.format(institution_id, tour_id)
    return json.loads(requests.get(stops_url, headers=headers).text)

def save_tour_info(institution_id, tour_id, institution_name):
    """Fetch the stops payload of a tour and store it as a JSON file.

    The file is written to ``./data/tour_media_info`` and is named
    ``<institution_name>-<institution_id>-<tour_id>.json``.

    Args:
        institution_id: Institution id used for the request and the file name.
        tour_id: Tour (location) id used for the request and the file name.
        institution_name: Name used as the leading part of the file name.
    """
    payload = grab_tour_info(institution_id, tour_id)
    destination = "{}/{}-{}-{}.json".format(
        "./data/tour_media_info", institution_name, institution_id, tour_id
    )
    save_json_response(destination, payload)
    
def save_tour_info_helper(row):
    """Unpack a dataframe row and hand its fields to ``save_tour_info``.

    Args:
        row: Mapping-like row providing ``institution-id``, ``location-id``
            and ``institution-name``.
    """
    inst_id = row["institution-id"]
    loc_id = row["location-id"]
    inst_name = row["institution-name"]
    save_tour_info(inst_id, loc_id, inst_name)

def grab_media_from_stop(stop):
    """Return the photos and the panoramas attached to a stop.

    Args:
        stop: Mapping with ``photos`` and ``panoramas`` entries.

    Returns:
        A ``(photos, panoramas)`` tuple holding the two entries unchanged.
    """
    photos = stop["photos"]
    panoramas = stop["panoramas"]
    return photos, panoramas

def get_important_media(institution_id, institution_name, tour_id, title, item, media_type):
    """Build the flat record describing one photo or panorama.

    Args:
        institution_id: Id of the institution the media belongs to.
        institution_name: Name of that institution.
        tour_id: Id of the tour the media appears on.
        title: Title of the stop the media is attached to.
        item: Mapping with the media's ``id``, ``title`` and ``description``.
        media_type: Label stored as the last element of the record.

    Returns:
        A list ``[institution_id, institution_name, tour_id, title, item id,
        item title, item description, media_type]``.
    """
    record = [institution_id, institution_name, tour_id, title]
    record.extend(item[field] for field in ("id", "title", "description"))
    record.append(media_type)
    return record

def gather_media_from_tour_info(institution_id, tour_id, institution_name):
    """Collect one row per photo/panorama on a tour and back up the raw response.

    The stops payload is requested once; the same parsed response feeds both
    the returned rows and the JSON backup written to
    ``./data/tour_media_info/<institution_name>-<institution_id>-<tour_id>.json``.

    Args:
        institution_id: Institution id used for the request and the file name.
        tour_id: Tour (location) id used for the request and the file name.
        institution_name: Name copied into every row and into the file name.

    Returns:
        A list of rows as built by ``get_important_media``; within each stop
        the photos come before the panoramas. The list is empty when the
        response's ``data`` field is falsy.
    """
    info = grab_tour_info(institution_id, tour_id)
    data = []
    # A falsy "data" value (e.g. None or an empty list) means no stops to walk.
    for stop in info["data"] or []:
        photos, panos = grab_media_from_stop(stop)
        title = stop["title"]
        for media_type, items in (("photo", photos), ("pano", panos)):
            data.extend(
                get_important_media(institution_id, institution_name, tour_id, title, item, media_type)
                for item in items
            )

    # Write the backup from the response already in hand rather than fetching
    # the same URL a second time, so the saved file matches the parsed rows.
    output_file_path = "{}/{}-{}-{}.json".format(
        "./data/tour_media_info", institution_name, institution_id, tour_id
    )
    save_json_response(output_file_path, info)
    return data

def gather_media_from_tour_info_helper(row):
    """Unpack a dataframe row and gather the media rows for that tour.

    Args:
        row: Mapping-like row providing ``institution-id``, ``location-id``
            and ``institution-name``.

    Returns:
        Whatever ``gather_media_from_tour_info`` returns for the row.
    """
    inst_id = row["institution-id"]
    loc_id = row["location-id"]
    inst_name = row["institution-name"]
    return gather_media_from_tour_info(inst_id, loc_id, inst_name)

In [None]:
# Gather the media rows of every location concurrently and queue them up.
results = queue.Queue()
with tqdm(range(len(location_dataframe))) as pbar, ThreadPoolExecutor(max_workers=16) as executor:
    media_batches = executor.map(
        gather_media_from_tour_info_helper,
        [row for _, row in location_dataframe.iterrows()],
    )
    for media_rows in media_batches:
        # A tour without media yields an empty list: nothing to queue.
        for media_row in media_rows:
            results.put(media_row)
        pbar.update(1)

In [None]:
# Drain the queue into a plain list, then persist the rows as a CSV.
combined_results = []
while True:
    try:
        combined_results.append(results.get_nowait())
    except queue.Empty:
        break

# Column order mirrors the order of the values in each queued media row.
media_columns = [
    "institution_id",
    "institution_name",
    "tour_id",
    "stop_title",
    "media_id",
    "media_title",
    "media_description",
    "media_type",
]
media_dataframe = pd.DataFrame(combined_results, columns=media_columns)

media_dataframe.to_csv("./data/media_info.csv", index=False)

### Downloading All Unique Content

Downloads all the photos and panoramas served on the website for each tour.

In [None]:
# Many media items appear more than once, so keep only the first row for each
# media_id. The dropped rows may carry a different description or other
# fields; that is fine here, since they can be cross-referenced later.
unique_media = media_dataframe.drop_duplicates(subset="media_id")

In [None]:
# Media URL layout: https://www.youvisit.com/media/LOC_ID/MEDIA_TYPE/ID/SIZE
media_url = "https://www.youvisit.com/media/{}/{}/{}/{}.jpg"
output_dir = "./data/all_media"
size = "2048"
# Maps the media_type labels stored in the dataframe to the URL path segment.
media_types = dict(photo="photos", pano="panoramas")

In [None]:
def save_media(output_path, media):
    """Write raw media bytes to the file at ``output_path``.

    Args:
        output_path: Destination file path; an existing file is overwritten.
        media: Bytes to write.
    """
    with open(output_path, mode='wb') as image_file:
        image_file.write(media)
        
def grab_media(location_id, media_type, media_id, size):
    """Download one media file and return its raw bytes.

    Args:
        location_id: Id substituted into the first URL placeholder.
        media_type: URL path segment for the kind of media.
        media_id: Id of the media item.
        size: Size component of the URL.

    Returns:
        The response body as bytes.
    """
    url = media_url.format(location_id, media_type, media_id, size)
    # Send a randomly chosen user agent, like every other request in this
    # notebook does, instead of the library's default one.
    user_agent = random.choice(user_agents)
    # NOTE(review): the HTTP status is not checked, so the body of an error
    # response would be returned as-is.
    media = requests.get(url, headers={'User-Agent': user_agent}).content
    return media

def grab_and_save_media(row):
    """Download the media item described by a dataframe row and store it.

    The file is written to ``<output_dir>/<media_id>.jpg``.

    Args:
        row: Mapping-like row providing ``tour_id``, ``media_type`` and
            ``media_id``.
    """
    # The row's tour_id is used as the location id in the media URL.
    content = grab_media(row["tour_id"], media_types[row["media_type"]], row["media_id"], size)
    destination = "{}/{}.jpg".format(output_dir, row["media_id"])
    save_media(destination, content)

In [None]:
# Download and store every unique media item concurrently.
with tqdm(range(len(unique_media))) as pbar, ThreadPoolExecutor(max_workers=16) as executor:
    downloads = executor.map(
        grab_and_save_media,
        [row for _, row in unique_media.iterrows()],
    )
    # The worker returns nothing; iterating only drives the progress bar.
    for finished in downloads:
        pbar.update(1)