# Inside Higher Ed Job Scraper

Collects all tenured and tenure-track job advertisements for four-year institutions in North America.

- **[Query](https://careers.insidehighered.com/jobs/tenured-and-tenure-track/four-year-institution/north-america/)**


Every time you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common web-scraping libraries
from bs4 import BeautifulSoup as bs
import requests

In [31]:
import re
import time

def parse_list_page_item(list_item):
    """
        Pulls the summary fields of a single posting out of its listing HTML element

        Returns [job id, job title, absolute job URL, diversity job?]

    """
    anchor = list_item.find("h3").find("a")
    title = anchor.text
    href = anchor['href'].strip()
    # hrefs look like /job/<id>/<slug>/, so the id is the third "/"-separated piece
    posting_id = href.split("/")[2]
    # A <p class="ribbon"> inside the item is what flags it as a diversity job
    ribbon = list_item.find("p",attrs={"class":"ribbon"})
    return [
        posting_id,
        title,
        f"https://careers.insidehighered.com{href}",
        ribbon is not None,
    ]


def parse_list_page(url):
    """
        Returns the basic info from one page of the jobs listing as a DataFrame

        || job id || job title || url || diversity job?

        url: address of a single listing page

        Sleeps one second before every request. Raises requests.HTTPError when
        the server answers with an error status. Returns an empty frame (same
        four columns) when the page has no element with the id "listing".

    """
    columns = ["Job ID","Job Title","Job URL","Diversity Job"]
    time.sleep(1)
    # Without a timeout a single stalled connection would block the scrape forever
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'},timeout=30)
    # Fail loudly on an error response instead of trying to parse an error page
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional parsers are installed
    soup = bs(r.text,"html.parser")
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = soup.find("ul",attrs={"id":'listing'})
    if listing_page is None:
        return pd.DataFrame(columns=columns)
    # Parse out the ads
    list_items = listing_page.find_all("li",attrs={"id": re.compile(r"item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=columns)


In [38]:
# Every listing page shares this address; only the trailing page number changes
url = "https://careers.insidehighered.com/jobs/tenured-and-tenure-track/four-year-institution/north-america/{}"
# TODO: replace the fixed page range with a function that scrapes all the details pages until
# it either reaches one we've seen prior OR hits the end (past the end the url just returns the final page)
frames = [parse_list_page(page_url) for page_url in map(url.format, range(1,57))]
listing_df = pd.concat(frames)

In [102]:

def parse_details_page(url):
    """
        Parses the details page of a job posting

        url: address of the posting's details page

        Returns the element whose class is "rss-diversity-description", or None
        when the page has no such element. Sleeps one second before the request
        and raises requests.HTTPError when the server answers with an error status.

        Description and employer extraction live in _extract_description and
        _extract_employer; they are not part of the return value yet, since
        adding them would change what existing callers receive.

    """
    time.sleep(1)
    # Without a timeout a single stalled connection would block the scrape forever
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'},timeout=30)
    # Fail loudly on an error response instead of trying to parse an error page
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional parsers are installed
    details_page = bs(r.text,"html.parser")
    return details_page.find(attrs={"class":"rss-diversity-description"})


def _extract_description(details_page):
    """
        Returns the job description text of a parsed details page, or None if the page has none

    """
    description_html = details_page.find("div",attrs={"class":re.compile("[a-zA-Z0-9]*job-description*")})
    if description_html is None:
        return None
    return description_html.get_text()


def _extract_employer(details_page):
    """
        Returns the employer name of a parsed details page, or None if it cannot be found

    """
    details_block = details_page.find("dl",attrs={"class":"grid"})
    if details_block is not None:
        # Pages with the <dl class="grid"> block keep the employer in the recruiter entry's <dd>
        employer_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__recruiter")})
        employer_dd = None if employer_html is None else employer_html.find("dd")
        return None if employer_dd is None else employer_dd.text.strip()
    # Pages without that block: fall back to the university information container
    employer_html = details_page.find(attrs={"class":"main-container-university-information"})
    if employer_html is None:
        return None
    return employer_html.text.strip()
    
# Smoke test: run the details parser on the posting at position 40 of the listing frame
print(parse_details_page(listing_df.iloc[40]["Job URL"]))

<div class="rss-diversity-description">
<div class="rss-diversity-description-container"> </div>
</div>


In [101]:
# Show the URL of the posting at position 40, the same one passed to parse_details_page above
listing_df.iloc[40]["Job URL"]

'https://careers.insidehighered.com/job/2162767/assistant-professor-in-mechanical-engineering/'