# In [None]:
import requests
from bs4 import BeautifulSoup as bs 
import re

import pandas as pd

url = 'https://jobs.hopkinsmedicine.org/jobs'

# The hrefs on the listing pages only hold the path part of the url, so each
# job link has to be prefixed with the site root before it can be requested.
root_url = 'https://jobs.hopkinsmedicine.org'

# Key words that mark a posting as relevant. They are matched as
# case-sensitive substrings of the page text, so e.g. 'vaccine' also matches
# 'vaccines' and 'SIA' matches any text containing those three capitals.
immunization = ['Immunization', 'immunisation', 'vaccine', 'vaccines', 'vaccine-preventable diseases', 'vpd outbreak',
    'immunization campaign', 'SIA', 'supplemental immunization activities', 'cold chain', 'GAVI', 'shigella', 'cholera',
    'bcg', 'dtp', 'dpt', 'measles', 'influenza', 'conjugate vaccine']

economics = ['Economics', 'expenditure tracking', 'financing',
    'value for vaccination', 'costing', 'economic analysis', 'costs', 'equity', 'cost effectiveness', 'cost-effectiveness',
    'cost benefit analysis', 'benefit-cost analysis', 'cost utility analysis', 'budget impact analysis', 'budget', 'budgeting',
    'GAVI', 'funding gap', 'fiscal']


def _extract_location(page_text):
    """Return the text between 'Location: ' and the next 'Category: '.

    Newlines inside the extracted span are removed. An empty string is
    returned when either marker is missing, because without both markers
    there is no reliable way to tell where the location starts or ends.
    """
    start = page_text.find('Location: ')
    if start == -1:
        return ''
    start += len('Location: ')
    # Search only after the location marker so an earlier 'Category: '
    # elsewhere on the page cannot produce an inverted (empty) slice.
    end = page_text.find('Category: ', start)
    if end == -1:
        return ''
    return page_text[start:end].replace('\n', '')


# Requesting a session for the specific url; the same session is reused for
# every request below.
with requests.Session() as s:
    # NOTE: no parser is named, so BeautifulSoup picks whichever one is
    # installed; kept that way so parsing results do not change.
    r = s.get(url)
    soup = bs(r.content)

    # Making sure we go through every PAGE of job postings - not just the
    # first page. The last pagination link carries the number of pages; a
    # listing without pagination links is treated as a single page.
    pagination = soup.find_all('a', class_='page')
    last = int(pagination[-1].getText()) if pagination else 1

    # Collect the href of every job posting, in listing order, once each.
    allPages = []
    seen_hrefs = set()
    for job_page in range(1, last + 1):
        r = s.get(url + '?page_jobs=' + str(job_page))
        soup = bs(r.content)

        for job in soup.find_all('div', class_='job'):
            anchor = job.find('a')
            href = anchor.get('href') if anchor is not None else None
            # Skip cards without a usable link and links already collected
            # (compare the href string itself, not the Tag object).
            if not href or href in seen_hrefs:
                continue
            seen_hrefs.add(href)
            allPages.append(href)

    # One entry per matching posting; the lists stay index-aligned.
    page_url = []
    job_position = []
    des_and_req = []
    location = []
    ImmEcs = []
    organization = []

    # Number of postings skipped because their detail section was missing or
    # empty. It is only incremented here, never read in this part of the file.
    count = 0

    # Looping through each posting in allPages
    for page in allPages:
        result = s.get(root_url + page)
        soup = bs(result.content)

        # Drop script/style content, then flatten the remaining visible text
        # into one string for the key word search.
        for script in soup(['script', 'style']):
            script.decompose()
        strips = str(list(soup.stripped_strings))

        # Checking for Immunization and Economics key words
        imm_result = any(ele in strips for ele in immunization)
        ec_result = any(ele in strips for ele in economics)

        # Making sure the job posting is not empty; a posting without the
        # details container or without an inner div is counted and skipped.
        job_data = soup.find('div', {'class': 'job-details-content'})
        content = job_data.find('div') if job_data is not None else None
        if content is None:
            count += 1
            continue

        # Only keep the page if it has an Immunization or Economics key word
        if not (imm_result or ec_result):
            continue

        if imm_result and ec_result:
            category = 'Both'
        elif imm_result:
            category = 'Immunization'
        else:
            category = 'Economics'

        # Job position name (empty when the posting has no <h1>)
        h1 = job_data.find('h1')
        title = h1.getText() if h1 is not None else ''

        # Description and requirements go together in one column because
        # they cannot be separated; single-space text nodes between the
        # child elements are skipped.
        text = ''.join(child.getText() + '\n' for child in content if child != ' ')

        # The location is read from the text of the page body
        body = soup.find('body')
        page_text = (body if body is not None else soup).getText()
        loc = _extract_location(page_text)

        # Append all columns together so the lists cannot get out of step.
        ImmEcs.append(category)
        page_url.append(root_url + page)
        job_position.append(title)
        des_and_req.append(text)
        location.append(loc)
        # Every posting scraped here belongs to the same organization
        organization.append('Johns Hopkins')

# Assemble the scraped columns into a single table; the dict order fixes the
# column order of the exported CSV.
DataFrame = pd.DataFrame({
    'Page Url': page_url,
    'Job': job_position,
    'Description and Requirements': des_and_req,
    'Location': location,
    'ImmEcs': ImmEcs,
    'Organization': organization,
})

# Remove rows that are exact duplicates, then export the result.
Data = DataFrame.drop_duplicates()
Data.to_csv("JohnsHopkins_Data.csv")

print('Webscraping complete')