In [1]:
"""
For this webscraper, the job description and requirements are stored together in a single column because they
could not be easily separated in the site's HTML.
To separate them in the future, one strategy is to build a list of the bolded headings
(the <strong> elements in the body) and locate each heading within the full description-and-requirements text
(description_data.getText()). The text could then be split into sections at each heading, and the sections
concatenated according to whether their heading appears in a word bank such as ["Background", "BACKGROUND", "Purpose"].
"""

import requests
from bs4 import BeautifulSoup as bs
import re
import html
 
import pandas as pd

# Listing page that enumerates the open UNICEF job postings.
url = 'https://jobs.unicef.org/en-us/listing/'

# The hrefs on the listing page only hold the path part of each job URL, so
# they are appended to this root to form the full address.
root_url = 'https://jobs.unicef.org'

# A browser-like User-Agent is sent with every request: without it the site
# did not return any usable soup. Some of the other ("medium") webscrapers may
# need the same workaround.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}

# Seconds to wait for a single HTTP response; without a timeout `requests`
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Keyword banks used to classify a posting. They are defined once, outside the
# page loop, because they never change between pages.
# NOTE(review): matching below is a case-sensitive substring test, so e.g.
# 'bcg' will not match "BCG" and 'SIA' matches inside "ASIA" — confirm whether
# case-insensitive / whole-word matching is what is actually wanted.
immunization = ['Immunization', 'immunisation', 'vaccine', 'vaccines','vaccine-preventable diseases', 'vpd outbreak',
    'immunization campaign', 'SIA','supplemental immunization activities', 'cold chain', 'GAVI','shigella', 'cholera',
    'bcg', 'dtp', 'dpt', 'measles', 'influenza', 'conjugate vaccine']

economics = ['Economics','expenditure tracking', 'financing',
    'value for vaccination' , 'costing', 'economic analysis','costs' , 'equity', 'cost effectiveness', 'cost-effectiveness',
    'cost benefit analysis', 'benefit-cost analysis','cost utility analysis','budget impact analysis' , 'budget' , 'budgeting' ,
    'GAVI','funding gap','fiscal']


def _fetch_soup(session, address):
    """Download ``address`` with ``session`` and return the parsed soup.

    Raises ``requests.RequestException`` (including ``HTTPError`` for 4xx/5xx
    responses) instead of silently parsing an error page.
    """
    response = session.get(address, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    return bs(response.content, 'html.parser')


def _text_or_blank(tag):
    """Return the text of ``tag``, or '' when the element was not found."""
    return tag.getText() if tag is not None else ''


# Output columns; one entry is appended to each list per matching job page,
# so the lists always stay the same length.
page_url = []
job_position = []
des_and_req = []
location = []
ImmEcs = []
organization = []

with requests.Session() as s:
    # Set the headers on the session so that *every* request carries them,
    # including the per-job requests further down.
    s.headers.update(headers)

    soup = _fetch_soup(s, url)

    # On UNICEF's site a button has to be clicked to load all of the job
    # postings; instead, read the advertised count and request that many
    # items in a single page.
    button_count = soup.find('span', {'class':'count'})
    if button_count is None:
        raise RuntimeError(
            "Could not find the job count (<span class='count'>) on " + url
            + "; the page layout may have changed or the request was blocked."
        )

    # +20 over-requests slightly (presumably a safety margin so that no
    # posting is cut off — TODO confirm).
    count = int(button_count.text.replace(',', '')) + 20
    new_url = url + "?page=1&page-items=" + str(count)

    # Soup of the listing with all job postings loaded.
    soup2 = _fetch_soup(s, new_url)

    # Collect the job posting hrefs, skipping anchors without an href and
    # dropping duplicates while keeping first-seen order.
    hrefs = (link.get('href') for link in soup2.find_all('a', {'class':'job-link'}))
    allPages = list(dict.fromkeys(href for href in hrefs if href))

    for page in allPages:
        job_url = root_url + page
        try:
            soup = _fetch_soup(s, job_url)
        except requests.RequestException as err:
            # One unreachable posting should not throw away the whole scrape.
            print('Skipping ' + job_url + ': ' + str(err))
            continue

        # Strip script/style content, then search the visible text for the
        # key words. The fragments are joined with spaces (rather than using
        # the list's repr) so phrases are matched against plain text.
        for script in soup(['script','style']):
            script.decompose()
        strips = ' '.join(soup.stripped_strings)

        imm_result = any(ele in strips for ele in immunization)
        ec_result = any(ele in strips for ele in economics)

        # Only keep pages that mention an Immunization or Economics key word.
        if not (imm_result or ec_result):
            continue

        if imm_result and ec_result:
            ImmEcs.append('Both')
        elif imm_result:
            ImmEcs.append('Immunization')
        else:
            ImmEcs.append('Economics')

        page_url.append(job_url)

        # Job position name: the <h2> inside the job-content container.
        job_data = soup.find('div', id = 'job-content')
        job_position.append(_text_or_blank(job_data.h2 if job_data is not None else None))

        # Description and requirements are kept together because they are
        # not easily separated in the page's HTML.
        description_data = soup.find('div', id = 'job-details')
        des_and_req.append(_text_or_blank(description_data))

        location_data = soup.find('span', {'class': 'location'})
        location.append(_text_or_blank(location_data))

        organization.append('UNICEF')

# Assemble the scraped columns into a single table. The dict's insertion
# order fixes the column order of the resulting frame (and of the CSV).
DataFrame = pd.DataFrame({
    'Page Url': page_url,
    'Job': job_position,
    'Location': location,
    'Type': ImmEcs,
    'Description': des_and_req,
    'Organization': organization,
})

# Remove rows that are exact duplicates, then persist the result to disk.
Data = DataFrame.drop_duplicates()
Data.to_csv("UNICEF_Data.csv")

print('Webscraping complete')

Webscraping complete
