# In [None]:
import requests
from bs4 import BeautifulSoup as bs 
import re

import pandas as pd

# Scrape the JSI careers site: collect every job-posting link from the index
# page, visit each posting, and record the ones that mention an immunization
# or economics key word. The results are accumulated in the parallel lists
# below (one entry per kept posting, all lists always the same length).
url = 'https://careers.jsi.com/JSIInternet/Careers/jobpostings.cfm'

# need the root url because href only has the second part of the url
root_url = 'https://careers.jsi.com'

# seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the scrape forever
request_timeout = 30

# Key words used to classify a posting. Matching is a case-sensitive
# substring test against the visible page text.
# NOTE(review): entries such as 'bcg' / 'dtp' will not match upper-case
# abbreviations, and 'SIA' also matches inside e.g. 'ASIA' - confirm whether
# case-insensitive, whole-word matching is wanted.
immunization = [
    'Immunization', 'immunisation', 'vaccine', 'vaccines',
    'vaccine-preventable diseases', 'vpd outbreak', 'immunization campaign',
    'SIA', 'supplemental immunization activities', 'cold chain', 'GAVI',
    'shigella', 'cholera', 'bcg', 'dtp', 'dpt', 'measles', 'influenza',
    'conjugate vaccine',
]

economics = [
    'Economics', 'expenditure tracking', 'financing', 'value for vaccination',
    'costing', 'economic analysis', 'costs', 'equity', 'cost effectiveness',
    'cost-effectiveness', 'cost benefit analysis', 'benefit-cost analysis',
    'cost utility analysis', 'budget impact analysis', 'budget', 'budgeting',
    'GAVI', 'funding gap', 'fiscal',
]

# output columns; every kept posting appends exactly one value to each list
page_url = []
job_position = []
description = []
location = []
requirements = []
ImmEcs = []
organization = []

# requesting a session in the specific url
with requests.Session() as s:
    r = s.get(url, timeout=request_timeout)
    # fail loudly if the index page itself cannot be fetched, instead of
    # silently producing an empty result
    r.raise_for_status()
    # name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed
    soup = bs(r.content, 'html.parser')

    # navigating the page and finding the job posting urls
    allPages = []
    for item in soup.find_all('li', class_='opensquare opensquare-link-list-item'):
        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        # only keep real links that aren't already in allPages (compare the
        # href string, not the tag object, so duplicates are actually caught)
        if href and href not in allPages:
            allPages.append(href)

    # looping through each page in allPages
    for page in allPages:
        page_link = root_url + page
        try:
            result = s.get(page_link, timeout=request_timeout)
        except requests.RequestException as exc:
            # one unreachable posting should not abort the whole scrape
            print('Skipping', page_link, '-', exc)
            continue
        if not result.ok:
            print('Skipping', page_link, '- HTTP status', result.status_code)
            continue

        soup = bs(result.content, 'html.parser')

        # drop script/style content so only visible text is searched
        for script in soup(['script', 'style']):
            script.decompose()
        page_text = ' '.join(soup.stripped_strings)

        # checking for Immunization and Economic key words
        imm_result = any(ele in page_text for ele in immunization)
        ec_result = any(ele in page_text for ele in economics)

        # only keep the page if it has an Immunization or Economics key word
        if not (imm_result or ec_result):
            continue

        if imm_result and ec_result:
            category = 'Both'
        elif imm_result:
            category = 'Immunization'
        else:
            category = 'Economics'

        # job position name: the <h3> inside the job description container
        container = soup.find('div', id='job-description-container')
        h3 = container.find('h3') if container is not None else None
        job_title = h3.getText() if h3 is not None else ''

        # description and requirements live in divs carrying this exact
        # inline style; let BeautifulSoup extract their text rather than
        # slicing the HTML string and stripping tags with regexes
        des_res_data = soup.find_all(
            'div',
            attrs={'style': 'margin-top:5px;margin-bottom:15px;text-align:left;'},
        )
        sections = [div.get_text(' ', strip=True) for div in des_res_data]

        # the first styled div is the description
        description_text = sections[0] if sections else ''

        # requirements start at the first later div that opens the numbered
        # list ("1."); if no div is numbered, fall back to everything after
        # the description. The leading "1." is kept in the output.
        remaining = sections[1:]
        for index, section in enumerate(remaining):
            if section.startswith('1.'):
                remaining = remaining[index:]
                break
        requirements_text = '\n'.join(remaining)

        # location: the paragraph whose <strong> label reads "Location:";
        # keep only the text that follows the label
        location_text = ''
        if container is not None:
            for paragraph in container.find_all('p'):
                has_label = any(
                    strong.get_text(strip=True).startswith('Location:')
                    for strong in paragraph.find_all('strong')
                )
                if has_label:
                    paragraph_text = paragraph.get_text(' ', strip=True)
                    location_text = paragraph_text.partition('Location:')[2].strip()
                    break

        # append everything together so the columns always stay aligned
        ImmEcs.append(category)
        page_url.append(page_link)
        job_position.append(job_title)
        description.append(description_text)
        requirements.append(requirements_text)
        location.append(location_text)
        # appending JSI to organization
        organization.append('JSI')

# Assemble the scraped lists into a table, one column at a time in the
# order they should appear in the output file.
DataFrame = pd.DataFrame()
for column_name, column_values in (
    ('Page Url', page_url),
    ('Job', job_position),
    ('Location', location),
    ('ImmEcs', ImmEcs),
    ('Description', description),
    ('Requirements', requirements),
    ('Organization', organization),
):
    DataFrame[column_name] = column_values

# Remove rows that are exact duplicates before exporting.
Data = DataFrame.drop_duplicates()

# Write the result next to the script and report completion.
Data.to_csv("JSI_Data.csv")

print('Webscraping complete')