In [1]:
# %% Imports

import pandas as pd
import bs4
from bs4 import BeautifulSoup as bs
import requests
import csv

In [2]:
# %% Setup version

print('pandas version: {}'.format(pd.__version__))
print('bs4 version: {}'.format(bs4.__version__))
print('requests version: {}'.format(requests.__version__))
print('csv version: {}'.format(csv.__version__))

pandas version: 1.4.2
bs4 version: 4.11.1
requests version: 2.27.1
csv version: 1.0


In [3]:
def get_url(search_term:str, nb_pages:int):
    
    """
    The get_url function returns a list of urls for the searched phrase and the number of pages
    
    :search_term: The name or expression of the job you're looking for 
    :nb_pages: The number of pages you want to scrape (< maximum number of web pages shown)
    
    """
    
    links=[]
    search_term= search_term.replace(' ','%20')
    template= 'https://www.salary.com/tools/salary-calculator/search?keyword={}&location=&page={}&selectedjobcodes='
    for page in range(1, nb_pages):
        links.append(template.format(search_term, page))
        
    return links


In [4]:
def scrape_desc(template):
    
    link= template + '-job-description?isshowmore=more&statistics=0'
    page= requests.get(link)
    soup= bs(page.content, 'html.parser')
    p= soup.find('p', class_='sal-p')
    description= p.text.strip()
    
    return description


In [5]:

headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}

def scrape_salary(link):
    
    """
    
    Scrape_salary does scrape the salary percentiles of a particualar job
    
    :link: the URL of salary's web page 
    
    """

    page= requests.get(link, headers= headers)
    soup= bs(page.content, 'html.parser')
    div= soup.find('table', class_='table-chart')
    
    tds= div.find_all('td')
    percentile10= tds[5].text
    percentile25= tds[9].text
    percentile50= tds[13].text
    percentile75= tds[17].text
    percentile90= tds[21].text
    
    
    return [percentile10, percentile25, percentile50, percentile75, percentile90]


In [6]:
def scrape_all_links(links):
    
    """
    
    The scrape_all_links function loops over the urls and scrapes all jobs infos    
    
    :links: list of urls generated by calling the get_url function
    
    """
    records= []
    
    for link in links:
        page= requests.get(link)
        soup= bs(page.content, 'html.parser')
        divs= soup.find_all('div', class_='sal-popluar-skills margin-top30')
        
        for div in divs:
            title = div.find('a', class_='a-color font-semibold margin-right10').text
            link= div.a['href']
            start= link.find('https://www.salary.com/tools/salary-calculator/')
            end= link.find("'", start)
            link= link[start:end]
            
            salaries= scrape_salary(link)
            description= scrape_desc(link)
            record= [title, description, link] + salaries
            records.append(record)
            
    with open("data/salary_data.csv", 'w', newline='', encoding='utf-8') as f:
        writer= csv.writer(f)
        writer.writerow(['Title', 'Description', 'link', 'Percentile10', 'Percentile25', 'Percentile50', 'Percentile75', 'Percentile90'])
        writer.writerows(records)
            
    return records


In [7]:
# To scrape the salaries of project managers

links= get_url('project manager', 2)
records= scrape_all_links(links)

In [8]:
# Let's check the results

df= pd.read_csv('data/salary_data.csv')
df.head()

Unnamed: 0,Title,Description,link,Percentile10,Percentile25,Percentile50,Percentile75,Percentile90
0,Project Manager - Construction,Project Manager - Construction oversees and di...,https://www.salary.com/tools/salary-calculator...,"$84,577","$97,670","$112,050","$126,274","$139,224"
1,Project Accounting Manager,Project Accounting Manager manages a team of p...,https://www.salary.com/tools/salary-calculator...,"$86,153","$104,751","$125,179","$148,657","$170,032"
2,IT Project Manager IV,IT Project Manager IV manages and oversees all...,https://www.salary.com/tools/salary-calculator...,"$110,759","$121,122","$132,504","$143,264","$153,060"
3,Project Controls Manager,Project Controls Manager manages and oversees ...,https://www.salary.com/tools/salary-calculator...,"$108,483","$127,458","$148,301","$163,739","$177,795"
4,Project Manager Sr. - Construction,Project Manager Sr. - Construction is responsi...,https://www.salary.com/tools/salary-calculator...,"$118,505","$136,352","$155,955","$174,966","$192,274"


In [10]:
df.shape

(10, 8)