# Salary.com Scraper

In [1]:
import re
import csv
import json
from time import sleep
from bs4 import BeautifulSoup
import requests

In [3]:
template = 'https://www.salary.com/research/salary/benchmark/{}-salary/{}'

In [4]:
# build the url based on search criteria
position = 'business-data-analyst-ii'
location = 'austin-tx'

url = template.format(position, location)

# request the raw html
response = requests.get(url)
response

<Response [200]>

In [5]:
soup = BeautifulSoup(response.text, 'html.parser')

### Inspect the data extraction options, look for html or json formatted data, then extract the script

In [7]:
pattern = re.compile(r'Occupation')
script = soup.find('script', {"type":"application/ld+json"}, text=pattern)
script

<script type="application/ld+json">
 {
    "@context": "http://schema.org",
    "@type": "Occupation",
    "name": "Business Data Analyst II",
    "mainEntityOfPage": {
        "@type": "WebPage",
        "lastReviewed": "2021-02-26T00:00:00Z"
    },
    "description": "Business Data Analyst II performs business analysis using various techniques, e.g. statistical analysis, explanatory and predictive modeling, data mining. Determines best practices and develops actionable insights and recommendations for the current business operations. Being a Business Data Analyst II works directly with the internal or external client to identify analytical requirements. May help to produce ad hoc data and reports. Additionally, Business Data Analyst II may assist in implementing or developing systems to capture business operation information. May occasionally guide less experienced business data analysts. Requires a bachelor&#39;s degree. Typically reports to a supervisor or manager. The Business Dat

### Identify the relevant data and extract the json data

In [11]:
json_raw = script.contents[0]

### Convert the json data

In [12]:
json_data = json.loads(json_raw)

In [13]:
json_data

{'@context': 'http://schema.org',
 '@type': 'Occupation',
 'name': 'Business Data Analyst II',
 'mainEntityOfPage': {'@type': 'WebPage',
  'lastReviewed': '2021-02-26T00:00:00Z'},
 'description': 'Business Data Analyst II performs business analysis using various techniques, e.g. statistical analysis, explanatory and predictive modeling, data mining. Determines best practices and develops actionable insights and recommendations for the current business operations. Being a Business Data Analyst II works directly with the internal or external client to identify analytical requirements. May help to produce ad hoc data and reports. Additionally, Business Data Analyst II may assist in implementing or developing systems to capture business operation information. May occasionally guide less experienced business data analysts. Requires a bachelor&#39;s degree. Typically reports to a supervisor or manager. The Business Data Analyst II gains exposure to some of the complex tasks within the job fu

### Extracting the data
If you look closely, you'll notice that there are two sets of compensation statistics, one is the base salary, and the other is total compensation. The total compensation will include bonuses and other benefits. For this example, I'm just going to grab the base salary.

In [14]:
job_title = json_data['name']
location = json_data['occupationLocation'][0]['name']
description = json_data['description']

ntile_10 = json_data['estimatedSalary'][0]['percentile10']
ntile_25 = json_data['estimatedSalary'][0]['percentile25']
ntile_50 = json_data['estimatedSalary'][0]['median']
ntile_75 = json_data['estimatedSalary'][0]['percentile75']
ntile_90 = json_data['estimatedSalary'][0]['percentile90']

salary_data = (job_title, location, description, ntile_10, ntile_25, ntile_50, ntile_75, ntile_90)
print(salary_data)

('Business Data Analyst II', 'Austin, TX', 'Business Data Analyst II performs business analysis using various techniques, e.g. statistical analysis, explanatory and predictive modeling, data mining. Determines best practices and develops actionable insights and recommendations for the current business operations. Being a Business Data Analyst II works directly with the internal or external client to identify analytical requirements. May help to produce ad hoc data and reports. Additionally, Business Data Analyst II may assist in implementing or developing systems to capture business operation information. May occasionally guide less experienced business data analysts. Requires a bachelor&#39;s degree. Typically reports to a supervisor or manager. The Business Data Analyst II gains exposure to some of the complex tasks within the job function. Occasionally directed in several aspects of the work. To be a Business Data Analyst II typically requires 2 to 4 years of related experience.', '

### Expanding your data set

In [26]:
def extract_salary_info(job_position, job_location):
    """Extract and return salary information"""
    
    template = 'https://www.salary.com/research/salary/benchmark/{}-salary/{}'
 
    # build the url based on search criteria
    url = template.format(job_position, job_location)

    # request the raw html .. check for valid request
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return None
    except requests.exceptions.ConnectionError:
        return None
   
    # parse the html and extract json data
    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = re.compile(r'Occupation')
    script = soup.find('script', {'type': 'application/ld+json'}, text=pattern)
    json_raw = script.contents[0]
    json_data = json.loads(json_raw)

    # extract salary data
    job_title = json_data['name']
    location = json_data['occupationLocation'][0]['name']
    description = json_data['description']

    ntile_10 = json_data['estimatedSalary'][0]['percentile10']
    ntile_25 = json_data['estimatedSalary'][0]['percentile25']
    ntile_50 = json_data['estimatedSalary'][0]['median']
    ntile_75 = json_data['estimatedSalary'][0]['percentile75']
    ntile_90 = json_data['estimatedSalary'][0]['percentile90']

    data = (job_title, location, description, ntile_10, ntile_25, ntile_50, ntile_75, ntile_90)
    return data

let's import a list of cities. 

In [35]:
with open('largest_cities.csv', newline='',mode='r') as f:
    reader = csv.reader(f)
    # a reader typically returns each row as a list... so I need to flatten the list to make a single list
    cities = [city for row in reader for city in row]

In [38]:
print(cities[:25])

['New-York-NY', 'Los-Angeles-CA', 'Chicago-IL', 'Houston-TX', 'Phoenix-AZ', 'Philadelphia-PA', 'San-Antonio-TX', 'San-Diego-CA', 'Dallas-TX', 'San-Jose-CA', 'Austin-TX', 'Jacksonville-FL', 'Fort-Worth-TX', 'Columbus-OH', 'Charlotte-NC', 'San-Francisco-CA', 'Indianapolis-IN', 'Seattle-WA', 'Denver-CO', 'Washington-DC', 'Boston-MA', 'El-Paso-TX', 'Nashville-TN', 'Detroit-MI', 'Oklahoma-City-OK']


### Getting all city data

In [46]:
salary_data = []

for city in cities[:5]:
    result = extract_salary_info('business-data-analyst-ii', city)
    if result:
        salary_data.append(result)
        sleep(0.5)

### Save the data to csv

In [44]:
with open('salary-results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Title','Location', 'Description', 'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90'])
    writer.writerows(salary_data)

In [45]:
# print the first 5 records
for row in salary_data[:5]:
    print(row)

('Business Data Analyst II', 'New York, NY', 'Business Data Analyst II performs business analysis using various techniques, e.g. statistical analysis, explanatory and predictive modeling, data mining. Determines best practices and develops actionable insights and recommendations for the current business operations. Being a Business Data Analyst II works directly with the internal or external client to identify analytical requirements. May help to produce ad hoc data and reports. Additionally, Business Data Analyst II may assist in implementing or developing systems to capture business operation information. May occasionally guide less experienced business data analysts. Requires a bachelor&#39;s degree. Typically reports to a supervisor or manager. The Business Data Analyst II gains exposure to some of the complex tasks within the job function. Occasionally directed in several aspects of the work. To be a Business Data Analyst II typically requires 2 to 4 years of related experience.',

### Consolidate into main function

In [47]:
def main(job_position):
    """Extract salary data from top us cities"""
    
    # get the list of largest us cities
    with open('largest_cities.csv', newline='') as f:
        reader = csv.reader(f)
        # a reader typically returns each row as a list... so I need to flatten the list to make a single list
        cities = [city for row in reader for city in row]
        
    # extract salary data for each city
    salary_data = []
    for city in cities[:30]:
        result = extract_salary_info(job_position, city)
        if result:
            salary_data.append(result)
            sleep(0.5)
            
    # save data to csv file
    with open('salary-results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title','Location', 'Description', 'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90'])
        writer.writerows(salary_data)
        
    return salary_data

In [49]:
main('business-data-analyst-ii')

[('Business Data Analyst II',
  'New York, NY',
  'Business Data Analyst II performs business analysis using various techniques, e.g. statistical analysis, explanatory and predictive modeling, data mining. Determines best practices and develops actionable insights and recommendations for the current business operations. Being a Business Data Analyst II works directly with the internal or external client to identify analytical requirements. May help to produce ad hoc data and reports. Additionally, Business Data Analyst II may assist in implementing or developing systems to capture business operation information. May occasionally guide less experienced business data analysts. Requires a bachelor&#39;s degree. Typically reports to a supervisor or manager. The Business Data Analyst II gains exposure to some of the complex tasks within the job function. Occasionally directed in several aspects of the work. To be a Business Data Analyst II typically requires 2 to 4 years of related experien