In [68]:
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from random import random
import time
import re
# current version of seaborn generates a bunch of warnings that we'll ignore
import warnings
warnings.filterwarnings("ignore")

In [69]:
# Use this to test for captcha block or IP ban
def get_URL(position,location):
    """Build the Indeed search URL for a position/location query.

    The URL asks for postings from the last day (``fromage=1``) sorted by
    date. It is also used for a dummy call to verify the site isn't
    returning a captcha.

    Args:
        position (str): job for query
        location (str): location for query

    Returns:
        str: formatted url
    """
    # Local import so this cell stays runnable without touching the imports cell.
    from urllib.parse import quote, quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=1&sort=date'

    # Percent-encode the values instead of only swapping out spaces, so that
    # characters such as '+', '&' or '#' (e.g. "c++ developer") cannot corrupt
    # the query string. Spaces still become '%20' in the position and '+' in
    # the location, exactly as before.
    position = quote(position, safe='')
    location = quote_plus(location)
    return template.format(position, location)


# Optional Tor routing, left disabled:
# from torrequest import TorRequest
# tr=TorRequest(password='your_super_secure_password')
position = 'data scientist'
location = 'california'
# tr.reset_identity()
# The timeout keeps this cell from hanging forever if the site never answers.
response = requests.get(get_URL(position,location), timeout=30)
# This will return either the HTML of a captcha page or of a search result;
# the status code alone (shown below) does not tell the two apart.
response

<Response [200]>

In [70]:

# Parse the HTML of the dummy response from the previous cell.
soup = BeautifulSoup(response.text, 'html.parser')
# Uncomment to display the parsed page and check by eye whether it is a captcha.
#soup


In [71]:
def get_desc_features(job_url):
    """Fetch one job posting and extract the detail-page features from it.

    Every feature is looked up independently; when the markup for a feature
    is not found on the page, that feature is returned as ``None`` instead of
    raising.

    Args:
        job_url (string): http address of each job posting

    Returns:
        tuple: ``(salary_and_job_type, salary_guidance, job_type_items,
        requirements, description, raw_desc_soup)`` where

            * salary_and_job_type (str or None): stripped text of the
              ``salaryInfoAndJobType`` div.
            * salary_guidance (list of str or None): text of each child of
              the ``ul`` with class ``css-1lyr5hv eu4oa1w0``.
            * job_type_items (list of str or None): text of each child of the
              node following the first
              ``jobsearch-JobDescriptionSection-sectionItem`` div, without
              the ``'Job Type'`` heading.
            * requirements (str or None): text of the requirements title
              element, newlines removed.
            * description (str or None): text of ``jobDescriptionText``,
              newlines removed.
            * raw_desc_soup (BeautifulSoup): the whole parsed page.

    Raises:
        requests.exceptions.RequestException: if the posting cannot be
            fetched (including when the timeout expires).
    """
    # What a lookup raises when the markup is absent: AttributeError from
    # ``None.text`` / a missing attribute, TypeError from iterating ``None``.
    # Catching only these keeps Ctrl-C and genuine bugs visible.
    missing_markup = (AttributeError, TypeError)

    # The timeout stops one unresponsive posting from stalling the whole scrape.
    response_job_desc = requests.get(job_url, timeout=30)
    soup = BeautifulSoup(response_job_desc.text, 'html.parser')

    try:
        salary_and_job_type = soup.find('div', id='salaryInfoAndJobType').text.strip()
    except missing_markup:
        salary_and_job_type = None
    if salary_and_job_type is None:
        try:
            # NOTE(review): this passes a class-style string as ``id``, so it
            # presumably never matches; it also targets the same element used
            # for ``requirements`` below. Left as-is pending confirmation of
            # the intended fallback.
            salary_and_job_type = soup.find('div',id="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()
        except missing_markup:
            salary_and_job_type = None
    #TODO get benefits from its designated section

    try:
        salary_guidance = [item.text for item in soup.find('ul',class_='css-1lyr5hv eu4oa1w0')]
    except missing_markup:
        salary_guidance = None

    try:
        job_type_from_section = soup.find('div',class_='jobsearch-JobDescriptionSection-sectionItem').next_sibling.children
        # Drop the section heading and keep only the listed values.
        job_type_items = [item.text for item in job_type_from_section if item.text != 'Job Type']
    except missing_markup:
        job_type_items = None

    try:
        requirements = soup.find(class_="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()
    except missing_markup:
        requirements = None

    try:
        description = soup.find(id="jobDescriptionText").text.replace('\n', '')
    except missing_markup:
        description = None

    # A nifty little workaround for evading detection: pause for a random
    # 0.5 to 3.5 seconds after every posting request.
    time.sleep(.5+random()*3)
    #TODO assess h2 tags commonalities to determine if these section descriptions are from Indeed or are at least of only a few variations.
        #you could then distinguish the description into sections and conduct NLP etc each.
    raw_desc_soup = soup
    return salary_and_job_type, salary_guidance, job_type_items, requirements, description, raw_desc_soup

In [72]:
#TODO condense these with lists, particularly fields that have .text.strip()
def get_features(post):
    """Extract the features of one search-result card and of its posting page.

    Basic values are read from the card itself; the rest come from
    ``get_desc_features``, which fetches the posting the card links to.

    Args:
        post (bs4 element): one job card from the search results page, as
            produced by ``find_all`` in ``main``.

    Returns:
        dict: flat dictionary of features (dictionary keys) and their values
        (dictionary values).

    Raises:
        AttributeError: if the card lacks a title, company, location, date,
            snippet or job link element. Only the rating is optional.
    """
    title = post.find('h2',
              attrs={'class': lambda e: e.startswith('jobTitle') if e else False}).text
    # Remove only a leading "new" badge. Replacing every occurrence also ate
    # the letters inside real words (e.g. "Renewable" became "Reable").
    # NOTE(review): assumes the badge text precedes the title text; confirm
    # against the current card markup.
    if title.startswith('new'):
        title = title[len('new'):]

    company = post.find('span', 'companyName').text.strip()

    # Not every card carries a rating, so a missing element maps to None.
    rating_tag = post.find('span', 'ratingNumber')
    rating = rating_tag.text if rating_tag is not None else None

    location = post.find('div', 'companyLocation').text.strip()
    postDate = post.find('span', 'date').text
    extractDate = datetime.today().strftime('%Y-%m-%d')
    summary = post.find('div', 'job-snippet').text.strip().replace('\n', ' ')
    # First link whose href contains "/<letter><word char>..."; the href is
    # appended to the site root to form the posting URL.
    url = 'https://www.indeed.com'+ post.find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']

    salary, estimated_salary, job_type_items, requirements, description, raw_desc_soup = get_desc_features(url)
    return {
        'title':title,
        'company':company,
        'rating':rating,
        'location':location,
        'estimated_salary':estimated_salary,
        'postDate':postDate,
        'extractDate':extractDate,
        'summary':summary,
        'url':url,
        'salary':salary,
        'job_type_items':job_type_items,
        'requirements':requirements,
        'description':description,
        'raw_desc_soup':raw_desc_soup}

In [84]:
def main(position, location):
    """Scrape one page of Indeed search results and save it as a CSV file.

    The rows are written to ``../app/data/scraped_{position}_{location}.csv``
    (spaces replaced by underscores), overwriting any previous file. The file
    is written even when nothing could be scraped.

    Args:
        position (string): job position for indeed.com query
        location (string): job location for indeed.com query

    Returns:
        string: summary message with the number of records scraped.
    """
    records = []

    # extract the job data
    try:
        response = requests.get(get_URL(position, location), timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        searchResults = soup.find('div', id='mosaic-provider-jobcards')
        if searchResults is None:
            # The results container is not in the returned HTML, so there is
            # nothing to parse; the summary below will report 0 records.
            raw_posts = []
        else:
            raw_posts = searchResults.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})

        for n, post in enumerate(raw_posts, start=1):
            records.append(get_features(post))
            if n % 15 == 0:
                print('New Page')
        # TODO: pagination is not implemented; only the first results page is
        # scraped (the old "Next" link lookup was never followed).
    except Exception as exc:
        # Best effort: keep whatever was collected, but say what went wrong
        # instead of silently returning fewer records.
        print(f'Scrape of {position!r} in {location!r} stopped early: {exc!r}')

    # Build the frame once from the collected rows. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    # Columns follow the key order of the feature dictionaries.
    data = pd.DataFrame(records)

    name = position.replace(' ','_')
    loc = location.replace(' ','_')
    # save the job data
    data.to_csv(f'../app/data/scraped_{name}_{loc}.csv', index=False)

    return f'Scraped {len(data)} new records.'

In [85]:
#state_names = [ "alabama", "arkansas",  "arizona",  "colorado", "connecticut", "delaware",  "georgia", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada",  "ohio", "oklahoma", "pennsylvania", "rhode island", "south carolina", "south dakota", "tennessee", "utah", "virginia",  "vermont",  "wisconsin", "west virginia", "wyoming"]

# Active subset of the commented-out list above.
state_names = [
    "louisiana", "maryland", "maine", "michigan", "minnesota", "missouri",
    "mississippi", "montana", "north carolina", "north dakota", "nebraska",
    "new hampshire", "new jersey", "new mexico", "nevada", "ohio",
    "oklahoma", "pennsylvania", "rhode island", "south carolina",
    "south dakota", "tennessee", "utah", "virginia", "vermont", "wisconsin",
    "west virginia", "wyoming",
]

# Scrape each state in turn, printing its name as a progress marker.
for state in state_names:
    position, location = 'data scientist', state
    print(state)
    # main() returns its summary message, which is what ends up in `data`.
    data = main(position, location)

louisiana
maryland
maine
michigan
minnesota
missouri
mississippi
montana
north carolina
north dakota
nebraska
new hampshire
new jersey
new mexico
nevada
ohio
oklahoma
pennsylvania
rhode island
south carolina
south dakota
tennessee
utah
virginia
vermont
wisconsin
west virginia
wyoming


In [86]:
# Scrape California on its own; the cell displays main()'s summary message.
position, location = 'data scientist', 'california'
main(position, location)

'Scraped 3 new records.'

In [88]:
# Scrape remote postings on their own; the cell displays main()'s summary message.
position, location = 'data scientist', 'remote'
main(position, location)

New Page


'Scraped 15 new records.'

In [89]:
# Scrape New York on its own; the cell displays main()'s summary message.
position, location = 'data scientist', 'new york'
main(position, location)

New Page


'Scraped 15 new records.'

In [90]:
# Scrape Texas on its own; the cell displays main()'s summary message.
position, location = 'data scientist', 'texas'
main(position, location)

New Page


'Scraped 15 new records.'

In [91]:
# Scrape Washington on its own; the cell displays main()'s summary message.
position, location = 'data scientist', 'washington'
main(position, location)

'Scraped 9 new records.'

In [92]:
# Scrape Florida on its own; the cell displays main()'s summary message.
position, location = 'data scientist', 'florida'
main(position, location)

'Scraped 10 new records.'

In [93]:
# Scrape Massachusetts on its own; the cell displays main()'s summary message.
position, location = 'data scientist', 'massachusetts'
main(position, location)

New Page


'Scraped 15 new records.'

In [94]:
# Scrape Oregon on its own; the cell displays main()'s summary message.
position, location = 'data scientist', 'oregon'
main(position, location)

'Scraped 1 new records.'

The cells below are used for various adjustments to my web-scraping process.

### Concatenating New Data With Existing Data

In [97]:
state_names = ['alabama', 'arkansas', 'arizona', 'california', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'remote', 'iowa', 'idaho', 'illinois', 'indiana', 'kansas', 'kentucky', 'louisiana', 'massachusetts', 'maryland', 'maine', 'michigan', 'minnesota', 'missouri', 'mississippi', 'montana', 'north carolina', 'north dakota', 'nebraska', 'new hampshire', 'new jersey', 'new mexico', 'nevada', 'new york', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas', 'utah', 'virginia',  'vermont', 'washington', 'wisconsin', 'west virginia', 'wyoming']

# Load the per-location CSV files and count their rows.
state_frames = []
records = 0
for state in state_names:
    state_slug = state.replace(' ','_')
    try:
        state_df = pd.read_csv(f'../app/data/scraped_data_scientist_{state_slug}.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No scrape file, or a file without any columns, for this location:
        # skip it. Any other error is a real problem and is left to surface.
        continue
    records += len(state_df)
    state_frames.append(state_df)
    print(f'Scraped  {len(state_df)} new records for {state_slug}')

print(f'Scraped Records: {(records)}')

# pd.concat raises on an empty list, so fall back to an empty frame when no
# location produced a readable file.
todays_scrape = pd.concat(state_frames) if state_frames else pd.DataFrame()
## Uncomment to (re)create total.csv from today's scrape alone, e.g. if you
## miss too many days. Note that to_csv returns None.
#total = todays_scrape.to_csv('../app/data/total.csv', index= False)

# Merge today's rows with the running total and drop exact duplicate rows.
# The list of frames is not mutated, so re-running the cell gives the same result.
previous_total = pd.read_csv(f'../app/data/total.csv')
total = pd.concat(state_frames + [previous_total]).drop_duplicates()

total.to_csv('../app/data/total.csv', index= False)
print(f'Total Records: {(len(total))}')
total

Scraped  2 new records for alabama
Scraped  3 new records for arkansas
Scraped  2 new records for arizona
Scraped  3 new records for california
Scraped  5 new records for colorado
Scraped  4 new records for connecticut
Scraped  3 new records for delaware
Scraped  10 new records for florida
Scraped  6 new records for georgia
Scraped  15 new records for remote
Scraped  1 new records for iowa
Scraped  1 new records for idaho
Scraped  6 new records for illinois
Scraped  3 new records for indiana
Scraped  2 new records for kansas
Scraped  4 new records for kentucky
Scraped  1 new records for louisiana
Scraped  15 new records for massachusetts
Scraped  5 new records for maryland
Scraped  1 new records for michigan
Scraped  3 new records for minnesota
Scraped  1 new records for missouri
Scraped  11 new records for north_carolina
Scraped  1 new records for nebraska
Scraped  11 new records for new_jersey
Scraped  1 new records for new_mexico
Scraped  15 new records for new_york
Scraped  8 new r

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,salary,summary,title,url
0,Spire Inc.,Summary Spire is hiring a Data Scientist in Bi...,"['', 'Not provided by employer', ""$103K - $130...",2022-06-26,,"Hybrid remote in Birmingham, AL 35203",PostedPosted 1 day ago,3.3,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,This role will have a direct impact on transfo...,Data Scientist,https://www.indeed.com/rc/clk?jk=137727a016a51...
1,COLSA,"General Summary Designs, develops and analyzes...","['', 'Not provided by employer', ""$78.9K - $99...",2022-06-26,,"Huntsville, AL",PostedPosted 1 day ago,3.9,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,Full-time,Advises hardware design engineers on machine c...,Artificial Intelligence Developer,https://www.indeed.com/rc/clk?jk=b9e5801c8446f...
0,Walmart,Position Summary...What you'll do...About Proj...,,2022-06-25,,"Remote in Bentonville, AR 72712",PostedPosted 1 day ago,3.4,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,"Bachelor's degree in Statistics, Economics, An...",Data Scientist,https://www.indeed.com/rc/clk?jk=55113e2916c5a...
1,Walmart,Position Summary...What you'll do...Who We Are...,,2022-06-25,,"Bentonville, AR 72712",PostedPosted 1 day ago,3.4,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,Perform data-wrangling activities to ensure qu...,Senior Data Scientist,https://www.indeed.com/rc/clk?jk=d4bb530920a8f...
2,Change Healthcare,Company: Change Healthcare (NASDAQ: CHNG) is a...,,2022-06-25,,Arkansas,PostedPosted 1 day ago,3.0,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,Full-time,You will enjoy working with a highly talented ...,Lead Data Scientist,https://www.indeed.com/rc/clk?jk=904838a5db031...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,Sonic Foundry,The Lead Data Scientist – Video ai is respon...,"['', 'Not provided by employer', ""$116K - $147...",2022-06-24,,"Madison, WI 53703 (Capitol area)",PostedJust posted,4.2,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,Full-time,5+ years experience working with machine learn...,Lead Data Scientist - Video AI (FT),https://www.indeed.com/rc/clk?jk=d7dfd29204288...
914,Change Healthcare,Company: Change Healthcare (NASDAQ: CHNG) is a...,,2022-06-24,,Wisconsin,PostedToday,3.0,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,Full-time,You will enjoy working with a highly talented ...,Lead Data Scientist,https://www.indeed.com/rc/clk?jk=92041a1fbba49...
915,"Kadiak, LLC","Kadiak,LLC, a Koniag Government Services compa...","['', 'Not provided by employer', ""$84.9K - $10...",2022-06-24,,"Clarksburg, WV",PostedToday,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,"Under the direction of the government, the suc...",Data Science,https://www.indeed.com/rc/clk?jk=80347759e1199...
916,Change Healthcare,Company: Change Healthcare (NASDAQ: CHNG) is a...,,2022-06-24,,West Virginia,PostedToday,3.0,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,Full-time,You will enjoy working with a highly talented ...,Lead Data Scientist,https://www.indeed.com/rc/clk?jk=422bdc77b407f...


In [96]:
#TODO explain why this is being shown. Remove these rows from the data and save, but also show what portion of the data they represent (relatively minuscule).
# Count, per location, the rows that have no description.
total.loc[total['description'].isna(), 'location'].value_counts()

Austin, TX    2
Name: location, dtype: int64

Below are bits of code I use if something goes wrong with the web-scraping process.

In [None]:
# Disabled snippet: wrapping it in a string literal means the cell only shows
# the text and runs none of it. Once enabled (with `data` loaded as a
# DataFrame) it parses `extractDate`, subtracts a timedelta from each row's
# date (currently zero), and converts the column back to strings.
"""#fix old imports

data['extractDate']= pd.to_datetime(data['extractDate'])

def pDate(row):
    from datetime import datetime, date, timedelta

    #days_ago = row['dateposted']
    delta = timedelta(0)
    try:
        return row['extractDate'] - delta
    except:
        return row

data['extractDate'] = data.apply( lambda row : pDate(row), axis = 1)
data['extractDate'] = data['extractDate'].astype(str)
#data.to_csv('../app/data/scraped_data_scientist_remote_2022-04-14.csv', index= False)
data.extractDate.unique()"""

In [None]:
# Disabled scratch code: wrapping it in a string literal means the cell only
# shows the text and runs none of it. The fragments re-run individual parsing
# steps (result cards, job link, post date, snippet, company, title) against
# `soup` / `response` so each lookup can be checked by hand.
"""# codescraps in case they change the html and break my parsers

searchResults = soup.find('div', id='mosaic-provider-jobcards')
refinedsearchResults = searchResults.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
len(refinedsearchResults)
z = searchResults.children
lst = []
for i in z:
    lst.append(i)

x = lst[0]

#checking again
soup = BeautifulSoup(response.text, 'html.parser')
#searchResults = soup.find('div', id='mosaic-provider-jobcards')
refinedsearchResults = soup.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
        

raw_posts = []
for post in refinedsearchResults:
        raw_posts.append(post)
        n = 0

z = raw_posts[0]
url = z.find('a', href=True)
url


z.find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']
z

postDate = z.find('span', 'date').text
extractDate = datetime.today().strftime('%Y-%m-%d')
summary = z.find('div', 'job-snippet').text.strip().replace('\n', ' ')

summary

company_name = z.find('a', attrs={'class':'turnstileLink companyOverviewLink'}).text.strip()
company_name

job_title = z.find('a', attrs={'class':'jcs-JobTitle'}).text.strip()
job_title

"""