In [1]:
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from random import random
import time
import re
# current version of seaborn generates a bunch of warnings that we'll ignore
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Use this to test for captcha block or IP ban
def get_URL(position,location):
    #from torrequest import TorRequest
    """[Build a template url for a dummy call to verify the site isn't returning a captcha]
    Args:
        position ([string]): [job for query]
        location ([string]): [location for query]
    Returns:
        [string]: [formatted url]
    """
    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=1&sort=date'
                
    position = position.replace(' ', '%20')
    location = location.replace(' ', '+')
    url = template.format(position,location)
    return url


# from torrequest import TorRequest
# tr=TorRequest(password='your_super_secure_password')
position = 'data scientist'
location = 'california'
# tr.reset_identity()
response = requests.get(get_URL(position,location))
# This will either return an HTML block for a captcha or of a search result
response

<Response [200]>

In [3]:

soup = BeautifulSoup(response.text, 'html.parser')
#soup


In [4]:
def get_desc_features(job_url):
    """Parses each job description, searching for and extracting values for features

    Args:
        job_url (string): http address of each job posting

    Returns:
        tuple: job feature values
    """
    response_job_desc = requests.get(job_url)
    soup = BeautifulSoup(response_job_desc.text, 'html.parser')

    
    try:
        salary_and_jType = soup.find('div', id='salaryInfoAndJobType').text.strip()
    except:
        salary_and_jType = None
    if salary_and_jType == None:
        try:
            salary_and_jType = soup.find('div',id="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()
        except:
            salary_and_jType = None
    #TODO get benefits from its designated section
    
    
    try:
        sal_guide_items = []
        items = soup.find('ul',class_='css-1lyr5hv eu4oa1w0')
        for i in items:
            sal_guide_items.append(i.text)
    except:
        sal_guide_items = None
        
        
    try:
        salfromsection = soup.find('span',class_='icl-u-xs-mr--xs').text
    except:
        salfromsection = None
        
        
    try:
        job_type_items = []
        job_type_from_section = soup.find('div',class_='jobsearch-JobDescriptionSection-sectionItem').next_sibling.children
        for i in job_type_from_section:
            if i.text == 'Job Type':
                continue
            else:
                job_type_items.append(i.text)
    except:
        job_type_items = None
        
    
    try:
        requirements = soup.find(class_="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()      

    except:
        requirements = None
        
        
    try:
        description = soup.find(id="jobDescriptionText").text.replace('\n', '')
    except:
        description = None
        
        
    # A nifty little workaround for evading detection.
    time.sleep(.5+random()*3)
    #TODO assess h2 tags commonalities to determine if these section descriptions are from Indeed or are at least of only a few variations.
        #you could then distinguish the description into sections and conduct NLP etc each.
    raw_desc_soup = soup
    return salary_and_jType, sal_guide_items, salfromsection, job_type_items, requirements, description, raw_desc_soup

In [5]:
#TODO condense these with lists, particularly fields that have .text.strip()
def get_features(post):
    """parses search results and extracts basic job feature values,
        then combines this with output of 'get_desc_features' function.

    Args:
        post (string): response for each post in search results page

    Returns:
        dict: single-feature deep dictionary of features (dictionary keys) and their values (dictionary values)
    """
    datapoint_dict = {}

    title = post.find('h2',
              attrs={'class': lambda e: e.startswith('jobTitle') if e else False}).text.replace('new', '')

    company = post.find('span', 'companyName').text.strip()
    try:
        rating = post.find('span', 'ratingNumber').text
    except:
        rating = None

    location = post.find('div', 'companyLocation').text.strip()
    postDate = post.find('span', 'date').text
    extractDate = datetime.today().strftime('%Y-%m-%d')
    summary = post.find('div', 'job-snippet').text.strip().replace('\n', ' ')
    url = 'https://www.indeed.com'+ post.find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']

    try:
        estimated_salary = post.find('span','estimated-salary').text.strip()
    except:
        estimated_salary = None
    try:
        salary = post.find('div','metadata salary-snippet-container').text.strip()
    except:
        salary = None


        
            
    salary_and_jType, sal_guide_items, salfromsection, job_type_items, requirements, description, raw_desc_soup = get_desc_features(url)
    datapoint_dict = {
                        'title':title,
                        'company':company,
                        'rating':rating,
                        'location':location,
                        'salary':salary,
                        'estimated_salary':estimated_salary,
                        'postDate':postDate,
                        'extractDate':extractDate,
                        'summary':summary,
                        'url':url,
                        'salary_and_jType':salary_and_jType,
                        'sal_guide_items':sal_guide_items,
                        'salfromsection':salfromsection,
                        'job_type_items':job_type_items,
                        'requirements':requirements,
                        'description':description,
                        'raw_desc_soup':raw_desc_soup}
    if len(datapoint_dict) > 0:
        return datapoint_dict
    else:
        pass

In [6]:
def main(position, location):
    """[Conducts the web scraping process]
    Args:
        position ([string]): [job position for indeed.com query]
        position ([string]): [job location for indeed.com query]
        
        Returns:
        [csv]: [scraped data]
    """
    data = pd.DataFrame()
    
    # extract the job data
    while True:
        response = requests.get(get_URL(position, location))
        soup = BeautifulSoup(response.text, 'html.parser')
        searchResults = soup.find('div', id='mosaic-provider-jobcards')
        refinedsearchResults = searchResults.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
        

        raw_posts = []
        for post in refinedsearchResults:
            raw_posts.append(post)
        
        n = 0
        for post in raw_posts:
            datapoint = get_features(post)
            data = data.append(datapoint, ignore_index=True)
        # Again, a nifty little workaround for evading detection.
            n+=1
            print(n)
            
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except:
            break

    name = position.replace(' ','_')
    loc = location.replace(' ','_')
    day = date.today()
    # save the job data
    data.to_csv(f'../app/data/scraped_{name}_{loc}.csv', index=False)
    return 

In [8]:
#state_names = [ "alabama", "arkansas",  "arizona",  "colorado", "connecticut", "delaware",  "georgia", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "massachusetts", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada",  "ohio", "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina", "south dakota", "tennessee", "utah", "virginia",  "vermont",  "wisconsin", "west virginia", "wyoming"]

#state_names = [ "alabama", "arkansas",  "arizona",  "colorado", "connecticut", "delaware",  "georgia", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "massachusetts", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada",  "ohio", "oklahoma", "oregon"]
state_names = ["south dakota", "tennessee", "utah", "virginia",  "vermont",  "wisconsin", "west virginia", "wyoming"]

for state in state_names:
    position = 'data scientist'
    location = state
    print(state)
    data = main(position,location )

south dakota


AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
position = 'data scientist'
location = 'california'
main(position,location )

In [None]:
position = 'data scientist'
location = 'remote'
main(position,location )

In [None]:
position = 'data scientist'
location = 'new york'
main(position,location )

In [None]:
position = 'data scientist'
location = 'texas'
main(position,location )

In [None]:
position = 'data scientist'
location = 'washington'
main(position,location )

In [None]:
position = 'data scientist'
location = 'florida'
main(position,location )

In [None]:
position = 'data scientist'
location = 'massachusetts'
main(position,location )

In [None]:
position = 'data scientist'
location = 'oregon'
main(position,location )

The  below is used for various adjustments to my webscraping process.

In [9]:
pd.read_csv(f'../app/data/scraped_data_scientist_oregon.csv')

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,sal_guide_items,salary,salary_and_jType,salfromsection,summary,title,url
0,Fisher Investments,Are you highly analytical and looking for an o...,Estimated $56.7K - $71.9K a year,2022-06-22,,"Portland, OR 97204+1 location",PostedToday,3.5,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$56.7K - $71...",,,,Be a part of our data modernization efforts th...,Data Science Associate,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,Recruiting From Scratch,Who is Recruiting from Scratch: Recruiting fro...,,2022-06-22,,"Remote in Portland, OR 97035",PostedToday,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,"$160,000 - $200,000 a year","$160,000 - $200,000 a year","$160,000 - $200,000 a year",5+ years experience in data engineering or dat...,Data Scientist,https://www.indeed.com/rc/clk?jk=0921a823554b3...


### Concatenating Old Data With New

In [15]:
state_names = [ "alabama", "arkansas",  "arizona", "california", "colorado", "connecticut", "delaware", "florida", "georgia", "remote", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "massachusetts", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada", "new york", "ohio", "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina", "south dakota", "tennessee", "texas", "utah", "virginia",  "vermont", "washington", "wisconsin", "west virginia", "wyoming"]

lst = []
records = 0
for i in state_names:
    i = i.replace(' ','_')
    try:
        location = pd.read_csv(f'../app/data/scraped_data_scientist_{i}.csv')
        records += len(location)
        lst.append(location)
        print(f'Scraped  {len(location)} new records for {i}')
    except:
        continue

print(f'Scraped Records: {(records)}')

todays_scrape = pd.concat(lst)
## unblock if you miss too many days
#total = todays_scrape.to_csv('../app/data/total.csv', index= False)

total = pd.read_csv(f'../app/data/total.csv')
lst.append(total)

total = pd.concat(lst)

total.drop_duplicates(inplace=True)

total.to_csv('../app/data/total.csv', index= False)
print(f'Total Records: {(len(total))}')
total

Scraped  7 new records for alabama
Scraped  1 new records for arkansas
Scraped  10 new records for arizona
Scraped  60 new records for california
Scraped  10 new records for colorado
Scraped  30 new records for connecticut
Scraped  7 new records for delaware
Scraped  15 new records for florida
Scraped  45 new records for georgia
Scraped  15 new records for remote
Scraped  3 new records for iowa
Scraped  1 new records for idaho
Scraped  30 new records for illinois
Scraped  10 new records for indiana
Scraped  6 new records for kansas
Scraped  2 new records for kentucky
Scraped  5 new records for louisiana
Scraped  120 new records for massachusetts
Scraped  12 new records for maryland
Scraped  1 new records for maine
Scraped  7 new records for michigan
Scraped  9 new records for minnesota
Scraped  3 new records for missouri
Scraped  1 new records for mississippi
Scraped  1 new records for montana
Scraped  13 new records for north_carolina
Scraped  1 new records for north_dakota
Scraped  1

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,sal_guide_items,salary,salary_and_jType,salfromsection,summary,title,url
0,Recruiting From Scratch,Who is Recruiting from Scratch: Recruiting fro...,,2022-06-21,,"Remote in Huntsville, AL",PostedToday,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,"$160,000 - $200,000 a year","$160,000 - $200,000 a year","$160,000 - $200,000 a year",5+ years experience in data engineering or dat...,Data Scientist,https://www.indeed.com/rc/clk?jk=977d1ccb6d1a1...
1,Food Management Search,SR Financial Data Analyst- Fully remote - ANY ...,,2022-06-21,,Remote in United States,PostedToday,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,,,,Lead various allocation data analysis projects...,SR Financial Data Analyst- Fully remote,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,Gregor Diagnostics,Role: Senior Data ScientistAbout Gregor Diagno...,,2022-06-21,,Remote in United States,PostedPosted 1 day ago,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,,Full-time,,The Senior Data Scientist will work closely la...,Senior Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
3,SynergisticIT,"At SynergisticIT, we aim to bring aboard IT ...",Estimated $77.7K - $98.3K a year,2022-06-21,,Alabama,PostedPosted 1 day ago,4.2,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$77.7K - $98...",,"Full-time, Contract",,"Collaborate with dynamic teams of engineers, d...",Entry Level Data Scientist,https://www.indeed.com/rc/clk?jk=57d47b0524890...
4,9Rooftops,WE ARE HIRING IN MULTIPLE LOCATIONS ACROSS THE...,Estimated $109K - $138K a year,2022-06-21,,"Birmingham, AL 35242",PostedPosted 1 day ago,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$109K - $138...",,,,O Communication skills to ask questions of cli...,Statistician/Data Scientist,https://www.indeed.com/rc/clk?jk=e23726e90096e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2986,NATIONAL GRID CO USA (NE POWER),,,2022-06-21,,"Waltham, MA 02454",PostedToday,3.7,"<html lang=""en"">\n<head>\n<title>hCaptcha solv...",,,,,,"Employ sophisticated analytics, machine learni...","Lead Data Scientist, Data Science",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2987,PAREXEL,,,2022-06-21,,Massachusetts,PostedToday,3.6,,,,,,,The Senior Statistical Programmer provides tec...,Senior Statistical Programmer FSP (Poland),https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2988,Recruiting From Scratch,Who is Recruiting from Scratch: Recruiting fro...,,2022-06-21,,"Remote in Boston, MA 02109+3 locations",PostedToday,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,"$160,000 - $200,000 a year","$160,000 - $200,000 a year","$160,000 - $200,000 a year",5+ years experience in data engineering or dat...,Data Scientist,https://www.indeed.com/rc/clk?jk=89b4714ca9540...
3067,Recruiting From Scratch,Who is Recruiting from Scratch: Recruiting fro...,,2022-06-21,,"Remote in Portland, OR 97035",PostedToday,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,"$160,000 - $200,000 a year","$160,000 - $200,000 a year","$160,000 - $200,000 a year",5+ years experience in data engineering or dat...,Data Scientist,https://www.indeed.com/rc/clk?jk=0921a823554b3...


In [13]:
#TODO explain why this is being shown. remove from data and save, but also show what portion of the data it represents. Relatively miniscule.
total[total.description.isna()].location.value_counts()

Chicago, IL           12
United States         11
Indiana                6
Massachusetts          6
Phoenix, AZ            6
Cincinnati, OH         5
Salt Lake City, UT     3
Virginia               3
Waltham, MA 02454      3
Name: location, dtype: int64

The below are bits of code I employ if something goes wrong with the webscraping process.

In [14]:
"""#fix old imports

data['extractDate']= pd.to_datetime(data['extractDate'])

def pDate(row):
    from datetime import datetime, date, timedelta

    #days_ago = row['dateposted']
    delta = timedelta(0)
    try:
        return row['extractDate'] - delta
    except:
        return row

data['extractDate'] = data.apply( lambda row : pDate(row), axis = 1)
data['extractDate'] = data['extractDate'].astype(str)
#data.to_csv('../app/data/scraped_data_scientist_remote_2022-04-14.csv', index= False)
data.extractDate.unique()"""

"#fix old imports\n\ndata['extractDate']= pd.to_datetime(data['extractDate'])\n\ndef pDate(row):\n    from datetime import datetime, date, timedelta\n\n    #days_ago = row['dateposted']\n    delta = timedelta(0)\n    try:\n        return row['extractDate'] - delta\n    except:\n        return row\n\ndata['extractDate'] = data.apply( lambda row : pDate(row), axis = 1)\ndata['extractDate'] = data['extractDate'].astype(str)\n#data.to_csv('../app/data/scraped_data_scientist_remote_2022-04-14.csv', index= False)\ndata.extractDate.unique()"

In [None]:
"""# codescraps in case they change the html and break my parsers

searchResults = soup.find('div', id='mosaic-provider-jobcards')
refinedsearchResults = searchResults.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
len(refinedsearchResults)
z = searchResults.children
lst = []
for i in z:
    lst.append(i)

x = lst[0]

#checking again
soup = BeautifulSoup(response.text, 'html.parser')
#searchResults = soup.find('div', id='mosaic-provider-jobcards')
refinedsearchResults = soup.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
        

raw_posts = []
for post in refinedsearchResults:
        raw_posts.append(post)
        n = 0

z = raw_posts[0]
url = z.find('a', href=True)
url


z.find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']
z

postDate = z.find('span', 'date').text
extractDate = datetime.today().strftime('%Y-%m-%d')
summary = z.find('div', 'job-snippet').text.strip().replace('\n', ' ')

summary

company_name = z.find('a', attrs={'class':'turnstileLink companyOverviewLink'}).text.strip()
company_name

job_title = z.find('a', attrs={'class':'jcs-JobTitle'}).text.strip()
job_title

"""