In [1]:
import numpy as np
import pandas as pd
import re
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date

from random import random
from IPython.display import clear_output

# current version of seaborn generates a bunch of warnings that we'll ignore
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Use this to test for captcha block or IP ban
def get_URL(position,location):
    #from torrequest import TorRequest
    """[Build a template url for a dummy call to verify the site isn't returning a captcha]
    Args:
        position ([string]): [job for query]
        location ([string]): [location for query]
    Returns:
        [string]: [formatted url]
    """
    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=2&sort=date'
                
    position = position.replace(' ', '%20')
    location = location.replace(' ', '+')
    url = template.format(position,location)
    return url

# The below is just to check that the response is '200', successful connection
position = 'data scientist'
location = 'california'
response = requests.get(get_URL(position,location))
# This will either return an HTML block for a captcha or of a search result
soup = BeautifulSoup(response.text, 'html.parser')
response

<Response [200]>

In [3]:
def get_desc_features(job_url):
    """Parses each job description, searching for and extracting values for features

    Args:
        job_url (string): http address of each job posting

    Returns:
        tuple: job feature values
    """
    response_job_desc = requests.get(job_url)
    soup = BeautifulSoup(response_job_desc.text, 'html.parser')

    
    try:
        salary_and_job_type = soup.find('div', id='salaryInfoAndJobType').text.strip()
    except:
        salary_and_job_type = None
    if salary_and_job_type == None:
        try:
            salary_and_job_type = soup.find('div',id="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()
        except:
            salary_and_job_type = None
    #TODO get benefits from its designated section
    
    
    try:
        salary_guidance = []
        items = soup.find('ul',class_='css-1lyr5hv eu4oa1w0')
        for i in items:
            salary_guidance.append(i.text)
    except:
        salary_guidance = None
        
        
    try:
        job_type_items = []
        job_type_from_section = soup.find('div',class_='jobsearch-JobDescriptionSection-sectionItem').next_sibling.children
        for i in job_type_from_section:
            if i.text == 'Job Type':
                continue
            else:
                job_type_items.append(i.text)
    except:
        job_type_items = None
        
    
    try:
        requirements = soup.find(class_="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()      

    except:
        requirements = None
        
        
    try:
        description = soup.find(id="jobDescriptionText").text.replace('\n', '')
    except:
        description = None
        
        
    # A nifty little workaround for evading detection.
    time.sleep(.5+random()*3)
    #TODO assess h2 tags commonalities to determine if these section descriptions are from Indeed or are at least of only a few variations.
        #you could then distinguish the description into sections and conduct NLP etc each.
    raw_desc_soup = soup
    return salary_and_job_type, salary_guidance, job_type_items, requirements, description, raw_desc_soup

In [4]:
#TODO condense these with lists, particularly fields that have .text.strip()
def get_features(post):
    """parses search results and extracts basic job feature values,
        then combines this with output of 'get_desc_features' function.

    Args:
        post (string): response for each post in search results page

    Returns:
        dict: single-feature deep dictionary of features (dictionary keys) and their values (dictionary values)
    """
    #make set list of old urls
    urls = pd.read_csv('../app/data/total.csv', usecols = ['url'])
    urls.drop_duplicates(inplace=True)

    url = 'https://www.indeed.com'+ post.find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']


    #check if new url is in list
    if url not in urls.url.values:
        title = post.find('h2',
                attrs={'class': lambda e: e.startswith('jobTitle') if e else False}).text.replace('new', '')

        company = post.find('span', 'companyName').text.strip()
        try:
            rating = post.find('span', 'ratingNumber').text
        except:
            rating = None

        location = post.find('div', 'companyLocation').text.strip()
        postDate = post.find('span', 'date').text
        extractDate = datetime.today().strftime('%Y-%m-%d')
        summary = post.find('div', 'job-snippet').text.strip().replace('\n', ' ')





            
        salary, estimated_salary, job_type_items, requirements, description, raw_desc_soup = get_desc_features(url)

        datapoint_dict = {}
        datapoint_dict = {
                            'title':title,
                            'company':company,
                            'rating':rating,
                            'location':location,
                            'estimated_salary':estimated_salary,
                            'postDate':postDate,
                            'extractDate':extractDate,
                            'summary':summary,
                            'url':url,
                            'salary':salary,
                            'job_type_items':job_type_items,
                            'requirements':requirements,
                            'description':description,
                            'raw_desc_soup':raw_desc_soup}
    





        return datapoint_dict
    else:
        print(' skip')

In [5]:
def main(position, location):
    """[Conducts the web scraping process]
    Args:
        position ([string]): [job position for indeed.com query]
        position ([string]): [job location for indeed.com query]
        
        Returns:
        [csv]: [scraped data]
    """
    data = pd.DataFrame()
    
    # extract the job data
    while True:
        response = requests.get(get_URL(position, location))
        soup = BeautifulSoup(response.text, 'html.parser')
        searchResults = soup.find('div', id='mosaic-provider-jobcards')
        refinedsearchResults = searchResults.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
        
        raw_posts = []
        for post in refinedsearchResults:
            raw_posts.append(post)
        
        for post in raw_posts:
            datapoint = get_features(post)
            data = data.append(datapoint, ignore_index=True)
        
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
            print(' new page')
        except AttributeError:
            break

        

    name = position.replace(' ','_')
    loc = location.replace(' ','_')
    # save the job data
    data.to_csv(f'../app/data/scraped_{name}_{loc}.csv', index=False)
    
    return f'Scraped {len(data)} new records.'

In [6]:
state_names = [ "alabama", "arkansas",  "arizona",  "colorado", "connecticut", "delaware",  "georgia", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada",  "ohio", "oklahoma", "pennsylvania", "rhode island", "south carolina", "south dakota", "tennessee", "utah", "virginia",  "vermont",  "wisconsin", "west virginia", "wyoming"]


for state in state_names:
    position = 'data scientist'
    location = state
    print(state)
    data = main(position,location )
    print(len(data))

alabama
23
arkansas
22
arizona
23
colorado
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
24
connecticut
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
24
delaware
22
georgia
 new page
 new page
 new page
23
iowa
22
idaho
22
illinois
 new page
 new page
23
indiana
22
kansas
22
kentucky
22
louisiana
22
maryland
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
 new page
24
maine
22
michigan
22
minnesota
 new page
 new page
 new page
 new page
23
missouri
22
mississippi
22
montana
22
north carolina
 new page
 new page
 new page
 new page
23
north dakota
22
nebraska
22
new hampshire
22
new jersey
 new page
 new page
 new page
 new page
 ne

In [7]:
position = 'data scientist'
location = 'california'
main(position,location )

 new page
 new page


'Scraped 45 new records.'

In [8]:
position = 'data scientist'
location = 'remote'
main(position,location )

 new page
 new page
 new page
 new page


ConnectionError: HTTPSConnectionPool(host='www.indeed.com', port=443): Max retries exceeded with url: /rc/clk?jk=7d3b67195e4f01f2&fccid=dc023faa36b88bf2&vjs=3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fd678d28e20>: Failed to establish a new connection: [Errno 110] Connection timed out'))

In [None]:
position = 'data scientist'
location = 'new york'
main(position,location )

In [None]:
position = 'data scientist'
location = 'texas'
main(position,location )

In [None]:
position = 'data scientist'
location = 'washington'
main(position,location )

In [None]:
position = 'data scientist'
location = 'florida'
main(position,location )

In [None]:
position = 'data scientist'
location = 'massachusetts'
main(position,location )

In [None]:
position = 'data scientist'
location = 'oregon'
main(position,location )

The  below is used for various adjustments to my webscraping process.

### Concatenating Data With New

In [None]:
state_names = ['alabama', 'arkansas', 'arizona', 'california', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'remote', 'iowa', 'idaho', 'illinois', 'indiana', 'kansas', 'kentucky', 'louisiana', 'massachusetts', 'maryland', 'maine', 'michigan', 'minnesota', 'missouri', 'mississippi', 'montana', 'north carolina', 'north dakota', 'nebraska', 'new hampshire', 'new jersey', 'new mexico', 'nevada', 'new york', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina', 'south dakota', 'tennessee', 'texas', 'utah', 'virginia',  'vermont', 'washington', 'wisconsin', 'west virginia', 'wyoming']

lst = []
records = 0
for i in state_names:
    i = i.replace(' ','_')
    try:
        location = pd.read_csv(f'../app/data/scraped_data_scientist_{i}.csv')
        records += len(location)
        lst.append(location)
        print(f'Scraped  {len(location)} new records for {i}')
    except:
        continue

print(f'Scraped Records: {(records)}')

#todays_scrape = pd.concat(lst)
## unblock if you miss too many days
#total = todays_scrape.to_csv('../app/data/total.csv', index= False)

total = pd.read_csv(f'../app/data/total.csv')
lst.append(total)

total = pd.concat(lst)

total.drop_duplicates(inplace=True)

total.to_csv('../app/data/total.csv', index= False)
print(f'Total Records: {(len(total))}')
total

In [None]:
#TODO explain why this is being shown. remove from data and save, but also show what portion of the data it represents. Relatively miniscule.
total[total.description.isna()].location.value_counts()

The below are bits of code I employ if something goes wrong with the webscraping process.

In [None]:
"""#fix old imports

data['extractDate']= pd.to_datetime(data['extractDate'])

def pDate(row):
    from datetime import datetime, date, timedelta

    #days_ago = row['dateposted']
    delta = timedelta(0)
    try:
        return row['extractDate'] - delta
    except:
        return row

data['extractDate'] = data.apply( lambda row : pDate(row), axis = 1)
data['extractDate'] = data['extractDate'].astype(str)
#data.to_csv('../app/data/scraped_data_scientist_remote_2022-04-14.csv', index= False)
data.extractDate.unique()"""

In [None]:
"""# codescraps in case they change the html and break my parsers

searchResults = soup.find('div', id='mosaic-provider-jobcards')
refinedsearchResults = searchResults.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
len(refinedsearchResults)
z = searchResults.children
lst = []
for i in z:
    lst.append(i)

x = lst[0]

#checking again
soup = BeautifulSoup(response.text, 'html.parser')
#searchResults = soup.find('div', id='mosaic-provider-jobcards')
refinedsearchResults = soup.find_all('div', attrs={'class': lambda e: e.startswith('cardOutline') if e else False})
        

raw_posts = []
for post in refinedsearchResults:
        raw_posts.append(post)
        n = 0

z = raw_posts[0]
url = z.find('a', href=True)
url


z.find('a', href = re.compile(r'[/]([a-z]|[A-Z])\w+')).attrs['href']
z

postDate = z.find('span', 'date').text
extractDate = datetime.today().strftime('%Y-%m-%d')
summary = z.find('div', 'job-snippet').text.strip().replace('\n', ' ')

summary

company_name = z.find('a', attrs={'class':'turnstileLink companyOverviewLink'}).text.strip()
company_name

job_title = z.find('a', attrs={'class':'jcs-JobTitle'}).text.strip()
job_title

"""