In [1]:
import csv
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from random import random
import time

# current version of seaborn generates a bunch of warnings that we'll ignore
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Use this to test for captcha block or IP ban
def get_URL(position,location):
    #from torrequest import TorRequest
    """[Build a template url for a dummy call to verify the site isn't returning a captcha]
    Args:
        position ([string]): [job for query]
        location ([string]): [location for query]
    Returns:
        [string]: [formatted url]
    """
    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=2&sort=date'
                
    position = position.replace(' ', '%20')
    location = location.replace(' ', '+')
    url = template.format(position,location)
    return url


# from torrequest import TorRequest
# tr=TorRequest(password='your_super_secure_password')
position = 'data scientist'
location = 'iowa'
# tr.reset_identity()
response = requests.get(get_URL(position,location))
# This will either return an HTML block for a captcha or of a search result
#response.text

In [3]:
def get_desc_features(job_url):
    """Parses each job description, searching for and extracting values for features

    Args:
        job_url (string): http address of each job posting

    Returns:
        tuple: job feature values
    """
    response_job_desc = requests.get(job_url)
    soup = BeautifulSoup(response_job_desc.text, 'html.parser')

    
    try:
        salary_and_jType = soup.find('div', id='salaryInfoAndJobType').text.strip()
    except:
        salary_and_jType = None
    if salary_and_jType == None:
        try:
            salary_and_jType = soup.find('div',id="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()
        except:
            salary_and_jType = None
    #TODO get benefits from its designated section
    
    
    try:
        sal_guide_items = []
        items = soup.find('ul',class_='css-1lyr5hv eu4oa1w0')
        for i in items:
            sal_guide_items.append(i.text)
    except:
        sal_guide_items = None
        
        
    try:
        salfromsection = soup.find('span',class_='icl-u-xs-mr--xs').text
    except:
        salfromsection = None
        
        
    try:
        job_type_items = []
        job_type_from_section = soup.find('div',class_='jobsearch-JobDescriptionSection-sectionItem').next_sibling.children
        for i in job_type_from_section:
            if i.text == 'Job Type':
                continue
            else:
                job_type_items.append(i.text)
    except:
        job_type_items = None
        
    
    try:
        requirements = soup.find(class_="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()      

    except:
        requirements = None
        
        
    try:
        description = soup.find(id="jobDescriptionText").text.replace('\n', '')
    except:
        description = None
        
        
    # A nifty little workaround for evading detection.
    time.sleep(.3+random()*3)
    #TODO assess h2 tags commonalities to determine if these section descriptions are from Indeed or are at least of only a few variations.
        #you could then distinguish the description into sections and conduct NLP etc each.
    raw_desc_soup = soup
    return salary_and_jType, sal_guide_items, salfromsection, job_type_items, requirements, description, raw_desc_soup

In [4]:
#TODO condense these with lists, particularly fields that have .text.strip()
def get_features(post):
    """parses search results and extracts basic job feature values,
        then combines this with output of 'get_desc_features' function.

    Args:
        post (string): response for each post in search results page

    Returns:
        dict: single-feature deep dictionary of features (dictionary keys) and their values (dictionary values)
    """
    datapoint_dict = {}

    title = post.find('h2',
              attrs={'class': lambda e: e.startswith('jobTitle') if e else False}).text.replace('new', '')

    company = post.find('span', 'companyName').text.strip()
    try:
        rating = post.find('span', 'ratingNumber').text
    except:
        rating = None

    location = post.find('div', 'companyLocation').text.strip()

    postDate = post.find('span', 'date').text

    extractDate = datetime.today().strftime('%Y-%m-%d')

    summary = post.find('div', 'job-snippet').text.strip().replace('\n', ' ')

    url = 'https://www.indeed.com'+post.get('href')

    try:
        estimated_salary = post.find('span','estimated-salary').text.strip()
    except:
        estimated_salary = None
    try:
        salary = post.find('div','metadata salary-snippet-container').text.strip()
    except:
        salary = None


        
            
    salary_and_jType, sal_guide_items, salfromsection, job_type_items, requirements, description, raw_desc_soup = get_desc_features(url)
    datapoint_dict = {
                        'title':title,
                        'company':company,
                        'rating':rating,
                        'location':location,
                        'salary':salary,
                        'estimated_salary':estimated_salary,
                        'postDate':postDate,
                        'extractDate':extractDate,
                        'summary':summary,
                        'url':url,
                        'salary_and_jType':salary_and_jType,
                        'sal_guide_items':sal_guide_items,
                        'salfromsection':salfromsection,
                        'job_type_items':job_type_items,
                        'requirements':requirements,
                        'description':description,
                        'raw_desc_soup':raw_desc_soup}
    if len(datapoint_dict) > 0:
        return datapoint_dict
    else:
        pass

In [5]:
def main(position, location):
    """[Conducts the web scraping process]
    Args:
        position ([string]): [job position for indeed.com query]
        position ([string]): [job location for indeed.com query]
        
        Returns:
        [csv]: [scraped data]
    """
    data = pd.DataFrame()
    
    # extract the job data
    while True:
        response = requests.get(get_URL(position, location))
        soup = BeautifulSoup(response.text, 'html.parser')
        searchResults = soup.find('div', id='mosaic-provider-jobcards')
        refinedsearchResults = searchResults.find_all('a', attrs={'class': lambda e: e.startswith('tapItem') if e else False})
        

        raw_posts = []
        for post in refinedsearchResults:
            raw_posts.append(post)
        
        n = 0
        for post in raw_posts:
            datapoint = get_features(post)
            data = data.append(datapoint, ignore_index=True)
        # Again, a nifty little workaround for evading detection.
            n+=1
            print(n)
            
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except:
            break

    name = position.replace(' ','_')
    loc = location.replace(' ','_')
    day = date.today()
    # save the job data
    data.to_csv(f'../app/data/scraped_{name}_{loc}_{day}.csv', index=False)
    return data

In [6]:
position = 'data scientist'
location = 'remote'
data = main(position,location )

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [7]:
data

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,sal_guide_items,salary,salary_and_jType,salfromsection,summary,title,url
0,Target,"Location: 1000 Nicollet Mall, Minneapolis, Min...",,2022-04-19,,"Remote in Minneapolis, MN 55403",PostedJust posted,3.6,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,,,,,REQUIREMENTS: This position requires a Master’...,Senior Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,Envision,The successful candidate will use the latest i...,Estimated $127K - $161K a year,2022-04-19,,+1 locationRemote,PostedJust posted,3.7,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,"[, Not provided by employer, $127K - $161K a y...",,,,You have deep understating of images from diff...,Remote Sensing Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,Northrop Grumman,Requisition ID: R10041053Category: Research an...,,2022-04-19,[Full-time],Remote in Virginia,PostedJust posted,4.0,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,,"$125,800 - $234,000 a year","$125,800 - $234,000 a year - Full-time","$125,800 - $234,000 a year",Experience with Enterprise data analytics. Exc...,Sr. Principal / Staff Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
3,UnitedHealth Group,Combine two of the fastest-growing fields on t...,,2022-04-19,,"Remote in Chicago, IL 60695",PostedJust posted,3.6,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,,,,,7+ years of hands-on experience in developing ...,Director Data Science - Telecommute,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
4,FIIDUS,******* Remote Job but once in a while travel ...,Estimated $82K - $104K a year,2022-04-19,,Remote,PostedJust posted,,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,"[, Not provided by employer, $82K - $104K a ye...",,"Part-time, Contract",,"Advanced data modeling tools, Auto ML, Python ...",Data Scientist,https://www.indeed.com/company/FIIDUS/jobs/Dat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Holland America Line,Job Description Welcome to Holland America Lin...,,2022-04-19,,"Remote in Miami, FL",PostedJust posted,4.0,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,"[, Not provided by employer, $74.6K to $94.4K ...",,Full-time,,Experience on machine learning and data modell...,"Data Scientist, Marketing Analytics and Campai...",https://www.indeed.com/rc/clk?jk=5e3f1dafd72a9...
326,Iterable,Iterable is a cross-channel platform that powe...,,2022-04-19,,"Remote in San Francisco, CA",PostedJust posted,,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,,"$191,100 a year","$191,100 a year","$191,100 a year",Work closely with the infrastructure engineeri...,Staff Data Engineer - Machine Learning,https://www.indeed.com/rc/clk?jk=3ee0703d8a1e1...
327,"Avetta, LLC",Avetta connects the world's leading organizati...,,2022-04-19,,"Remote in Houston, TX",PostedJust posted,,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,"[, Not provided by employer, $115K - $145K a y...",,,,Mine company data to drive customer facing ana...,Data Scientist,https://www.indeed.com/rc/clk?jk=1634a20c3932e...
328,Etsy,Company Description Etsy is the global mar...,,2022-04-19,,"Remote in Seattle, WA 98101",PostedJust posted,4.3,"[html, \n, [\n, [\n, <script crossorigin=""anon...",,"[, Not provided by employer, $147K to $186K pe...",,Full-time,,You have familiarity with machine learning and...,Senior Applied Scientist II-Knowledge Base,https://www.indeed.com/rc/clk?jk=ce2571b93c5e9...


In [8]:
name = position.replace(' ','_')
loc = location.replace(' ','_')
day = date.today()
data.to_csv(f'../app/data/scraped_{name}_{loc}_{day}.csv', index=False)

The  below is used for various adjustments to my webscraping process.

### Concatenating Old Data With New

In [5]:
a = pd.read_csv(f'../app/data/scraped_data_scientist_remote_2022-04-20.csv')
total = pd.read_csv(f'../app/data/total.csv')

In [6]:
#total = pd.read_csv(f'../app/data/total.csv')

z = pd.concat([total,a])
z.to_csv('../app/data/total.csv', index= False)

In [7]:
z.extractDate.unique()

array(['2022-04-13', '2022-04-14', '2022-04-15', '2022-04-17',
       '2022-04-19', '2022-04-20'], dtype=object)

In [8]:
z

Unnamed: 0,company,description,estimated_salary,extractDate,job_type_items,location,postDate,rating,raw_desc_soup,requirements,sal_guide_items,salary,salary_and_jType,salfromsection,summary,title,url
0,Online Technical Services,"Job descriptionData Scientist, MarketingSAN DI...",,2022-04-13,['Full-time'],Remote,PostedJust posted,3.7,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Master's (Preferred)Python: 1 year (Preferred)...,,"$145,000 - $150,000 a year","$145,000 - $150,000 a year - Full-time","$145,000 - $150,000 a year",Identify relevant data sources and data sets t...,Data Scientist - Marketing,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,West CAP,HUMAN was founded in 2012 in a Brooklyn sci-fi...,Estimated $114K – $144K a year,2022-04-13,,"Remote in New York, NY+2 locations",PostedJust posted,3.5,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$114K to $14...",,Full-time,,You’ve worked as a data scientist solving larg...,"Data Scientist, BotGuard",https://www.indeed.com/rc/clk?jk=58cdde046f643...
2,Maya Ai inc.,Our Maya team is expanding and we are looking ...,,2022-04-13,"['Full-time', 'Part-time']",Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",Python: 1 year (Preferred)SQL: 1 year (Preferred),,"$77,766 - $183,411 a year","$77,766 - $183,411 a year - Full-time, Part-time","$77,766 - $183,411 a year",Our Analyst will be dealing with data coming i...,Data Scientist,https://www.indeed.com/company/Maya-Ai-inc./jo...
3,"EMERGETECH, INC",Description:Job CategoryData ScienceAbout Emer...,Estimated $94.7K – $120K a year,2022-04-13,,Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$94.7K to $1...",,,,Design and create the data sources that ”citiz...,Data Scientist,https://www.indeed.com/rc/clk?jk=95fb128bb025f...
4,Recurrent,What's the opportunity?Recurrent is on a missi...,Estimated $119K – $151K a year,2022-04-13,,"Remote in Seattle, WA",PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$119K to $15...",,,,Experienced - you have 2+ years of experience ...,Data Scientist,https://www.indeed.com/rc/clk?jk=e9ce610b72deb...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,Lucid Technologies Inc,Job Title: Information Security Data Scientist...,Estimated $95.4K - $121K a year,2022-04-20,,"Remote in Dallas, TX",PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$95.4K - $12...",,,,Participate as a key member of the analytics a...,Information Security Data Scientist (Remote),https://www.indeed.com/rc/clk?jk=6202e64a38d75...
26,Windstream Communications,We are looking for a data scientist to help us...,,2022-04-20,,Remote,PostedJust posted,3.2,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,,,Full-time,,You will also be responsible for acquisition o...,Senior Data Scientist,https://www.indeed.com/rc/clk?jk=5362358221574...
27,Edmunds.com,At Edmunds we’re driven to make car buying eas...,Estimated $47.5K - $60.1K a year,2022-04-20,,"Remote in Santa Monica, CA 90404",PostedJust posted,3.4,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$47.5K to $6...",,Full-time,,The senior analyst/data scientist will serve a...,Senior Analyst/Data Scientist,https://www.indeed.com/rc/clk?jk=9dcaee4b15174...
28,One Concern,About One Concern One Concern brings disaster ...,Estimated $118K - $150K a year,2022-04-20,,Remote,PostedJust posted,,"<!DOCTYPE html>\n\n<html dir=""ltr"" lang=""en"">\...",,"['', 'Not provided by employer', ""$118K - $150...",,Full-time,,Collaborate with an interdisciplinary team of ...,"Senior Data Scientist, Wind Resilience",https://www.indeed.com/rc/clk?jk=22dd09644fe6b...


In [21]:
#fix old imports

data['extractDate']= pd.to_datetime(data['extractDate'])

def pDate(row):
    from datetime import datetime, date, timedelta

    #days_ago = row['dateposted']
    delta = timedelta(0)
    try:
        return row['extractDate'] - delta
    except:
        return row

data['extractDate'] = data.apply( lambda row : pDate(row), axis = 1)
data['extractDate'] = data['extractDate'].astype(str)

In [51]:
data.to_csv('../app/data/scraped_data_scientist_remote_2022-04-14.csv', index= False)

In [17]:
data.extractDate.unique()

array(['2022-04-20'], dtype=object)