In [1]:
import csv
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from random import random
import time

import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")

In [2]:
# Use this to test for captcha block or IP ban
def get_URL(position, location):
    #from torrequest import TorRequest
    """Build the Indeed search URL for a job query.

    The template pins ``fromage=1`` and ``sort=date``; only the keyword
    (``q``) and location (``l``) parts vary.

    Args:
        position (str): job title / keywords for the query.
        location (str): location text for the query.
    Returns:
        str: the formatted search URL.
    """
    from urllib.parse import quote, quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=1&sort=date'

    # Spaces still become '%20' in the keyword and '+' in the location, but
    # reserved characters ('&', '+', '#', ...) are now percent-escaped as well,
    # so a query such as "R&D engineer" can no longer break the query string.
    encoded_position = quote(position, safe='')
    encoded_location = quote_plus(location)
    return template.format(encoded_position, encoded_location)


# Dummy search request: look at `response.text` by hand to see whether the site
# answered with a captcha page or with an actual search result.
# Optional Tor variant (left disabled):
#   from torrequest import TorRequest
#   tr = TorRequest(password='your_super_secure_password')
#   tr.reset_identity()
position, location = 'data scientist', 'iowa'
response = requests.get(url=get_URL(position, location))
#response.text

In [3]:
def _extract_or_none(extract):
    """Run a zero-argument soup lookup and return its result, or None if it fails."""
    try:
        return extract()
    except Exception:
        # Usually an AttributeError/TypeError because find() returned None.
        # Catching Exception (not a bare ``except:``) keeps Ctrl-C able to
        # interrupt a long scrape.
        return None


def get_desc_features(job_url):
    """Download one job posting page and extract its detail fields.

    Every field is looked up independently; a field whose lookup fails is
    returned as None rather than raising.

    Args:
        job_url (str): URL of the job posting page to fetch.

    Returns:
        tuple: ``(salary_and_jType, sal_guide_items, salfromsection,
        job_type_items, requirements, description, raw_desc_soup)`` where
        ``raw_desc_soup`` is the parsed BeautifulSoup document of the page.
    """
    response_job_desc = requests.get(job_url)
    soup = BeautifulSoup(response_job_desc.text, 'html.parser')

    salary_and_jType = _extract_or_none(
        lambda: soup.find('div', id='salaryInfoAndJobType').text.strip())
    if salary_and_jType is None:
        # NOTE(review): this fallback passes a space-separated class string as
        # an ``id`` (the same string is used with ``class_`` for requirements
        # below) — looks like ``class_`` may have been intended; confirm.
        salary_and_jType = _extract_or_none(
            lambda: soup.find('div', id="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip())
    #TODO get benefits from its designated section

    # Text of every child of the salary-guide list.
    sal_guide_items = _extract_or_none(
        lambda: [item.text for item in soup.find('ul', class_='css-1lyr5hv eu4oa1w0')])

    salfromsection = _extract_or_none(
        lambda: soup.find('span', class_='icl-u-xs-mr--xs').text)

    # Children of the element following the first description section item,
    # skipping the 'Job Type' heading itself.
    job_type_items = _extract_or_none(
        lambda: [child.text
                 for child in soup.find('div', class_='jobsearch-JobDescriptionSection-sectionItem').next_sibling.children
                 if child.text != 'Job Type'])

    requirements = _extract_or_none(
        lambda: soup.find(class_="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip())

    description = _extract_or_none(
        lambda: soup.find(id="jobDescriptionText").text.replace('\n', ''))

    # Randomized pause (0.3 to 3.3 seconds) after each posting fetch, meant to
    # make the request pattern less detectable.
    time.sleep(.3+random()*3)
    #TODO assess h2 tags commonalities to determine if these section descriptions are from Indeed or are at least of only a few variations.
        #you could then distinguish the description into sections and conduct NLP etc each.
    raw_desc_soup = soup
    return salary_and_jType, sal_guide_items, salfromsection, job_type_items, requirements, description, raw_desc_soup
    





In [4]:
#TODO condense these with lists, particularly fields that have .text.strip()
def _text_or_none(post, tag, css_class, strip=False):
    """Return the text of the first ``tag`` with ``css_class`` inside ``post``, or None if absent."""
    element = post.find(tag, css_class)
    if element is None:
        return None
    return element.text.strip() if strip else element.text


def get_features(post):
    """Build the record for one search-result card, including its detail-page fields.

    Args:
        post: BeautifulSoup element for one result link; its ``href`` is
            appended to ``https://www.indeed.com`` to reach the posting page.

    Returns:
        dict: card fields (title, company, rating, location, salary,
        estimated_salary, postDate, extractDate, summary, url) plus the
        fields returned by ``get_desc_features``. ``rating``, ``salary`` and
        ``estimated_salary`` are None when the card has no such element.

    Raises:
        AttributeError: if the title, company, location, date or snippet
            element is missing from the card.
        TypeError: if the card has no ``href`` attribute.
    """
    title = post.find('h2',
              attrs={'class': lambda e: e.startswith('jobTitle') if e else False}).text
    # Strip only a leading "new" (presumably the badge text that precedes the
    # title). The previous blanket replace('new', '') also mangled titles that
    # merely contain those letters, e.g. "Renewable Energy Analyst".
    if title.startswith('new'):
        title = title[len('new'):]

    company = post.find('span', 'companyName').text.strip()
    rating = _text_or_none(post, 'span', 'ratingNumber')
    location = post.find('div', 'companyLocation').text.strip()
    postDate = post.find('span', 'date').text
    extractDate = datetime.today().strftime('%Y-%m-%d')
    summary = post.find('div', 'job-snippet').text.strip().replace('\n', ' ')
    url = 'https://www.indeed.com'+post.get('href')

    estimated_salary = _text_or_none(post, 'span', 'estimated-salary', strip=True)
    salary = _text_or_none(post, 'div', 'metadata salary-snippet-container', strip=True)

    # Second request: fetch the posting's own page for the detail fields.
    (salary_and_jType, sal_guide_items, salfromsection, job_type_items,
     requirements, description, raw_desc_soup) = get_desc_features(url)

    return {
        'title': title,
        'company': company,
        'rating': rating,
        'location': location,
        'salary': salary,
        'estimated_salary': estimated_salary,
        'postDate': postDate,
        'extractDate': extractDate,
        'summary': summary,
        'url': url,
        'salary_and_jType': salary_and_jType,
        'sal_guide_items': sal_guide_items,
        'salfromsection': salfromsection,
        'job_type_items': job_type_items,
        'requirements': requirements,
        'description': description,
        'raw_desc_soup': raw_desc_soup,
    }

In [5]:
def main(position, location):
    """Scrape all result pages for a query, save the rows to CSV and return them.

    Args:
        position (str): job position for the indeed.com query.
        location (str): job location for the indeed.com query.

    Returns:
        pandas.DataFrame: one row per scraped posting. The same data is
        written to ``../app/data/scraped_{position}_{location}_{date}.csv``
        (spaces in position/location replaced by underscores).
    """
    records = []
    url = get_URL(position, location)

    # Follow the "Next" link page by page. The previous version computed the
    # next-page URL but always re-requested the first page.
    while url:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        searchResults = soup.find('div', id='mosaic-provider-jobcards')
        if searchResults is None:
            # No job-card container on the page (e.g. a captcha page came
            # back): stop here and keep the rows collected so far.
            print('No job cards found on page; stopping.')
            break
        refinedsearchResults = searchResults.find_all(
            'a', attrs={'class': lambda e: e.startswith('tapItem') if e else False})

        # Progress counter restarts on every page.
        for n, post in enumerate(refinedsearchResults, start=1):
            records.append(get_features(post))
            print(n)

        next_link = soup.find('a', {'aria-label': 'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        url = ('https://www.indeed.com' + next_href) if next_href else None

    # Build the frame once; DataFrame.append in a loop is quadratic and no
    # longer exists in pandas 2.x.
    data = pd.DataFrame(records)

    name = position.replace(' ','_')
    loc = location.replace(' ','_')
    day = date.today()
    # save the job data
    data.to_csv(f'../app/data/scraped_{name}_{loc}_{day}.csv', index=False)
    return data

In [6]:
# Run the scrape for this query; `data` holds the DataFrame returned by main().
position, location = 'data scientist', 'remote'
data = main(position=position, location=location)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [None]:
# Display the scraped DataFrame assigned from main() in the run cell.
data

In [None]:
# Manual re-save of `data` to the dated per-query CSV (main() writes to the
# same path pattern when it finishes).
name = '_'.join(position.split(' '))
loc = '_'.join(location.split(' '))
day = date.today()
data.to_csv(f'../app/data/scraped_{name}_{loc}_{day}.csv', index=False)

## Optional: route requests through Tor

In [None]:
# Run this cell to send traffic through Tor, after starting Tor in a terminal.
import socks
import socket

# NOTE(review): the address was 127.0.0.7; Tor's default SOCKS listener is
# 127.0.0.1:9050, so that looked like a typo. Adjust if your torrc binds elsewhere.
socks.set_default_proxy(proxy_type=socks.PROXY_TYPE_SOCKS5, addr="127.0.0.1", port=9050)
# set_default_proxy only records the settings; connections are proxied only
# once the standard socket class is replaced by PySocks' socksocket.
socket.socket = socks.socksocket
#print(tr.get("http://icanhazip.com").text)

### Concatenating Old Data With New

In [8]:
# Load the three daily scrape files (13th, 14th and 15th of April 2022), in order.
a, b, c = (
    pd.read_csv(f'../app/data/scraped_data_scientist_remote_2022-04-{day_of_month}.csv')
    for day_of_month in ('13', '14', '15')
)


In [9]:
#total = pd.read_csv(f'../app/data/total.csv')

# Stack the daily frames row-wise and write the result as the combined total file.
z = pd.concat(objs=[a, b, c])
z.to_csv(path_or_buf='../app/data/total.csv', index=False)

In [None]:
# NOTE(review): this writes to the same path as the concatenation cell above,
# so running it replaces the combined a/b/c file with only the current `data`
# frame — confirm that overwrite is intended.
data.to_csv('../app/data/total.csv', index= False)