In [3]:
import csv
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from random import random
import time

import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")


def get_URL(position, location):
    """[Build a template url]
    Args:
        position ([string]): [job for query]
        location ([string]): [location for query]

    Returns:
        [string]: [formatted url]
    """
    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=3&sort=date'
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url


def get_features(post):
    title = post.find('h2',
              attrs={'class': lambda e: e.startswith('jobTitle') if e else False}).text.replace('new', '')
    company = post.find('span', 'companyName').text.strip()
    location = post.find('div', 'companyLocation').text.strip()
    postDate = post.find('span', 'date').text
    extractDate = datetime.today().strftime('%Y-%m-%d')
    summary = post.find('div', 'job-snippet').text.strip().replace('\n', ' ')
    url = 'https://www.indeed.com' + post.get('href')

    salary_tag = post.find('span', 'salary-snippet')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
        
    def job_description(job_url):
        response_jobDesc = requests.get(job_url)
        soup = BeautifulSoup(response_jobDesc.text, 'html.parser')
        # https://stackoverflow.com/questions/63231164/indeed-web-scraping-python-selenium-beautifulsoup
        try:
            requirements = soup.find(class_="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()
        except:
            requirements = 'None'
        try:
            description = soup.find(id="jobDescriptionText").text.replace('\n', '')
        except:
            description = 'None'
        # A nifty little workaround for evading detection.
        #time.sleep(1+random()*2)
        return requirements, description
    
    requirements, description = job_description(url)
    
    datapoint = {'JobTitle':title,'Company':company, 'Location':location, 'Salary':salary, 'ExtractDate':extractDate, 'PostDate':postDate, 'Requirements':requirements, 'Summary':summary, 'Description': description, 'JobUrl':url}
    return datapoint


def main(position, location):
    """[Conducts the web scraping process]
    Args:
        position ([string]): [job position for indeed.com query]
        position ([string]): [job location for indeed.com query]
        
        Returns:
        [csv]: [scraped data]
    """
    data = pd.DataFrame(columns = ['JobTitle', 'Company', 'Location', 'Salary', 'ExtractDate', 'PostDate', 'ExtractDate', 'Requirements','Summary', 'Description', 'JobUrl'])
    url = get_URL(position, location)

    
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        searchResults = soup.find_all('div', 'slider_container' )
        
        raw_posts = []
        for post in searchResults:
            raw_posts.append(post.parent)
            
        for post in raw_posts:
            datapoint = get_features(post)
            data = data.append(datapoint, ignore_index=True)

        # Again, a nifty little workaround for evading detection.
            time.sleep(1.5+random()*5)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break

    name = position.replace(' ','_')
    loc = location.replace(' ','_')
    day = date.today()
    # save the job data
    data.to_csv(f'../app/data/scraped_{name}_{loc}_{day}.csv', index=False)

In [4]:
l = ''
main('("data scientist" or "data science")', l  )

ConnectionError: HTTPSConnectionPool(host='us.conv.indeed.com', port=443): Max retries exceeded with url: /rc/clk?jk=c028ef32cfac462a&ctk=1fbtt7aflpi9u802&t=cr&rctype=oth&orgclktk=1fbtt7ag2pi9u800&vjs=3&wwwho=4m_xAU4HGQbAlJdZTkhlA8hEG_rFObn1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f9e6c29e3d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [1]:
# Use this to test for captcha block or IP ban
def get_URL(position,location):
    from torrequest import TorRequest
    """[Build a template url for a dummy call to verify the site isn't returning a captcha]

    Args:
        position ([string]): [job for query]
        location ([string]): [location for query]

    Returns:
        [string]: [formatted url]
    """
    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=3&sort=date'
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position,location)
    return url
from torrequest import TorRequest
tr=TorRequest(password='your_super_secure_password')
position = '("data scientist" or "data science")'
location = ''
tr.reset_identity()
response = requests.get(get_URL(position,location))
# This will either return an HTML block for a captcha or of a search result
response.text


OSError: reached a 90 second timeout without success

### Tor as needed

In [4]:
# Use this to use tor after activating it in terminal when needed.
import socks
import socket
socks.setdefaultproxy(proxy_type=socks.PROXY_TYPE_SOCKS5, addr="127.0.0.7", port=9050)
#print(tr.get("http://icanhazip.com").text)

### Concatenating Old Data With New

In [5]:
a = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")__2021-07-08.csv')
a = a.drop(columns=['ExtractDate.1'])
a = a.rename(columns={'Salary':'Pay'})

b = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")__2021-07-10.csv')
b = b.drop(columns=['ExtractDate.1'])
b = b.rename(columns={'Salary':'Pay'})

c = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")__2021-07-21.csv')
c = c.drop(columns=['ExtractDate.1'])
c = c.rename(columns={'Salary':'Pay'})

d = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_New_York_2021-07-21.csv')
d = d.drop(columns=['ExtractDate.1'])
d = d.rename(columns={'Salary':'Pay'})

e = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_California_2021-07-21.csv')
e = e.drop(columns=['ExtractDate.1'])
e = e.rename(columns={'Salary':'Pay'})

f = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Alabama_2021-07-21.csv')
f = f.drop(columns=['ExtractDate.1'])
f = f.rename(columns={'Salary':'Pay'})

g = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Alaska_2021-07-21.csv')
g = g.drop(columns=['ExtractDate.1'])
g = g.rename(columns={'Salary':'Pay'})

h = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Arizona_2021-07-21.csv')
h = h.drop(columns=['ExtractDate.1'])
h = h.rename(columns={'Salary':'Pay'})

i = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Arkansas_2021-07-22.csv')
i = i.drop(columns=['ExtractDate.1'])
i = i.rename(columns={'Salary':'Pay'})

j = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Colorado_2021-07-22.csv')
j = j.drop(columns=['ExtractDate.1'])
j = j.rename(columns={'Salary':'Pay'})

k = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Connecticut_2021-07-22.csv')
k = k.drop(columns=['ExtractDate.1'])
k = k.rename(columns={'Salary':'Pay'})

l = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Delaware_2021-07-22.csv')
l = l.drop(columns=['ExtractDate.1'])
l = l.rename(columns={'Salary':'Pay'})

m = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Florida_2021-07-22.csv')
m = m.drop(columns=['ExtractDate.1'])
m = m.rename(columns={'Salary':'Pay'})

n = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Georgia_2021-07-22.csv')
n = n.drop(columns=['ExtractDate.1'])
n = n.rename(columns={'Salary':'Pay'})

o = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Hawaii_2021-07-22.csv')
o = o.drop(columns=['ExtractDate.1'])
o = o.rename(columns={'Salary':'Pay'})

p = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Idaho_2021-07-22.csv')
p = p.drop(columns=['ExtractDate.1'])
p = p.rename(columns={'Salary':'Pay'})

q = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Illinois_2021-07-22.csv')
q = q.drop(columns=['ExtractDate.1'])
q = q.rename(columns={'Salary':'Pay'})

r = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Indiana_2021-07-23.csv')
r = r.drop(columns=['ExtractDate.1'])
r = r.rename(columns={'Salary':'Pay'})

s = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Iowa_2021-07-23.csv')
s = s.drop(columns=['ExtractDate.1'])
s = s.rename(columns={'Salary':'Pay'})

t = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Kansas_2021-07-23.csv')
t = t.drop(columns=['ExtractDate.1'])
t = t.rename(columns={'Salary':'Pay'})

u = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Kentucky_2021-07-23.csv')
u = u.drop(columns=['ExtractDate.1'])
u = u.rename(columns={'Salary':'Pay'})

w = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Louisiana_2021-07-23.csv')
w = w.drop(columns=['ExtractDate.1'])
w = w.rename(columns={'Salary':'Pay'})

v = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Maine_2021-07-23.csv')
v = v.drop(columns=['ExtractDate.1'])
v = v.rename(columns={'Salary':'Pay'})

x = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Maryland_2021-07-23.csv')
x = x.drop(columns=['ExtractDate.1'])
x = x.rename(columns={'Salary':'Pay'})

y = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Massachusetts_2021-07-23.csv')
y = y.drop(columns=['ExtractDate.1'])
y = y.rename(columns={'Salary':'Pay'})

z = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Michigan_2021-07-23.csv')
z = z.drop(columns=['ExtractDate.1'])
z = z.rename(columns={'Salary':'Pay'})


za = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Minnesota_2021-07-23.csv')
za = za.drop(columns=['ExtractDate.1'])
za = za.rename(columns={'Salary':'Pay'})

zb = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Mississippi_2021-07-23.csv')
zb = zb.drop(columns=['ExtractDate.1'])
zb = zb.rename(columns={'Salary':'Pay'})

zc = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Missouri_2021-07-23.csv')
zc = zc.drop(columns=['ExtractDate.1'])
zc = zc.rename(columns={'Salary':'Pay'})

zd = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Montana_2021-07-23.csv')
zd = zd.drop(columns=['ExtractDate.1'])
zd = zd.rename(columns={'Salary':'Pay'})

ze = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Nebraska_2021-07-23.csv')
ze = ze.drop(columns=['ExtractDate.1'])
ze = ze.rename(columns={'Salary':'Pay'})

zf = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Nevada_2021-07-23.csv')
zf = zf.drop(columns=['ExtractDate.1'])
zf = zf.rename(columns={'Salary':'Pay'})

zg = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_New_Hampshire_2021-07-23.csv')
zg = zg.drop(columns=['ExtractDate.1'])
zg = zg.rename(columns={'Salary':'Pay'})

zh = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_New_Jersey_2021-07-23.csv')
zh = zh.drop(columns=['ExtractDate.1'])
zh = zh.rename(columns={'Salary':'Pay'})

zi = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_New_Mexico_2021-07-23.csv')
zi = zi.drop(columns=['ExtractDate.1'])
zi = zi.rename(columns={'Salary':'Pay'})

zj = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_North_Carolina_2021-07-23.csv')
zj = zj.drop(columns=['ExtractDate.1'])
zj = zj.rename(columns={'Salary':'Pay'})

zk = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_North_Dakota_2021-07-23.csv')
zk = zk.drop(columns=['ExtractDate.1'])
zk = zk.rename(columns={'Salary':'Pay'})

zl = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Ohio_2021-07-23.csv')
zl = zl.drop(columns=['ExtractDate.1'])
zl = zl.rename(columns={'Salary':'Pay'})

zm = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Oklahoma_2021-07-24.csv')
zm = zm.drop(columns=['ExtractDate.1'])
zm = zm.rename(columns={'Salary':'Pay'})

zn = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Oregon_2021-07-24.csv')
zn = zn.drop(columns=['ExtractDate.1'])
zn = zn.rename(columns={'Salary':'Pay'})

zo = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Pennsylvania_2021-07-24.csv')
zo = zo.drop(columns=['ExtractDate.1'])
zo = zo.rename(columns={'Salary':'Pay'})

zp = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Rhode_Island_2021-07-24.csv')
zp = zp.drop(columns=['ExtractDate.1'])
zp = zp.rename(columns={'Salary':'Pay'})

zq = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_South_Carolina_2021-07-24.csv')
zq = zq.drop(columns=['ExtractDate.1'])
zq = zq.rename(columns={'Salary':'Pay'})

zr = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_South_Dakota_2021-07-24.csv')
zr = zr.drop(columns=['ExtractDate.1'])
zr = zr.rename(columns={'Salary':'Pay'})

zs = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Tennessee_2021-07-24.csv')
zs = zs.drop(columns=['ExtractDate.1'])
zs = zs.rename(columns={'Salary':'Pay'})

zt = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Utah_2021-07-25.csv')
zt = zt.drop(columns=['ExtractDate.1'])
zt = zt.rename(columns={'Salary':'Pay'})

zu = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Vermont_2021-07-25.csv')
zu = zu.drop(columns=['ExtractDate.1'])
zu = zu.rename(columns={'Salary':'Pay'})

zv = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Virginia_2021-07-25.csv')
zv = zv.drop(columns=['ExtractDate.1'])
zv = zv.rename(columns={'Salary':'Pay'})

zw = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Washington_2021-07-25.csv')
zw = zw.drop(columns=['ExtractDate.1'])
zw = zw.rename(columns={'Salary':'Pay'})

zx = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_West_Virginia_2021-07-25.csv')
zx = zx.drop(columns=['ExtractDate.1'])
zx = zx.rename(columns={'Salary':'Pay'})

zy = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Wisconsin_2021-07-25.csv')
zy = zy.drop(columns=['ExtractDate.1'])
zy = zy.rename(columns={'Salary':'Pay'})

zz = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Wyoming_2021-07-25.csv')
zz = zz.drop(columns=['ExtractDate.1'])
zz = zz.rename(columns={'Salary':'Pay'})

ya = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")__2021-07-25.csv')
ya = ya.drop(columns=['ExtractDate.1'])
ya = ya.rename(columns={'Salary':'Pay'})

yb = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")_Remote_2021-07-25.csv')
yb = yb.drop(columns=['ExtractDate.1'])
yb = yb.rename(columns={'Salary':'Pay'})

yc = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")__2021-07-26.csv')
yc = yc.drop(columns=['ExtractDate.1'])
yc = yc.rename(columns={'Salary':'Pay'})

yd = pd.read_csv(f'../app/data/scraped_("data_scientist"_or_"data_science")__2021-07-28.csv')
yd = yd.drop(columns=['ExtractDate.1'])
yd = yd.rename(columns={'Salary':'Pay'})



In [7]:
total = pd.read_csv(f'../app/data/total.csv')
z = pd.concat([total,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,za,zb,zc,zd,ze,zf,zg,zh,zi,zj,zk,zl,zm,zn,zo,zp,zq,zr,zs,zt,zu,zv,zw,zx,zy,zz,ya,yb,yc,yd])
z.to_csv('../app/data/total.csv', index=False)

In [8]:
len(z)

28676