In [223]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

# Exploring indeed.com page elements

In [192]:
def get_url(position, location):
  """Generate a url  from position and location """
  template = "https://www.indeed.com/jobs?q={}&l={}"
  url = template.format(position, location)
  return url

In [193]:
url = get_url('data scientist', 'bentonville ar')

In [194]:
response = requests.get(url)

In [195]:
response

<Response [200]>

In [196]:
response.reason

''

In [197]:
soup = BeautifulSoup(response.text, 'html.parser')

In [198]:
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

In [199]:
len(cards)

15

In [200]:
card = cards[0]

In [201]:
atag = card.h2.a

In [202]:
job_title = atag.get('title')

In [203]:
job_url = 'https://www.indeed.com' + atag.get('href')

In [204]:
company = card.find('span', 'company').text.strip()

In [205]:
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')

In [206]:
summary = card.find('div', 'summary').text.strip()

In [207]:
date = card.find('span', 'date').text

In [208]:
today = datetime.today().strftime('%Y-%m-%d')

In [209]:
try: 
  card.find('span', 'salarText').text.strip()
except AttributeError:
  job_salary = '' 

# Generalizing the model

In [210]:
def get_record(card):
  atag = card.h2.a
  job_title = atag.get('title')
  job_url = 'https://www.indeed.com' + atag.get('href')
  company = card.find('span', 'company').text.strip()
  job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
  job_summary = card.find('div', 'summary').text.strip()
  post_date = card.find('span', 'date').text
  today = datetime.today().strftime('%Y-%m-%d')
  try: 
    card.find('span', 'salarText').text.strip()
  except AttributeError:
    job_salary = '' 
  
  record = {job_title, company, job_location, post_date, today, job_summary, job_salary, job_url}

  return record

In [211]:
records = []

for card in cards: 
  record = get_record(card)
  records.append(record)

In [212]:
records[0]

{'',
 '2021-03-26',
 '30+ days ago',
 'A successful candidate will manage data science projects including scoping, designing, and executing the vision with the help of more junior data scientists /…',
 'Lead Data Scientist',
 'TRANZACT',
 'United States',
 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0DmvEs89GDjygIsDT0FtjWiil-qnA5TS0Npqc8I6T5HHPj1qJcB6ts2RC9U8mqwlH31VRyGgFTjuk6MDSQy9dlxHVsLO1FyLeaovTNDU0XCRmuZoptVyQ-71WeG21w73lkklPyJrRMYwYxr_-SpOcX4VcXq5o9hFdkEJ6YMkrQ-SFzGNTaQyCYYYPfdK6oMPeezYlV4gc8rXJAmU2-2RXBUFuc8ZuyTblvBUnhWwM92fTiyRe8-tYKNKVryJzixYYCvIFPCD9n55YgD0T8WAaZLjFxPGDchk_ju8RA2YX1QwdAl7k-Hr8zrzaVqoJfwjZbju5u65fojTr-cNoLsygZWsnPCUdhcw3R1yMNp6isnyHb-DH-hrp_dNKDD4PhQq2-lN14O0RoCNq2811yThejlJ5D4Jryqk1nemMfvfvS1_3IR8aVecl1xn40KWM440BMTZyHjQLppJ-MH8JCuREJB&p=0&fvj=1&vjs=3'}

# Getting every page

In [213]:
while True: 
  try: 
    url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
  except AttributeError:
    break

  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')
  cards = soup.find_all('div', 'jobsearch-SerpJobCard')

  for card in cards:
    record = get_record(card)
    records.append(record)

# Putting it all together

In [214]:
def get_url(position, location):
    """Generate url from position and location"""
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url


def get_record(card):
    """Extract job data from a single record"""
    
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')

    # this does not exists for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''  
        
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record


def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
        
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)

In [219]:
main('data scientist', 'bentonville, ar')

In [220]:
import pandas as pd

In [221]:
df = pd.read_csv('/content/results.csv')

In [222]:
df.head(1000)

Unnamed: 0,JobTitle,Company,Location,PostDate,ExtractDate,Summary,Salary,JobUrl
0,Lead Data Scientist,TRANZACT,United States,30+ days ago,2021-03-26,A successful candidate will manage data scienc...,"$150,000 - $165,000 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,Senior Data Scientist,"Tyson Foods, Inc.","Springdale, AR",8 days ago,2021-03-26,We are seeking experienced members to join the...,,https://www.indeed.com/rc/clk?jk=a1ceb152e0e8a...
2,Data Scientist II,JB Hunt,"Lowell, AR",30+ days ago,2021-03-26,Assists in analytics project scoping discussio...,,https://www.indeed.com/rc/clk?jk=ddace5a665cbc...
3,Data Scientist,Walmart,"Bentonville, AR",12 days ago,2021-03-26,"Data science, machine learning, optimization m...",,https://www.indeed.com/rc/clk?jk=106b31ad1f6f0...
4,Division Data Science Analyst,Arvest,"Bentonville, AR",8 days ago,2021-03-26,"Utilizes data discovery, diagnostic, predicati...",,https://www.indeed.com/rc/clk?jk=9c4d4e7884bf3...
5,Senior Data Scientist,REPL Consulting Inc,"Bentonville, AR",30+ days ago,2021-03-26,Strong related academic background i.e. Master...,,https://www.indeed.com/rc/clk?jk=52bc7adf99bc4...
6,Senior Data Scientist- Sam's Club,Sam's Club,"Bentonville, AR",2 days ago,2021-03-26,You will use advanced analytics techniques to ...,,https://www.indeed.com/rc/clk?jk=fd56b421f1df7...
7,Senior Data Scientist - Operations Research,Walmart,"Bentonville, AR",17 days ago,2021-03-26,"Option 1- Bachelor’s degree in Statistics, Eco...",,https://www.indeed.com/rc/clk?jk=15da81d3a3c26...
8,Data Scientist - Nationwide Opportunities,"Amazon Web Services, Inc.","Fayetteville, AR",30+ days ago,2021-03-26,Experience in an ML engineer or data scientist...,,https://www.indeed.com/rc/clk?jk=b60071419fecb...
9,Solution Architect - Data & Analytics,Deloitte,"Bentonville, AR",30+ days ago,2021-03-26,Strong understanding of data modeling and data...,,https://www.indeed.com/rc/clk?jk=93c35bf5141a9...
