In [49]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [50]:
template = "https://www.indeed.com/jobs?q={}&l={}"

In [51]:
def get_url(position, location):
    template = "https://www.indeed.com/jobs?q={}&l={}"
    url = template.format(position, location)
    return url

In [52]:
url = get_url('senior accountant', 'charlotte nc')

In [53]:
url

'https://www.indeed.com/jobs?q=senior accountant&l=charlotte nc'

## Extract raw html

In [54]:
response = requests.get(url)

In [55]:
response

<Response [200]>

In [56]:
response.reason

''

In [57]:
soup = BeautifulSoup(response.text, 'html.parser')

In [58]:
cards = soup.find_all('div', 'job_seen_beacon')

In [59]:
len(cards)

15

## Prototype the model with a single record

In [60]:
card = cards[0]

In [61]:
atag = card.h2.a.span

In [62]:
job_title = atag.get('title')

In [63]:
job_title

'Senior Accountant'

In [64]:
 link = card.h2.a

In [65]:
job_url = 'https://www.indeed.com' + link.get('href')

In [66]:
company=card.find('span', 'companyName').text.strip()

In [67]:
company

'Next Glass, Inc.'

In [70]:
job_location = card.find('div', 'companyLocation').text

In [71]:
job_location

'Remote in Charlotte, NC 28208'

In [78]:
job_summary = card.find('div', 'job-snippet').text.strip().replace('\n',' ')

In [79]:
job_summary

'Prepare and/or review workpapers and schedules to ensure audit readiness. With recent Private Equity investment and five successful acquisitions, Next Glass is…'

In [84]:
post_date = card.find('span', 'date').text[6:]

In [124]:
today = datetime.today().strftime('%Y-%m-%d')

In [125]:
today

'2022-06-25'

In [91]:
try:
    job_salary = card.find('span', 'estimated-salary').text[10:]
except AttributeError:
    job_salary = ''

In [93]:
job_salary

'$78.7K - $99.7K a year'

## Generalize the model with a function

In [96]:
def get_record(card):
    """Extract job data from a single record"""
    atag = card.h2.a.span
    job_title = atag.get('title')
    job_url = 'https://www.indeed.com' + link.get('href')
    company=card.find('span', 'companyName').text.strip()
    job_location = card.find('div', 'companyLocation').text
    job_summary = card.find('div', 'job-snippet').text.strip().replace('\n',' ')
    post_date = card.find('span', 'date').text[6:]
    today = datetime.today().strftime('%Y-%m-%d')
    try:
        job_salary = card.find('span', 'estimated-salary').text[10:]
    except AttributeError:
        job_salary = ''
    record = (job_title, company, job_location, post_date, today, job_summary, job_salary, job_url)
    return record  

In [102]:
records = []
for card in cards:
    record = get_record(card)
    records.append(record)

In [103]:
records[0]

('Senior Accountant',
 'Next Glass, Inc.',
 'Remote in Charlotte, NC 28208',
 '18 days ago',
 '2022-06-25',
 'Prepare and/or review workpapers and schedules to ensure audit readiness. With recent Private Equity investment and five successful acquisitions, Next Glass is…',
 '$78.7K - $99.7K a year',
 'https://www.indeed.com/rc/clk?jk=566afbc2856bde69&fccid=300be14ffee74507&vjs=3')

## Getting the next page

In [116]:
while True:
    try:
       url = 'http://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
    except AttributeError:
        break
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', 'job_seen_beacon')
    
    for card in cards:
        record = get_record(card)
        records.append(record)

In [117]:
len(records)

75

## Putting it all together

In [128]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

def get_url(position, location):
    template = "https://www.indeed.com/jobs?q={}&l={}"
    url = template.format(position, location)
    return url

def get_record(card):
    """Extract job data from a single record"""
    atag = card.h2.a.span
    job_title = atag.get('title')
    job_url = 'https://www.indeed.com' + link.get('href')
    company=card.find('span', 'companyName').text.strip()
    job_location = card.find('div', 'companyLocation').text
    job_summary = card.find('div', 'job-snippet').text.strip().replace('\n',' ')
    post_date = card.find('span', 'date').text[6:]
    today = datetime.today().strftime('%Y-%m-%d')
    print(today)
    try:
        job_salary = card.find('span', 'estimated-salary').text[10:]
    except AttributeError:
        job_salary = ''
    record = (job_title, company, job_location, post_date, today, job_summary, job_salary, job_url)
    return record  

def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    
    # Extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'job_seen_beacon')

        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
           url = 'http://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
            
    # Save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)

In [129]:
# run the main program
main('senior accountant', 'charlotte, nc')

2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
2022-06-25
