
# Indeed Job Scraper - Trial Run with job description
Create a general-purpose job scraper.

In [1]:
import csv
import re
from datetime import datetime
from random import randint
from time import sleep
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

### Set up the query and URL

In [7]:
def get_url(position, location):
    """Generate the Indeed search url from position and location.

    Args:
        position: Job title or keywords to search for.
        location: Location text to search in (e.g. 'charlotte nc').

    Returns:
        The search url with both values encoded as query parameters.
    """
    # urlencode turns spaces into '+' (as before) and also escapes characters
    # such as '&', '+' and '#' that would otherwise corrupt the query string.
    query = urlencode({'q': position, 'l': location})
    return 'https://www.indeed.com/jobs?' + query

In [8]:
# Build the search url for a sample query and show it.
url = get_url('organizational development', 'charlotte nc')
print(url)

https://www.indeed.com/jobs?q=organizational+development&l=charlotte+nc


### Extract the HTML data

In [9]:
# Download the first results page. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops here on an error response
# instead of letting later cells fail on an empty card list.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [10]:
# Parse the response body into a searchable tree using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [11]:
# Select every <a> element with the "tapItem" class; each one is treated as a
# job card below. (An earlier attempt selected the
# 'mosaic-provider-jobcards' div instead.)

cards = soup.find_all('a', class_="tapItem")

In [12]:
# Dump the raw matches to inspect the card markup (the output is very long).
print(cards)

[<a class="tapItem fs-unmask result job_4b19cf5f7341eadb sponsoredJob resultWithShelf sponTapItem desktop" data-ci="385959398" data-empn="6616844022494418" data-hide-spinner="true" data-hiring-event="false" data-jk="4b19cf5f7341eadb" data-mobtk="1g0781rn3q07f800" href="/pagead/clk?mo=r&amp;ad=-6NYlbfkN0CCrxbiXcrrcnkti5OAxjJFmqdicGijc3TcbLF315CLw53pnmhbQC08b2FtdQe2zf02X0KX2zoRaBYwG7gYm9EwwaK0gWtwNuEgnvh8_0nfV9v8ckU34jn9_LcG6_6kqmpAvmNmvbg3Ugn3Oh3V5VQUsmSH894bksYhWKmOtZMUCnY6mi4T8cR7iSSFh0sdmySlT-lu4IuugDKHd1Bw9oSxmntuj6Iw0NKmM3yqTa2tzTL5VfQYMN0ENNpDZRxWnAx0l402mRWls2w38G0Dwfz_kHNltJW_WNLZQ3eXrAqLZ15Q0eP2dBCr2Bl9jOHURUcYLtBxvxZzoUYFrteIv0iS2-BMXpRWv0Q0CPx67HKQ-hawLGRm4FSq9-Ax5p0PyJDNGxEAxDlxQOApoAcdSK59SyxpEJZ_fpizKGkqT1uMeYzs3PZo2ZU2-eBZnpP8SB2rbypCUccNMexanR-3UgGUbpuymKMMsgPlcjvXgyqXhi1H6ZDeHbN_H_BlENcJIKrKM8eNZmtW-VRizyTAjcxd&amp;p=0&amp;fvj=1&amp;vjs=3" id="sj_4b19cf5f7341eadb" rel="nofollow" target="_blank"><style data-emotion="css 11g4k3a">.css-11g4k3a{box-sizing:border-box;margin:

### Prototype the model with a single record

In [13]:
# Use the first card to work out the selectors for each field.
card = cards[0]

In [14]:
# The job title is the text of the card's <h2> element.
job_title = card.find('h2').get_text()

In [15]:
# The company name is the text of the <span> with class 'companyName'.
company = card.find('span', class_='companyName').get_text()

In [16]:
# The location is the text of the <div> with class 'companyLocation'.
job_location = card.find('div', class_='companyLocation').get_text()

In [17]:
# Posting date text as shown on the card, plus today's date (YYYY-MM-DD) to
# record when the data was extracted.
post_date = card.find('span', 'date').text
today = datetime.today().strftime('%Y-%m-%d')

In [19]:
# Short snippet shown on the card, with surrounding whitespace removed.
summary = card.find('div', class_='job-snippet').get_text().strip()

In [20]:
# Not every job lists a salary, so fall back to an empty string when the tag
# is absent.
salary_tag = card.find('span', 'salaryText')
salary = salary_tag.text.strip() if salary_tag else ''

In [21]:
# The card's href is site-relative, so prefix it with the site root.
job_url = 'https://www.indeed.com' + card['href']

#### Getting the job description

In [38]:
# regex pattern to identify job keys: matches jk:'<key>' in the page source and
# captures the alphanumeric key as group 1
jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")

In [39]:
# request parameters
# Values are given as plain text with spaces: requests url-encodes them itself,
# so a pre-encoded '+' would be sent as a literal plus sign (%2B).
params = { "q": "organizational development", "l": "charlotte nc", "start": 0 }
url = "https://www.indeed.com/jobs"
job_keys = set()

In [40]:
# get the first 10 pages as a test
for page in range(10):
    # Derive each page's offset from the loop index instead of mutating `params`,
    # so re-running this cell starts again from the first page.
    # NOTE(review): the step of 20 is kept from the original; confirm it matches
    # the number of results per page.
    page_params = {**params, "start": params["start"] + 20 * page}
    response = requests.get(url, params=page_params, timeout=30)
    if response.status_code != 200:
        # stop at the first page that does not load successfully
        break

    # the set discards keys that were already seen on earlier pages
    job_keys.update(jk_pattern.findall(response.text))

    # always pause at least a second between requests
    sleep(randint(1, 3))

In [41]:
len(job_keys)

150

In [42]:
# job url template; the {} placeholder is filled with a job key
template = "https://www.indeed.com/viewjob?jk={}"

In [43]:
# raw html from the job page; it is parsed with BeautifulSoup afterwards.
# The job key is read from the prototype card's own data-jk attribute so that the
# description belongs to the same posting as the other fields. Popping an
# arbitrary key from job_keys would pair the record with an unrelated job and
# shrink the set on every re-run.
jk = card['data-jk']
jd_url = template.format(jk)
jd_response = requests.get(jd_url, timeout=30)

In [44]:
# Parse the job page so the description element can be located.
jd_soup = BeautifulSoup(jd_response.text, 'html.parser')

In [45]:
# The description container may be absent (for example on an error page), so
# fall back to an empty string instead of raising AttributeError on None.
description_tag = jd_soup.find("div", id="jobDescriptionText")
job_description = description_tag.text if description_tag is not None else ''

In [46]:
# Bundle the prototype fields into a single record tuple.
record = (job_title, company, job_location, post_date, today, summary, salary, job_url,job_description)

In [47]:
record

('Summer Tutoring Program Coordinator, In-Person',
 'University Instructors, LLC',
 'Charlotte, NC 28202 (Downtown Charlotte area)',
 'EmployerActive 1 day ago',
 '2022-04-09',
 'Good time management and organizational skills, including prioritizing, scheduling, and adapting as necessary.\nObserve tutors during small group instruction.',
 '',
 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0CCrxbiXcrrcnkti5OAxjJFmqdicGijc3TcbLF315CLw53pnmhbQC08b2FtdQe2zf02X0KX2zoRaBYwG7gYm9EwwaK0gWtwNuEgnvh8_0nfV9v8ckU34jn9_LcG6_6kqmpAvmNmvbg3Ugn3Oh3V5VQUsmSH894bksYhWKmOtZMUCnY6mi4T8cR7iSSFh0sdmySlT-lu4IuugDKHd1Bw9oSxmntuj6Iw0NKmM3yqTa2tzTL5VfQYMN0ENNpDZRxWnAx0l402mRWls2w38G0Dwfz_kHNltJW_WNLZQ3eXrAqLZ15Q0eP2dBCr2Bl9jOHURUcYLtBxvxZzoUYFrteIv0iS2-BMXpRWv0Q0CPx67HKQ-hawLGRm4FSq9-Ax5p0PyJDNGxEAxDlxQOApoAcdSK59SyxpEJZ_fpizKGkqT1uMeYzs3PZo2ZU2-eBZnpP8SB2rbypCUccNMexanR-3UgGUbpuymKMMsgPlcjvXgyqXhi1H6ZDeHbN_H_BlENcJIKrKM8eNZmtW-VRizyTAjcxd&p=0&fvj=1&vjs=3',
 "\nUniversity Instructors (UI) is seeking a dyn

### Generalize the model with a function

In [48]:
def get_record(card, fetch_description=None):
    """Extract job data from a single record.

    Args:
        card: The parsed job card element (an ``<a class="tapItem">`` tag).
        fetch_description: Optional callable that takes the job url and returns
            the job description text. By default the job page is downloaded
            with ``requests`` and the text of its ``jobDescriptionText`` div is
            used.

    Returns:
        A tuple of (job_title, company, job_location, post_date, today,
        summary, salary, job_url, job_description). Fields whose element is
        missing from the card are returned as empty strings.
    """

    def text_of(tag):
        # a missing element yields '' instead of raising AttributeError on None
        return tag.get_text() if tag is not None else ''

    def download_description(job_url):
        # fetch this card's own job page rather than reusing a page that was
        # downloaded for a different job
        response = requests.get(job_url, timeout=30)
        page = BeautifulSoup(response.text, 'html.parser')
        return text_of(page.find("div", id="jobDescriptionText"))

    if fetch_description is None:
        fetch_description = download_description

    job_title = text_of(card.find('h2'))
    company = text_of(card.find('span', class_='companyName'))
    job_location = text_of(card.find('div', class_='companyLocation'))
    post_date = text_of(card.find('span', 'date'))
    today = datetime.today().strftime('%Y-%m-%d')
    summary = text_of(card.find('div', class_='job-snippet')).strip()
    # the salary does not exist for all jobs
    salary = text_of(card.find('span', 'salaryText')).strip()
    job_url = 'https://www.indeed.com' + card['href']
    job_description = fetch_description(job_url)

    record = (job_title, company, job_location, post_date, today, summary, salary, job_url, job_description)
    return record

In [49]:
# Collect one record per card found on the current results page.
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

### Get the next page

In [50]:
# Follow the "Next" link from page to page until there is none left.
while True:
    next_link = soup.find('a', {'aria-label': 'Next'})
    if next_link is None:
        # no "Next" link: the current page is the last one
        break

    url = 'https://www.indeed.com' + next_link.get('href')
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # stop on an error response instead of parsing it as a results page
        break
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('a', class_="tapItem")

    for card in cards:
        record = get_record(card)
        records.append(record)

### Putting it all together

In [51]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


def get_url(position, location):
    """Generate the Indeed search url from position and location.

    Args:
        position: Job title or keywords to search for.
        location: Location text to search in (e.g. 'charlotte nc').

    Returns:
        The search url with both values encoded as query parameters.
    """
    # urlencode turns spaces into '+' (as before) and also escapes characters
    # such as '&', '+' and '#' that would otherwise corrupt the query string.
    query = urlencode({'q': position, 'l': location})
    return 'https://www.indeed.com/jobs?' + query


def get_record(card, fetch_description=None):
    """Extract job data from a single record.

    Args:
        card: The parsed job card element (an ``<a class="tapItem">`` tag).
        fetch_description: Optional callable that takes the job url and returns
            the job description text. By default the job page is downloaded
            with ``requests`` and the text of its ``jobDescriptionText`` div is
            used.

    Returns:
        A tuple of (job_title, company, job_location, post_date, today,
        summary, salary, job_url, job_description), i.e. one value per column
        of the CSV header written by ``main``. Fields whose element is missing
        from the card are returned as empty strings.
    """

    def text_of(tag):
        # a missing element yields '' instead of raising AttributeError on None
        return tag.get_text() if tag is not None else ''

    def download_description(job_url):
        # fetch this card's own job page; no module-level soup object is needed
        response = requests.get(job_url, timeout=30)
        page = BeautifulSoup(response.text, 'html.parser')
        return text_of(page.find("div", id="jobDescriptionText"))

    if fetch_description is None:
        fetch_description = download_description

    job_title = text_of(card.find('h2'))
    company = text_of(card.find('span', class_='companyName'))
    job_location = text_of(card.find('div', class_='companyLocation'))
    post_date = text_of(card.find('span', 'date'))
    today = datetime.today().strftime('%Y-%m-%d')
    summary = text_of(card.find('div', class_='job-snippet')).strip()
    # the salary does not exist for all jobs
    salary = text_of(card.find('span', 'salaryText')).strip()
    job_url = 'https://www.indeed.com' + card['href']
    job_description = fetch_description(job_url)

    # the description is part of the record so each row has all nine columns
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url, job_description)
    return record


def main(position, location, output_path='results_2.csv'):
    """Run the main program routine.

    Scrapes every results page for the search and writes the jobs to a CSV
    file.

    Args:
        position: Job title or keywords to search for.
        location: Location text to search in.
        output_path: Destination CSV file; defaults to 'results_2.csv'.

    Pagination follows the page's "Next" link and stops when there is no such
    link, when a page does not return HTTP 200, or when the link leads back to
    a page that was already visited. Whatever was collected up to that point is
    still written to the file.
    """
    records = []
    url = get_url(position, location)
    visited = set()

    # extract the job data
    while url not in visited:
        # remembering visited urls prevents an endless loop if "Next" ever
        # points back to an earlier page
        visited.add(url)

        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        for card in soup.find_all('a', class_="tapItem"):
            records.append(get_record(card))

        next_link = soup.find('a', {'aria-label': 'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            # no usable "Next" link: this was the last page
            break
        url = 'https://www.indeed.com' + next_href

    # save the job data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl','JobDescription'])
        writer.writerows(records)

In [52]:
# run the main program; the scraped jobs are written to results_2.csv
main('senior accountant', 'charlotte nc')