# Indeed.com scraping

In [None]:
# import packages
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
from selenium import webdriver
import time

In [None]:
# Template for an Indeed (NL) search URL: the two placeholders are the query
# and the location; the trailing 'start=' is left without a value.
base_url = 'https://nl.indeed.com/jobs?q={}&l={}&start='


def get_url(position, location):
    """Generate a search url from position and location.

    Both values are URL-encoded before being inserted into the query string,
    so free text containing spaces or reserved characters (e.g. '&') yields a
    valid URL. Pass raw, un-encoded text.

    Args:
        position: job title / search terms, e.g. 'data analist'.
        location: location to search in, e.g. 'Nederland'.

    Returns:
        The search URL as a string, ending in 'start='.
    """
    from urllib.parse import quote_plus

    return base_url.format(quote_plus(position), quote_plus(location))

# choose job position and location; `url` is the search URL that is loaded
# in the browser further down
url = get_url('data analist', 'Nederland')

# create driver object: starts a Chrome session controlled by Selenium
driver = webdriver.Chrome()
# pause 10 seconds before continuing (presumably to let the browser start up)
time.sleep(10)

In [None]:
# access webpage: load the search URL in the Selenium-driven browser
driver.get(url)
# pause 5 seconds before going on (presumably to let the page finish loading)
time.sleep(5)

In [None]:
# accept cookies, if applicable
try:
    # 'id' is the locator-strategy string behind selenium's By.ID. The generic
    # find_element(by, value) form is used because the find_element_by_*
    # shortcuts were removed in Selenium 4.3.
    driver.find_element('id', 'onetrust-accept-btn-handler').click()
    time.sleep(2)
except Exception:
    # Best effort: a missing or unclickable banner must not stop the scrape.
    # `Exception` (instead of a bare except) keeps Ctrl-C / SystemExit working.
    print('probably accepted the cookie already!')


# parse the HTML currently shown in the browser
res = driver.page_source.encode('utf-8')
soup = BeautifulSoup(res, "html.parser")

# store job card links in list ('class name' is the string behind By.CLASS_NAME)
jobs = driver.find_elements('class name', 'jobsearch-SerpJobCard')

In [None]:
# collect the <div> elements with class 'jobsearch-SerpJobCard' from the parsed page
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

def _optional_text(card, tag, css_class, strip=True):
    """Return the text of the first matching element in card, or '' if absent."""
    element = card.find(tag, css_class)
    if element is None:
        return ''
    return element.text.strip() if strip else element.text


def get_record(card):
    """Extract job data from a single job card.

    Args:
        card: parsed element for one search-result card. It must support
            ``find(tag, css_class)`` and expose the title link as ``h2.a``.

    Returns:
        dict with the keys 'id', 'title', 'company', 'location',
        'postingdate', 'today', 'summary', 'salary' and 'url'. The optional
        fields company, postingdate, summary and salary are '' when the card
        has no matching element.

    Raises:
        AttributeError: if the card lacks the 'recJobLoc' div or the
            ``h2 > a`` title link, which are needed for id, location and url.
    """
    # Look the location div up once; it carries both the id and the location.
    loc_div = card.find('div', 'recJobLoc')
    # Keep the second '_'-separated piece of the element id as the job id.
    job_id = loc_div.get('id').split('_')[1]
    atag = card.h2.a

    return {
        'id': job_id,
        'title': atag.get('title'),
        'company': _optional_text(card, 'span', 'company'),
        'location': loc_div.get('data-rc-loc'),
        # The posting date text is stored as found, without stripping.
        'postingdate': _optional_text(card, 'span', 'date', strip=False),
        'today': datetime.today().strftime('%Y-%m-%d'),
        'summary': _optional_text(card, 'div', 'summary'),
        'salary': _optional_text(card, 'span', 'salaryText'),
        'url': 'https://nl.indeed.com' + atag.get('href'),
    }

# Create records list; walk through the result pages, extract the record data
# from each card and append it to the records list
records = []

while True:
    # Process the page currently held in `soup` BEFORE following the "next"
    # link, so that the first results page is scraped as well.
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    records.extend(get_record(card) for card in cards)

    # The pagination link is labelled 'Volgende'; stop when the page has none.
    next_link = soup.find('a', {'aria-label': 'Volgende'})
    if next_link is None:
        break

    url = 'https://nl.indeed.com' + next_link.get('href')
    try:
        # A timeout keeps a stalled connection from hanging the scrape forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Keep what has been collected so far instead of losing it to a crash.
        print('stopped paging, request failed: {}'.format(exc))
        break
    soup = BeautifulSoup(response.text, 'html.parser')

# create data frame
if records:
    cols = list(records[0].keys())
else:
    # Nothing scraped: fall back to the keys get_record() produces so the
    # frame (and the csv) still has the expected columns.
    cols = ['id', 'title', 'company', 'location', 'postingdate', 'today',
            'summary', 'salary', 'url']

# Build the frame in one go; DataFrame.append in a loop copies the frame on
# every iteration and no longer exists in pandas 2.x.
postings_df = pd.DataFrame(records, columns=cols)

# export data frame to csv
postings_df.to_csv('postings.csv')

In [None]:
# show how many job records are in the records list
len(records)

In [None]:
# use Selenium to collect the job descriptions from each job page

descriptions = []
scrapetimesdescriptions = []

# set a lower and upper bound for scraping descriptions, based on len(records),
# as the page will not load anymore at some point due to Captcha

start = 0
end = 100
jobids2 = postings_df['id'][start:end]

for jobid in jobids2:
    driver.get('https://nl.indeed.com/vacature-bekijken?jk=' + jobid)
    # record when this job page was requested
    scrapetimesdescriptions.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    time.sleep(3)
    try:
        # 'id' is the locator-strategy string behind selenium's By.ID; the
        # find_element_by_* shortcuts were removed in Selenium 4.3.
        description = driver.find_element('id', 'jobDescriptionText').text.replace('\n', ' ')
    except Exception:
        # Best effort: keep an empty description when the element cannot be
        # read. `Exception` (not a bare except) keeps Ctrl-C working.
        description = ''
    descriptions.append(description)

# create data frame, indexed by job id (rebinds `descriptions` from list to frame)
descriptions = pd.DataFrame(
    index=jobids2,
    data={'description': descriptions,
          'scrapetimedescription': scrapetimesdescriptions},
)

# export data frame to csv; the file name carries the scraped id range
descriptions.to_csv('descriptions_{}_{}.csv'.format(start, end))