In [1]:
import os
from urllib.parse import quote_plus

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def generate_link(job, location):
    """
    Build the Indeed NL search URL for a job title and a location.

    Both arguments are lower-cased. The job title is split on whitespace and
    re-joined so that titles of any length (one word, two words, or more) are
    supported, and both components are URL-encoded so that spaces and
    characters such as '&' cannot corrupt the query string.

    job      -- job title to search for, e.g. 'data steward'
    location -- place to search in, e.g. 'amsterdam'

    Returns the search URL as a string.
    """
    # normalise internal whitespace first, then encode (spaces become '+')
    query = quote_plus(' '.join(job.lower().split()))
    place = quote_plus(location.lower())
    return f'https://nl.indeed.com/jobs?q={query}&l={place}'

In [3]:
def generate_soup(url, page, timeout=30):
    """
    Download one page and parse it with BeautifulSoup's html.parser.

    url     -- URL that already carries a query string; '&start=<page>' is
               appended to it
    page    -- value for the 'start' query parameter (result offset)
    timeout -- seconds to wait for the server before requests raises

    Returns the BeautifulSoup tree built from the response body.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
    page_url = f'{url}&start={page}'
    # headers must be passed by keyword: the second positional argument of
    # requests.get is `params`, which would put the User-Agent into the query
    # string instead of sending it as a request header
    r = requests.get(page_url, headers=headers, timeout=timeout)
    return BeautifulSoup(r.content, "html.parser")

In [4]:
def extract_descriptions(link):
    """
    Fetch the page behind link and return its job-description element.

    The page is loaded through generate_soup with a start value of 0. The
    result is whatever soup.find yields for the description div, i.e. the
    matching element, or None when the page has no such div.
    """
    description_classes = 'jobsearch-JobComponent-description icl-u-xs-mt--md'
    vacancy_page = generate_soup(link, 0)
    return vacancy_page.find('div', class_=description_classes)

In [5]:
def transform(soup, joblist):
    """
    appends information on new vacancies from given webpage (soup) to given joblist

    Every 'jobsearch-SerpJobCard' div on the page becomes one dict with the
    keys 'id', 'title', 'company', 'link' and 'description'.

    soup    -- parsed search-result page
    joblist -- list that the vacancy dicts are appended to (modified in place)

    Cards without a link are skipped. A card without a company element gets
    company=None, and a description that cannot be fetched becomes None.

    Returns the same joblist object that was passed in.
    """
    base = 'https://nl.indeed.com'
    divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
    # loop over vacancies on webpage
    for item in divs:
        # look the anchor up once; it supplies both the title and the link
        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            # without a link there is nothing to fetch or store for this card
            continue
        title = anchor.text.strip()
        company_tag = item.find('span', class_ = 'company')
        company = company_tag.text.strip() if company_tag is not None else None
        link = base + href
        try:
            description = extract_descriptions(link)
        except Exception:
            # best effort: a failing detail page must not abort the scrape,
            # but KeyboardInterrupt / SystemExit still propagate
            description = None
        joblist.append({
            # placeholder, the real id is assigned later
            'id': '_',
            'title': title,
            'company': company,
            'link': link,
            'description': description
        })
    return joblist

In [6]:
def job_to_joblist(job, location, n_pages):
    """
    Scrape the first n_pages search-result pages for a job/location query.

    job      -- job title to search for
    location -- place to search in
    n_pages  -- number of result pages to fetch

    Returns a tuple (joblist, job): the list of vacancy dicts filled by
    transform, and the job title exactly as it was passed in.
    """
    joblist = []
    link = generate_link(job, location)
    # result pages are addressed by a start offset in steps of 10, so page k
    # sits at offset (k - 1) * 10; the range must therefore run up to
    # n_pages * 10 (exclusive) for the last requested page to be included
    for offset in range(0, n_pages * 10, 10):
        page_soup = generate_soup(link, offset)
        transform(page_soup, joblist)
    return joblist, job

In [7]:
def add_id(df, job):
    """
    Fill the 'id' column of df with '<prefix>_<row position>' strings.

    The prefix is chosen from the first three letters of the second word of
    job, compared case-insensitively: 'ste' gives 'DSP', 'sci' gives 'DS',
    anything else (including a one-word job) gives 'DOC'.

    df  -- DataFrame to label; the column is written in place
    job -- job title the DataFrame was scraped for

    Returns the same DataFrame object.
    """
    words = job.lower().split()
    # a one-word job has no second word; fall through to the default prefix
    key = words[1][:3] if len(words) > 1 else ''
    # assign prefix based on job
    if key == 'ste':
        prefix = 'DSP'
    elif key == 'sci':
        prefix = 'DS'
    else:
        prefix = 'DOC'
    df['id'] = [f'{prefix}_{num}' for num in range(len(df))]
    return df

In [8]:
def joblist_to_dataframe(joblist, job):
    """
    Turn the scraped joblist into a cleaned DataFrame and save it as CSV.

    joblist -- list of vacancy dicts with at least 'title' and 'company' keys
    job     -- job title; used for the id prefix and for the file name

    Steps: drop vacancies that repeat an earlier (title, company) pair, drop
    rows containing a missing value, renumber the index, assign ids through
    add_id, and write 'dataframe_<job>.csv' into the current working directory.

    Returns the cleaned DataFrame.
    """
    df = (
        pd.DataFrame(joblist)
        # remove duplicate vacancies
        .drop_duplicates(subset=['title', 'company'], keep='first')
        # remove rows with nan value
        .dropna()
        .reset_index(drop=True)
    )
    # add id to the dataframe
    df = add_id(df, job)
    # store the dataframe locally; os.path.join picks the right separator for
    # the platform instead of a hard-coded backslash
    out_path = os.path.join(os.path.abspath(os.getcwd()), f'dataframe_{job}.csv')
    df.to_csv(out_path, index=False, header=True)
    return df

In [9]:
# Scrape vacancies for 'data steward' in Amsterdam; 3 is the n_pages argument.
joblist, job = job_to_joblist('data steward', 'amsterdam', 3)

In [10]:
# Deduplicate, drop incomplete rows, assign ids and save the result as CSV.
df = joblist_to_dataframe(joblist, job)

In [11]:
# Display the cleaned DataFrame.
df

Unnamed: 0,id,title,company,link,description
0,DSP_0,Master Data Steward/Analyst,Amazing Oriental,https://nl.indeed.com/rc/clk?jk=8c43b4b47a79b9...,"[[], [], [[], [<p><b><i>Your Responsibilities:..."
1,DSP_1,HR Officer (32-36),Briddge Legal & Finance,https://nl.indeed.com/rc/clk?jk=80aa0ba17a4848...,"[[], [], [[], [<div><div><p>The Renewal Worksh..."
2,DSP_2,Assistant Professor in Data Management Methodo...,Universiteit van Amsterdam,https://nl.indeed.com/rc/clk?jk=c2a04b6ad75ab4...,"[[], [], [[<div><div><div>Vacancy details of A..."
3,DSP_3,Bi Specialist,EVBox,https://nl.indeed.com/rc/clk?jk=0c572fb57d2630...,"[[], [], [We're looking for a BI Specialist to..."
4,DSP_4,Data Engineer,Amazing Oriental,https://nl.indeed.com/rc/clk?jk=e295a29af2e4f6...,"[[], [], [[], [<div><p>As a Data Engineer, you..."
5,DSP_5,Product Owner Data Management Mortgages,ING,https://nl.indeed.com/rc/clk?jk=deca9994e792aa...,"[[], [], [[], [<div><p><b>The Unite Tribe Data..."
6,DSP_6,Head of Country Marketing - EMEA,Invesco,https://nl.indeed.com/rc/clk?jk=520067950c9eec...,"[[], [], [[<p>As one of the world’s leading as..."
7,DSP_7,Data Consultant,Oliver James Associates,https://nl.indeed.com/rc/clk?jk=0aa484f4dfdc81...,"[[], [], [[<p>For an well respected client of ..."
8,DSP_8,Tech Lead Retail Data,albert-heijn,https://nl.indeed.com/rc/clk?jk=85c82ad8176310...,"[[], [], [[<div><h2 class=""jobSectionHeader""><..."
9,DSP_9,Data Engineer,Carrierecafe,https://nl.indeed.com/rc/clk?jk=f70cb6470c9974...,"[[], [], [[<p>Zonder de Data Engineer geen wer..."
