In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
from tqdm import tqdm

import pandas as pd
import numpy as np
import re
import glob

from time import time, sleep
from datetime import datetime
from random import randrange

### Exploring Page

This is where the relevant information is stored. The html id's can be found on the page using Inspect

In [2]:
URL = "https://www.indeed.co.uk/jobs?q=Data+Scientist&l=United+Kingdom&sort=date&start=0"
r = requests.get(URL).text
soup = BeautifulSoup(r, 'html.parser')

In [3]:
# find pages to use later
soup.find_all('div', attrs={'id':'searchCountPages'})

[<div id="searchCountPages">
                     Page 1 of 1,264 jobs</div>]

In [4]:
# this is each card
soup.find_all('div', attrs={'class':'jobsearch-SerpJobCard unifiedRow row result'})[0].text[:5]

'\n\n\nDa'

In [5]:
# this is each job title
soup.find_all('h2', attrs={'class':'title'})[0].text

'\n\nData Scientist\nnew'

In [6]:
# this is the company
soup.find_all('span', attrs={'class':'company'})[0].text

'\nCKM Analytix'

In [7]:
# this is the location
soup.find_all('span', attrs={'class':'location accessible-contrast-color-location'})[0].text

'London'

In [None]:
# this is also the location - some records do not have this
soup.find_all('div', attrs={'class':'location accessible-contrast-color-location'})[0].text

In [9]:
# this is the salary
soup.find_all('span', attrs={'class':'salaryText'})[0].text

'\n£35,000 - £37,000 a year'

In [10]:
# this is the full job url
soup.find_all('a', attrs={'target':'_blank'},href=True)[0]['href']

'/rc/clk?jk=68ad3a99f6005928&fccid=dd616958bd9ddc12&vjs=3'

In [11]:
# these are the first bullet points
soup.find_all('ul', attrs={'style':'list-style-type:circle;margin-top: 0px;margin-bottom: 0px;padding-left:20px;'})[0].text

'\nPassion for quantitative problem solving and developing data driven solutions to difficult business questions.\n'

### Scraping Function

This function loops through all of the pages on Indeed. The total number of pages is also scraped to ensure duplicates aren't created. Wait times are included to prevent the website from blocking incoming requests.

In [12]:
def scrape_job_description(url,jobs):
    
    title = []
    company = []
    location = []
    location2 = []
    salary = []
    summary = []
    urls = []

    for job in tqdm(jobs):
        url2 = url.format(job,0)
        r = requests.get(url2)
        soup = BeautifulSoup(r.text, 'html.parser')
        total = int(re.findall(r'\d*[^A-z\s]',soup.find('div', attrs={'id':'searchCountPages'}).text.replace(',',''))[1])

        for i in range(0,total,50):

            start = time()
            while time()-start<randrange(4,7): continue

            url3=url.format(job,i)
            r = requests.get(url3)
            soup = BeautifulSoup(r.text, 'html.parser')

            for i in soup.find_all('div', attrs={'class':'jobsearch-SerpJobCard unifiedRow row result'}):

                try:
                    title.append(i.find('h2', attrs={'class':'title'}).text)
                except:
                    title.append(np.nan)

                try:
                    company.append(i.find('span', attrs={'class':'company'}).text)
                except:
                    company.append(np.nan)

                try:
                    location.append(i.find('span', attrs={'class':'location accessible-contrast-color-location'}).text)
                except:
                    location.append(np.nan)

                try:
                    location2.append(i.find('div', attrs={'class':'location accessible-contrast-color-location'}).text)
                except:
                    location2.append(np.nan)

                try:
                    salary.append(i.find('span', attrs={'class':'salaryText'}).text)
                except:
                    salary.append(np.nan)

                try:
                    summary.append(i.find('ul', attrs={'style':'list-style-type:circle;margin-top: 0px;margin-bottom: 0px;padding-left:20px;'}).text)
                except:
                    summary.append(np.nan)

                try:
                    urls.append(i.find('a', attrs={'target':'_blank'},href=True)['href'])
                except:
                    urls.append(np.nan)

    data = pd.DataFrame(zip(title,company,location,location2,salary,summary,urls))
    data.columns = ['Title','Company','Location1','Location2','Salary','Summary','url']
    data.drop_duplicates(inplace=True)
    data.dropna(subset=['Salary'],inplace=True)
    print(len(data))
    uuid = datetime.now().strftime("%Y%m%d%H%M")
    data.to_csv(f'./scraped_data/base/scraped_jobs_{uuid}.csv',index=False)

### Individual Company Scraping

A similar methodology as above is employed but the wait times are longer here as we have to visit each individual company page.

In [39]:
def scrape_company_data(company_list):
    a=[]
    b=[]
    for j in range(0,len(companies),200):
        start1 = time()
        while time()-start1<300: continue
        for i in tqdm(companies[j:j+200]):
            start = time()
            while time()-start<randrange(5,10): continue
            url = 'https://www.indeed.co.uk/cmp/{}'.format(i)
            r = requests.get(url).text
            soup = BeautifulSoup(r, 'html.parser')
            try:
                a.append([i.text for i in soup.find_all('div', attrs={'class':'cmp-AboutMetadata-itemTitle'})])
            except:
                a.append(np.nan)
            try:
                b.append([i.text for i in soup.find_all('div', attrs={'class':'cmp-AboutMetadata-itemCotent'})])
            except:
                b.append(np.nan)
    uuid = datetime.now().strftime("%Y%m%d%H%M")
    company_info = pd.DataFrame(zip(companies,a,b))
    company_info.to_csv(f'./scraped_data/companies/company_data_{uuid}.csv',index=False)

### Scraping full descriptions

Full job descriptions are scrape here, this is also slow due to visiting every page

In [5]:
def scrape_full_desc(url_list):
    a=[]
    for j in range(0,len(url_list),200):
        start1 = time()
        while time()-start1<360: continue
        for i in tqdm(url_list[j:j+200]):
            start = time()
            while time()-start<randrange(7,11): continue
            url = 'https://www.indeed.co.uk{}'.format(i)
            r = requests.get(url).text
            soup = BeautifulSoup(r, 'html.parser')
            try:
                a.append([i.text for i in soup.find_all('div', attrs={'id':'jobDescriptionText'})])
            except:
                a.append(np.nan)
    full_desc = pd.DataFrame(zip(url_list,a))
    uuid = datetime.now().strftime("%Y%m%d%H%M")
    full_desc.to_csv(f'./scraped_data/descriptions/full_desc_{uuid}.csv',index=False)

In [55]:
url = "https://www.indeed.co.uk/jobs?q={}&l=United+Kingdom&sort=date&start={}&limit=50&filter=0"
jobs = ['data+scientist','data+analyst','data+engineer',
        'machine+learning+engineer','decision+scientist',
        'BI+analyst','visualisation+analyst','business+intelligence']

In [34]:
scrape_job_description(url,jobs)

100%|██████████| 8/8 [18:01<00:00, 135.13s/it]

3085





In [42]:
path = r'scraped_data/base'
all_files = glob.glob(path + "/*.csv")
df_list = [pd.read_csv(file, index_col=None, header=0) for file in all_files]
df = pd.concat(df_list, axis=0, ignore_index=True)

In [43]:
companies = df['Company'].str.replace(' ','+').str.replace('\n','').unique()

In [44]:
scrape_company_data(companies)

100%|██████████| 200/200 [17:22<00:00,  5.21s/it]
100%|██████████| 200/200 [17:21<00:00,  5.21s/it]
100%|██████████| 200/200 [17:19<00:00,  5.20s/it]
100%|██████████| 200/200 [17:19<00:00,  5.20s/it]
100%|██████████| 200/200 [17:18<00:00,  5.19s/it]
100%|██████████| 200/200 [17:17<00:00,  5.19s/it]
100%|██████████| 200/200 [17:22<00:00,  5.21s/it]
100%|██████████| 200/200 [17:17<00:00,  5.19s/it]
100%|██████████| 200/200 [17:19<00:00,  5.20s/it]
100%|██████████| 200/200 [17:26<00:00,  5.23s/it]
100%|██████████| 197/197 [17:14<00:00,  5.25s/it]


In [6]:
df2 = pd.read_csv('rescrape.csv')
url_list = df2['url'].values

In [7]:
scrape_full_desc(url_list)

100%|██████████| 200/200 [25:23<00:00,  7.62s/it]
100%|██████████| 200/200 [25:38<00:00,  7.69s/it]
100%|██████████| 200/200 [25:45<00:00,  7.73s/it]
100%|██████████| 200/200 [25:09<00:00,  7.55s/it]
100%|██████████| 200/200 [25:07<00:00,  7.54s/it]
100%|██████████| 159/159 [19:50<00:00,  7.49s/it]
