In [1]:
from bs4 import BeautifulSoup
import requests
import itertools
import csv
import datetime

# module-level accumulator shared by the cells below: get_page_cards_details
# appends one [title, company, location, salary, link] row per job card and
# the finished list is handed to save_jobs
full_job_list = []

In [2]:
def format_string_for_search(phrase):
    '''takes a string and makes it url friendly

    Surrounding whitespace is dropped, inner spaces become '+', and any other
    character that is not safe inside a query string (for example '&', '#',
    '+', '/' or non-ASCII text) is percent-encoded so it cannot break or
    alter the search url it is placed in.
    '''
    # imported locally so the function does not depend on the notebook's import cell
    from urllib.parse import quote_plus

    return quote_plus(phrase.strip())

In [3]:
def get_soup_from_url(url, timeout=10):
    '''takes a string url and uses requests and bs4 to return a soup object

    url     -- address of the page to download
    timeout -- seconds to wait for the server before giving up
               (passed straight to requests.get)

    Raises requests.exceptions.RequestException when the request times out,
    the connection fails, or the server answers with a 4xx/5xx status.
    '''
    # without a timeout requests can block indefinitely on a stalled connection
    page = requests.get(url, timeout=timeout)
    # fail here instead of parsing an error page as if it contained job cards
    page.raise_for_status()
    return BeautifulSoup(page.text, 'html.parser')

In [4]:
def _extract_or_null(extract):
    '''call a zero-argument extractor and return its result, or 'Null' when the tag or attribute it needs is missing'''
    try:
        return extract()
    except (AttributeError, KeyError, TypeError):
        # AttributeError: find() returned None; TypeError: card.a is None;
        # KeyError: the tag exists but lacks the requested attribute.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return 'Null'


def get_page_cards_details(soup, job_list=None):
    '''scrape all of the job cards for the given page soup and append them to the job list

    soup     -- parsed page; every <div class="row"> in it is treated as one job card
    job_list -- list that receives the rows; defaults to the notebook-level
                full_job_list when omitted

    One row is appended per card, in the order
    [title, company, location, salary, link]. A field whose tag or attribute
    is missing from the card is stored as the string 'Null'.
    '''
    if job_list is None:
        job_list = full_job_list

    for card in soup.find_all('div', {'class':'row'}):
        job_list.append([
            # title
            _extract_or_null(lambda: card.a['title'].strip()),
            # Company
            _extract_or_null(lambda: card.find('span', {'class':'company'}).text.strip()),
            # Location
            _extract_or_null(lambda: card.find('span', {'class':'location'}).get_text().strip()),
            # Salary
            _extract_or_null(lambda: card.find('span', {'class':'salaryText'}).get_text().strip()),
            # Job Link
            # NOTE(review): the prefix has no scheme, so the stored link is not
            # a complete url -- confirm whether 'https://www.' is wanted here
            _extract_or_null(lambda: 'indeed.co.uk' + card.a['href']),
        ])

In [5]:
def save_jobs(full_job_list, path=None):
    '''write the scraped job rows to a csv file

    full_job_list -- iterable of rows, each row an iterable of field values
    path          -- file to write; when omitted the file is named after
                     today's date (str() of a datetime.date, no extension)
                     and is created relative to the working directory

    An existing file at that path is overwritten.
    '''
    if path is None:
        path = str(datetime.datetime.now().date())

    # explicit utf-8 so non-ASCII text (e.g. a pound sign in a salary) is written
    # identically on every platform instead of depending on the locale encoding
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(full_job_list)

In [6]:
job_title = format_string_for_search('data analyst')
location = format_string_for_search('hinckley')
url = f'https://www.indeed.co.uk/jobs?q={job_title}&l={location}'

# empty the shared list in place so re-running this cell does not append a
# second copy of every job to the rows left over from the previous run
full_job_list.clear()

while True:
    soup = get_soup_from_url(url)
    get_page_cards_details(soup)

    # soup.find returns None when the page has no pagination list (presumably
    # when all results fit on one page); treat that as "no further pages"
    # instead of crashing before the jobs are saved
    pagination = soup.find('ul', {'class':'pagination-list'})
    nav_items = pagination.find_all('li') if pagination is not None else []

    # another page exists only when the last nav item holds a "Next" link;
    # follow that link's own href rather than whichever <a> comes first
    next_link = nav_items[-1].find('a', {'aria-label':'Next'}) if nav_items else None
    if next_link is None:
        break
    url = 'https://www.indeed.co.uk' + next_link['href']

save_jobs(full_job_list)

In [6]:
# for job in full_job_list:
#     print(job)

['Data Analyst', 'Sharpsmart Ltd', 'Null', 'Null', 'indeed.co.uk/pagead/clk?mo=r&ad=-6NYlbfkN0Digduxq_37rmqrdhmWZMKletR_QE0xCo0fOXDcbo7VGEewkFo7z1Wr7uDJn4ZMV2xo2HTjP4VRR9e8hhf_ZtqrirYq3wX9RI44Y39FpY2wvtrOOBRbiA6dHEpBKSLFMGn2w-msE35Rt-o7d4vvnqDIayy9ggCSRsgE0w20Er8JQOhPKztZbO0VW6Qr6IX3SlGk51jPVWa5mQGIQJH01x0BNuly3Hy5dXfv-VXAICqSOW0S6tMRx7zgmdQ8KaESfhC2do3Yf-RKNz8gYPvXyZIrblcy444D7tuPQbawumOClwyJVEBokfHHIgFg-bHVDgVwQnSjNAPYOD08BhTI608Zw31gwLqIF9Y3w3bTf96WmZyIqg_-JEMp06vJUFG1HBXtKcVqrECFk4u7lZpekWDnJoSGpgrkJwpfJUXUyS7FyNm-k9p8NfsNqZhsbIZWFZm8HJqn-oCedZ4fkEpW2fIxmG6R-wqWBqU8fGvV_Pbqpr6M3rIZ6ifRpa1Z1SSBieEgBz-JeEVS5KylpPdaod3HhmO0bPEzXYXXu1kdHU8RXw==&p=0&fvj=1&vjs=3']
['Data Analyst', 'Clockwise Credit Union', 'Null', '£30,000 a year', 'indeed.co.uk/pagead/clk?mo=r&ad=-6NYlbfkN0BtSVgTjwEj6iSDmnSar8u0JgME283KNKaezSbe2aQ7aqkDHKz1dmsxD5y4NVm0wekIi3Jn_R3pehjSPBCokAs-z0xXdMO7sv7sXM_53BtmyKLvUrlAOuqKee8LSV-PJHBLfyV5h4u4KvsxdRG3T53rxMCLdWdix63CwanEfmnzSrh55yyY0xmKseYDVXFtMbcR61SBkQluRSsjyYAWvl506A0