This is a simple web-scraping script designed for personal use. It requires three user entries (keywords, an optional exact phrase, and a location) to filter job search results. Job details from https://ca.indeed.com/ are scraped automatically from all of the search result pages. A CSV file containing the extracted data is saved locally with the following column headers:

- Job Title, Company, Location, Salary, Summary, Post Date, Easy Apply and Page URL.

Please note that this script was built to ONLY handle job queries in Canada.

In [1]:
 def get_input():
     """Prompt the user for the three search fields.

     Returns:
         A ``(keywords, phrase, location)`` tuple holding the raw strings
         typed by the user, in the order they were asked for.
     """
     search_terms = input("Enter job title, keywords, or company : ")
     exact_phrase = input("With the exact phrase : ")
     place = input("Enter city or province in Canada: ")
     return search_terms, exact_phrase, place

In [2]:
#Find out how many pages are there in the search results
def count_pages(keywords, phrase, location):
    """Return the number of result pages for a search, or -1 if it has no matches.

    Args:
        keywords: Job title, keywords or company typed by the user.
        phrase: Exact phrase to match (may be an empty string).
        location: City or province to search in.

    Returns:
        -1 when the first results page contains a ``bad_query`` or
        ``no_results`` div; otherwise the number of ``span.pn`` elements
        found on that page, or 1 when there are none.
    """
    first_page = build_base_url(keywords, phrase, location)
    # Without a timeout, requests waits indefinitely on a stalled connection.
    load_page = requests.get(
        first_page,
        headers={'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'},
        timeout=30,
    )
    soup = BeautifulSoup(load_page.content, "html.parser")

    # Either marker div means the query matched nothing.
    if soup.find_all("div", {"class": ["bad_query", "no_results"]}):
        return -1

    # Presumably one "pn" span per pagination link; verify against the live
    # page markup. No spans at all is treated as a single page of results.
    page_results = soup.find_all("span", {"class": "pn"})
    return len(page_results) if page_results else 1

In [3]:
#Establish the url string based on user input
def build_base_url(keywords, phrase, location, start=None): # default value for parameter
    """Build the ca.indeed.com search URL for the given query.

    Args:
        keywords: Job title, keywords or company; percent-encoded into ``q``.
        phrase: Exact phrase; percent-encoded into ``as_phr`` (may be empty).
        location: City or province; percent-encoded into ``l``.
        start: Optional result offset. When not None (0 included) it is
            appended as ``&start=<start>``.

    Returns:
        The full search URL as a string.
    """
    # Use the standard-library quote directly instead of reaching it through
    # requests.utils, where it is only an incidental re-export.
    from urllib.parse import quote

    url = "https://ca.indeed.com/jobs?q={}&as_phr={}&l={}".format(
        quote(keywords), quote(phrase), quote(location)
    )
    if start is not None:
        url += "&start={}".format(start)
    return url

In [4]:
#Return an integer for values under Post Date for easy sorting later on
def get_days_ago(date):
    """Convert a "Post Date" label into a number of days.

    Args:
        date: Label text such as "Today", "Just posted", "5 days ago" or
            "30+ days ago". Surrounding whitespace is ignored.

    Returns:
        0 for "Today" and "Just posted"; otherwise the first integer that
        appears in the label.

    Raises:
        ValueError: If the label is not one of the two "today" labels and
            contains no digits.
    """
    import re  # local import keeps this notebook cell self-contained

    label = date.strip()
    if label in ("Today", "Just posted"):
        return 0

    # Search for the whole number rather than slicing the first two
    # characters, so "120 days ago" is not truncated to 12 and labels with a
    # leading word (e.g. "Active 3 days ago") still parse.
    match = re.search(r"\d+", label)
    if match is None:
        raise ValueError("Unrecognised post date label: {!r}".format(date))
    return int(match.group())

In [5]:
#Extracts job details from html tags and returns a dictionary
def _optional_text(item, tag, css_class):
    """Return the text of the first matching descendant of item, or "" if there is none."""
    node = item.find(tag, {"class": css_class})
    return "" if node is None else node.text


def parse_job_details(item):
    """Extract the details of one job card into a dictionary.

    Args:
        item: A parsed HTML element exposing BeautifulSoup-style
            ``find(name, attrs)``.

    Returns:
        A dict with the keys "Job Title", "Company", "Location", "Salary",
        "Summary", "Post Date", "Easy Apply" and "Page URL". Location,
        Salary and Easy Apply are "" when the card has no matching element.

    Raises:
        AttributeError: If the title, company, summary, date or link
            element is missing from the card.
    """
    jobdetails = {}
    jobdetails["Job Title"] = item.find("div", {"class": "title"}).text.replace("\n", "")
    jobdetails["Company"] = item.find("span", {"class": "company"}).text.replace("\n", "")

    # Optional fields: an explicit None check replaces the former bare
    # "except:", which also hid unrelated errors and KeyboardInterrupt.
    jobdetails["Location"] = _optional_text(
        item, "span", "location accessible-contrast-color-location"
    )
    jobdetails["Salary"] = _optional_text(item, "span", "salaryText").replace("\n", "")

    jobdetails["Summary"] = item.find("div", {"class": "summary"}).text.replace("\n", "")
    # Stored as an integer number of days so the table can be sorted on it.
    jobdetails["Post Date"] = get_days_ago(
        item.find("span", {"class": "date"}).text.replace("\n", "")
    )
    jobdetails["Easy Apply"] = _optional_text(item, "span", "iaLabel").replace("\n", "")

    # The href is appended to the site root, so it is expected to be site-relative.
    jobdetails["Page URL"] = "https://ca.indeed.com" + item.find("a").get('href')
    return jobdetails

In [6]:
import requests
from bs4 import BeautifulSoup

#For each page in Indeed.com search results, the value after "&start=" is incremented by 10
INCREMENT=10

# Keep asking for keywords, phrase and location until the query yields at
# least one page of job matches (count_pages returns -1 when nothing matched).
have_results = False
while not have_results:
    keywords, phrase, location = get_input()
    num_pages = count_pages(keywords, phrase, location)
    if num_pages == -1:
        print ("The search did not match any job. Try other keywords or Canadian locations.")
    else:
        have_results = True


results=[] #main table where all job details are saved

for page in range(num_pages):
    url = build_base_url(keywords, phrase, location, page * INCREMENT)

    #Request info for each page in the search results; the timeout stops a
    #stalled connection from hanging the whole run
    callpage = requests.get(
        url,
        headers={'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'},
        timeout=30,
    )
    soup = BeautifulSoup(callpage.content, "html.parser")

    #Defines the webscraping scope: one div per job card.
    #Named job_cards rather than "all" so the builtin all() is not shadowed.
    job_cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard unifiedRow row result"})

    #Each card becomes one dictionary, saved as a row in the list
    results.extend(parse_job_details(card) for card in job_cards)


Enter job title, keywords, or company : data scientist
With the exact phrase : 
Enter city or province in Canada: canada


In [7]:
#Builds a dataframe from the list of job dictionaries, ordered by the
#"Post Date" column from smallest to largest
import pandas
indeed_df = pandas.DataFrame(results).sort_values(by="Post Date", ascending=True)


#Shows the resulting table (one row per job) as the cell output
indeed_df

Unnamed: 0,Job Title,Company,Location,Salary,Summary,Post Date,Easy Apply,Page URL
65,Sr. Data Architect - Machine Learning / Artifi...,Aventura Consultants,,,Data Architect - Machine Learning (ML)/ Artifi...,0,Easily apply,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1,Sr. Data Architect - Machine Learning / Artifi...,Aventura Consultants,,,Data Architect - Machine Learning (ML)/ Artifi...,0,Easily apply,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
79,Sr. Data Architect - Machine Learning / Artifi...,Aventura Consultants,,,Data Architect - Machine Learning (ML)/ Artifi...,0,Easily apply,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
78,"Business Analyst, Strategy & Analytics",Dollarama L.P.,,,"Dollarama employs approximately 20,000 retail ...",0,Easily apply,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...
5,Data Scientist,CITI,"Mississauga, ON",,The candidate will work on existing applicatio...,0,,https://ca.indeed.com/rc/clk?jk=97586b6f5304ec...
6,Data Scientist,FSI,"Acheson, AB",,"With technology and data initiatives underway,...",0,Easily apply,https://ca.indeed.com/rc/clk?jk=ae9988220f6300...
28,Junior Data Analyst (2020-0014),ICTC,"Ottawa, ON",,"This role would you you work in a demanding, h...",0,Easily apply,https://ca.indeed.com/rc/clk?jk=3990c3b056059e...
8,Data Scientist - Machine Learning Engineer,Sonraí Security,"Fredericton, NB",,"Sonrai Security, a cloud based security compan...",0,,https://ca.indeed.com/rc/clk?jk=636f98de510734...
16,Sr. Data Architect - Machine Learning / Artifi...,Aventura Consultants,,,Data Architect - Machine Learning (ML)/ Artifi...,0,Easily apply,https://ca.indeed.com/cmp/Aventura-Consultants
48,Sr. Data Architect - Machine Learning / Artifi...,Aventura Consultants,,,Data Architect - Machine Learning (ML)/ Artifi...,0,Easily apply,https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYl...


In [9]:
#Saves the table locally as "Indeed.csv". The row index is left out so the
#file holds only the job-detail columns; otherwise the shuffled post-sort
#index would be written as an extra unnamed first column.
indeed_df.to_csv("Indeed.csv", index=False)
