# Indeed Job Scraper

### How it works:

You provide a set of standard input parameters: 
- **search query**
- **location**

in addition to four non-standard parameters: 
- **ordered_keywords**: Job roles are rated based on this ordered list. This is a list of keywords to search for in job descriptions provided in order of preference. 
- **exclude_keywords**: A list of keywords to search for in a job _title_ which renders the rating of that job zero. E.g. if you really hate roles as a recruiter you would include: "Recruitment" or "Headhunter"
- **title_keywords**: A list of keywords to search for in a job _title_ which, if matched, increase the normalised rating. (Takes precedence over "ordered_keywords".)
- **pages**: Number of Indeed pages to search. (Maximum that Indeed provides is 100)

The web scraper searches through all the Indeed job listings with those parameters and returns a dataframe containing all the listings ordered by the "rating" metric based on the ordered list of keywords.

You can then also output this dataframe as an Excel sheet for convenience. 

In [105]:
# Imports
from bs4 import BeautifulSoup
import requests, json
import pandas as pd
from selenium import webdriver
import re

In [106]:
# Default search configuration handed to scrape()
default_parameters = dict(
    # What to search for and where
    search_query='Quantitative Analyst',
    location='New York',
    # Presumably the search radius in miles -- TODO confirm where it is consumed
    miles=50,
    # Description keywords, most preferred first
    ordered_keywords=[
        'Investment',
        'Banking',
        'Finance',
        'Hedge',
        'Python',
        'Fintech',
        'SQL',
        'Analysis',
        'Modelling',
    ],
    # A job whose title contains any of these is rated zero
    exclude_keywords=[
        'Recruitment',
        'Headhunter',
        'Manager',
        'Director',
        'Senior',
    ],
    # A job whose title contains any of these gets a rating boost
    title_keywords=['Graduate', 'Junior'],
    # Number of result pages to scrape
    pages=1,
)

In [107]:
def create_url(parameters):
    """Build the base Indeed search URL for the given scraping parameters.

    Args:
        parameters: Mapping that contains at least the keys
            ``'search_query'`` and ``'location'``.

    Returns:
        The search URL as a string. Paging suffixes such as ``&start=10``
        can be appended to it directly.
    """
    # Imported locally so the function does not rely on a file-level import.
    from urllib.parse import urlencode

    # urlencode() quotes every value with quote_plus: spaces still become
    # "+", but reserved characters such as "&", "+", "," or "#" are
    # percent-escaped instead of silently corrupting the query string.
    query = urlencode({
        'q': parameters['search_query'],
        'l': parameters['location'],
    })

    # NOTE: only 'search_query' and 'location' are read here; any other keys
    # in ``parameters`` (e.g. 'miles') do not influence the URL.
    return f"https://www.indeed.com/jobs?{query}"

In [108]:
def rate_job(j_title, j_soup, parameters):
    """Rate a job posting from the keywords found in its title and description.

    Each entry of ``parameters['ordered_keywords']`` found in the description
    scores ``len(ordered_keywords) - index``, so earlier keywords weigh more.
    Each entry of ``parameters['title_keywords']`` found in the title scores
    ``total_keywords - index``, which is always more than any description
    keyword. The sum is divided by the maximum attainable score. Matching is
    a case-sensitive substring test.

    Args:
        j_title: Job title text.
        j_soup: Parsed job page; must offer ``find(id=...)`` returning an
            element with ``get_text()`` or ``None``.
        parameters: Mapping with the keys ``'ordered_keywords'``,
            ``'title_keywords'`` and ``'exclude_keywords'`` (lists of str).

    Returns:
        A tuple ``(description, rating, keywords_present,
        title_keywords_present)``. ``rating`` is 0 when the title contains an
        excluded keyword or when no keywords are configured at all.
        ``description`` is an empty string when the page has no
        ``jobDescriptionText`` element.
    """
    keywords = parameters['ordered_keywords']
    title_keywords = parameters['title_keywords']
    exclude_keywords = parameters['exclude_keywords']
    total_keywords = len(keywords) + len(title_keywords)

    # find() yields None when the element is absent; fall back to an empty
    # description so a single odd page cannot abort the whole scrape.
    description_tag = j_soup.find(id="jobDescriptionText")
    description = description_tag.get_text() if description_tag is not None else ""

    keywords_present = []
    title_keywords_present = []
    rating = 0

    # Check for keyword, add value to rating depending on ranking
    for index, keyword in enumerate(keywords):
        if keyword in description:
            rating += len(keywords) - index
            keywords_present.append(keyword)

    # Check for title keywords (weighted above every description keyword)
    for index, keyword in enumerate(title_keywords):
        if keyword in j_title:
            rating += total_keywords - index
            title_keywords_present.append(keyword)

    # Normalise rating; with no keywords configured the maximum score is 0,
    # so skip the division instead of raising ZeroDivisionError.
    max_rating = sum(range(1, total_keywords + 1))
    rating = rating / max_rating if max_rating else 0

    # An excluded keyword in the title overrides everything else
    if any(keyword in j_title for keyword in exclude_keywords):
        rating = 0

    return description, rating, keywords_present, title_keywords_present

In [116]:
def get_job_details(job,parameters):
    """Open a job's own page and collect its details and rating.

    Args:
        job: Anchor element of a job listing; its ``href`` attribute is the
            job link and its text is the job title.
        parameters: Scraping parameters, passed through to ``rate_job``.

    Returns:
        A tuple ``(title, company, job_url, description, rating,
        keywords_present, title_keywords_present)``. ``company`` is an empty
        string when the page has no element marked ``data-company-name``.
    """
    # Get link and title
    job_url = job.get('href')
    title = job.get_text()

    # Correct for truncated (site-relative) URLs
    if job_url.startswith("/"):
        job_url = "https://www.indeed.com" + job_url

    # Always quit the browser, even if loading the page fails; otherwise
    # every job visited leaves a Chrome process running.
    driver = webdriver.Chrome()
    try:
        driver.get(job_url)
        job_page = driver.page_source
    finally:
        driver.quit()
    job_soup = BeautifulSoup(job_page,'html.parser')

    # Company name; find() returns None when the element is missing
    company_tag = job_soup.find('div', attrs={"data-company-name": "true"})
    company = company_tag.get_text() if company_tag is not None else ""

    # Get description, rating and present keywords
    description, rating, keywords_present, title_keywords_present = rate_job(title,job_soup,parameters)

    return title, company, job_url, description, rating, keywords_present, title_keywords_present

In [110]:
def scrape(parameters):
    """Scrape Indeed result pages and return the jobs ordered by rating.

    Args:
        parameters: Scraping parameters; ``'pages'`` gives the number of
            result pages to visit, the remaining keys are consumed by
            ``create_url`` and ``get_job_details``.

    Returns:
        A pandas DataFrame with the columns 'Rating', 'Job Title', 'Company',
        'Description', 'Job URL', 'Keywords Present', 'Title Keywords' and
        'Page Found', sorted by 'Rating' in descending order with a fresh
        index.
    """
    # Create base url for all further searches
    base_url = create_url(parameters)

    # One row per job found
    output = []

    for x in range(0,parameters['pages']):
        # The first page needs no offset; later pages start at multiples of 10
        page_append = "" if x == 0 else "&start=" + str(x*10)

        # Get page; always quit the browser so that Chrome processes do not
        # pile up, one per page scraped.
        driver = webdriver.Chrome()
        try:
            driver.get(base_url+page_append)
            current_page = driver.page_source
        finally:
            driver.quit()
        page_soup = BeautifulSoup(current_page,"html.parser")

        # Job links are the anchors whose class contains "JobTitle"
        for job in page_soup.find_all('a', class_=re.compile("JobTitle")):
            title, company, url, description, rating, keywords_present, title_keywords_present = get_job_details(job,parameters)
            output.append([rating,title,company,description,url,keywords_present,title_keywords_present,x+1])

        print(f"Page {x+1} completed",end="\r")

    df_output_frame = pd.DataFrame(
        output,columns=['Rating','Job Title','Company','Description','Job URL','Keywords Present','Title Keywords','Page Found']).sort_values(
        by='Rating',ascending=False).reset_index(drop=True)

    return df_output_frame
        

In [117]:
# Run the scraper with the default parameters and keep the resulting DataFrame
jobs = scrape(default_parameters)

Page 1 completed

In [118]:
# Preview the first rows of the results
# (`display` is not imported here -- presumably the IPython/Jupyter built-in)
display(jobs.head())

Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,0.333333,2025 Summer Analyst Program - Global Markets (...,UBS,\n\n\n\n United States - New York\n \n\n\...,https://www.indeed.com/rc/clk?jk=5fd25119fd4cf...,"[Investment, Banking, Python]",[],1
1,0.287879,Quantitative Investment Analyst,Bank of America,"\nJob Description:\n\n At Bank of America, we ...",https://www.indeed.com/rc/clk?jk=9cf4bdd5b8e16...,"[Investment, Python, SQL, Analysis]",[],1
2,0.257576,Quantitative Analyst,Citi,\nCitigroup Global Markets Inc. seeks a Quanti...,https://www.indeed.com/rc/clk?jk=7873ecc4d9dfa...,"[Finance, Python, SQL, Analysis]",[],1
3,0.257576,Quantitative Analyst,Citi,\nCitigroup Global Markets Inc. seeks a Quanti...,https://www.indeed.com/rc/clk?jk=7a7bbf79448c5...,"[Finance, Python, SQL, Analysis]",[],1
4,0.257576,Quantitative Analyst,American Century Investments,\n\n Our Firm\n \n\n American Century Inves...,https://www.indeed.com/rc/clk?jk=d54ee97f1fdab...,"[Investment, Python, SQL]",[],1


# Output to Excel

In [132]:
# Write the results to 'Excel Output.xlsx' using the openpyxl engine;
# the writer is used as a context manager so it is closed when the block ends.
with pd.ExcelWriter('Excel Output.xlsx', engine='openpyxl') as writer:
    # index=False leaves the DataFrame index out of the sheet
    jobs.to_excel(writer, index=False)