# Indeed Job Scraper

### How it works:

You provide a set of standard input parameters: 
- **search query**
- **location**

in addition to two non-standard paramaters: 
- **ordered_keywords**: Job roles are rated based on this ordered list. This is a list of keywords to search for in job descriptions provided in order of preference. 
- **exclude_keywords**: A list of keywords to search for in a job _title_ which renders the rating of that job zero. E.g. if you really hate roles as a recruiter you would include: "Recruitment" or "Headhunter"
- **title_keywords**: A list of keywords to search for in a job _title_ which, if matched for, increase the normalised rating. (Has precedence over "ordered_keywords")
- **pages**: Number of Indeed pages to search. (Maximum that Indeed provides is 100)

The web scraper searches through all the indeed job listings with those paramaters and returns a dataframe containing all the listings ordered by the "rating" metric based on the ordered list of keywords.

You can also then output this dataframe as an excel sheet for convenience. 

In [49]:
# Imports
from bs4 import BeautifulSoup
import requests, json
import pandas as pd
from selenium import webdriver
import re

In [50]:
# Scraping parameters
default_parameters = {
    'search_query':'Quantitative Analyst',
    'location':'New York',
    #'miles':50,
    'ordered_keywords':['Investment','Banking','Finance','Hedge','Python','Fintech','SQL','Analysis','Modelling'],
    'exclude_keywords':['Recruitment','Headhunter','Manager','Director','Senior'],
    'title_keywords':['Graduate','Junior'],
    'pages':1 
}

In [51]:
def create_url(parameters):
    # create base url for all further searches
    what = parameters['search_query'].replace(" ","+")
    where = parameters['location'].replace(" ","+")
    base_url = f"https://www.indeed.com/jobs?q={what}&l={where}"
    return base_url

In [52]:
def rate_job(j_title, j_soup, parameters):
    # rate job by keywords
    description = j_soup.find(id="jobDescriptionText").get_text()
    keywords = parameters['ordered_keywords']
    title_keywords = parameters['title_keywords']
    exclude_keywords = parameters['exclude_keywords']
    total_keywords = len(keywords) + len(title_keywords)
    keywords_present = []
    title_keywords_present = []
    rating = 0
    
    # Check for keyword, add value to rating depending on ranking
    for index,keyword in enumerate(keywords):
        if keyword in description:
            rating += len(keywords) - index
            keywords_present.append(keyword)
    
    # Check for title keywords
    for index,keyword in enumerate(title_keywords):
        if keyword in j_title:
            rating += total_keywords - index
            title_keywords_present.append(keyword)
    
    # Normalise rating
    rating = rating/sum(range(1,total_keywords+1))
    
    # Check for excluded keywords
    for keyword in exclude_keywords:
        if keyword in j_title:
            rating = 0
            break
    
    return description,rating,keywords_present,title_keywords_present

In [53]:
def get_job_details(job,parameters):
    
    # Get link and title
    job_url = job.get('href')
    
    # Correct for truncated URLs
    job_url = "https://www.indeed.com" + job_url if (job_url.startswith("/")) else job_url
    driver = webdriver.Chrome()
    driver.get(job_url)
    job_page = driver.page_source
    job_soup = BeautifulSoup(job_page,'html.parser')
    
    
    # Get job title and company name
    title = job.get_text()
    company = job_soup.find('div', attrs={"data-company-name": "true"}).get_text()
    
    # Get description, rating and present keywords
    description, rating, keywords_present, title_keywords_present = rate_job(title,job_soup,parameters)
    
    return title, company, job_url, description, rating, keywords_present, title_keywords_present

In [54]:
def scrape(parameters):
    
    # Create base url for all further searches
    base_url = create_url(parameters)
    
    # Output list and frame
    output = []
    
    for x in range(0,parameters['pages']):
        if(x==0):
            page_append = ""
        else: 
            page_append = "&start=" + str(x*10)
            
        # get page
        driver = webdriver.Chrome()
        driver.get(base_url+page_append)
        current_page = driver.page_source
        page_soup = BeautifulSoup(current_page,"html.parser")
        
        for job in page_soup.find_all('a', class_=re.compile("JobTitle")):
            title, company, url, description, rating, keywords_present, title_keywords_present = get_job_details(job,parameters)
            output.append([rating,title,company,description,url,keywords_present,title_keywords_present,x+1])
            
        print(f"Page {x+1} completed",end="\r")
        
    df_output_frame = pd.DataFrame(
        output,columns=['Rating','Job Title','Company','Description','Job URL','Keywords Present','Title Keywords','Page Found']).sort_values(
        by='Rating',ascending=False).reset_index(drop=True)

    return df_output_frame



        

In [55]:
'''jobs = scrape(default_parameters)'''

'jobs = scrape(default_parameters)'

In [56]:
'''display(jobs.head())'''

'display(jobs.head())'

# Output to Excel

In [57]:
'''with pd.ExcelWriter('Excel Output.xlsx', engine='openpyxl') as writer:
    jobs.to_excel(writer, index=False)'''

"with pd.ExcelWriter('Excel Output.xlsx', engine='openpyxl') as writer:\n    jobs.to_excel(writer, index=False)"

# Job Matching Algorithm

In [58]:
import csv
import os
from itertools import product

In [59]:
def extract_info_from_spreadsheets(folder_path):
    all_info = {}

    # this code works for files that are in .csv form and downloaded onto local computer
    # may need to change?
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)

            with open(file_path, 'r') as csvfile:
                reader = csv.DictReader(csvfile)

                for row in reader:
                    ID = row['ID']

                    # extract interests
                    interests = row['Interests'].split(',')

                    # extract location
                    location = row['Target locations'].split(',')

                    # extract summer status from "Summer Status"/"Internship" column
                    # (some sheets have this listed as Summer Status, some have it listed as Internship)
                    summer_status = None
                    if 'internship' in row.get('Summer Status', '').lower() or 'internship' in row.get('Internship', '').lower():
                        summer_status = 'internship'
                    elif 'full-time' in row.get('Summer Status', '').lower() or 'full-time' in row.get('Internship', '').lower():
                        summer_status = 'full-time'

                    # store the information in dictionary
                    if ID not in all_info:
                        all_info[ID] = {'Interests': interests, 'Target locations': location, 'Summer Status': summer_status}
                    else:
                        all_info[ID]['Interests'].extend(interests)
                        all_info[ID]['Target locations'].extend(location)


    return all_info

# this is my folder path to all of the IEOR data (.csv)
folder_path = 'D:\OneDrive - Fordham University\Desktop\Career Office\Indeed-Job-Scraper\student_data'
info = extract_info_from_spreadsheets(folder_path)

# printing out informaiton
for ID, data in info.items():
    interests_str = ', '.join(data['Interests'])
    location_str = ', '.join(data['Target locations'])
    summer_status = data['Summer Status']
    print(f"{ID}'s interests are: {interests_str}")
    print(f"{ID}'s target locations are: {location_str}")
    print(f"{ID}'s summer status is: {summer_status}")
    print('-' * 35)

MSBA_01's interests are: Technology/Telecom/Internet,  Consulting,  Marketing - Data Science Professional,  Marketing Analytics Professional,  Product Manager
MSBA_01's target locations are: United States,  India,  United Kingdom
MSBA_01's summer status is: internship
-----------------------------------
MSBA_02's interests are: 
MSBA_02's target locations are: 
MSBA_02's summer status is: None
-----------------------------------
MSBA_03's interests are: 
MSBA_03's target locations are: 
MSBA_03's summer status is: internship
-----------------------------------
MSBA_04's interests are: Technology/Telecom/Internet,  Financial Services,  Consulting,  Data Science Professional,  Management Consultant,  Data Science Professional, 
MSBA_04's target locations are: United States - NY
MSBA_04's summer status is: internship
-----------------------------------
MSBA_05's interests are: 
MSBA_05's target locations are: 
MSBA_05's summer status is: None
-----------------------------------
MSBA_06's 

In [60]:
def generate_combinations(info):
    combinations = []

    for ID, data in info.items():
        interests = data['Interests']
        locations = data['Target locations']
        for interest, loc in product(interests, locations):
            combinations.append({'ID': ID, 'Interest': interest, 'Location': loc})

    return combinations

In [61]:
combinations = generate_combinations(info)

In [62]:
def generate_search_parameters(list_of_dicts, default_parameters):
    search_parameters_dict = {}

    # Scraping parameters
    default_parameters = {
    'search_query':'Quantitative Analyst',
    'location':'New York',
    'miles':50,
    'ordered_keywords':['Investment','Banking','Finance','Hedge','Python','Fintech','SQL','Analysis','Modelling'],
    'exclude_keywords':['Recruitment','Headhunter','Manager','Director','Senior'],
    'title_keywords':['Graduate','Junior'],
    'pages':1 
}

    for item in list_of_dicts:
        # If ID not in dictionary, create a new entry
        if item['ID'] not in search_parameters_dict:
            search_parameters_dict[item['ID']] = []

        search_parameters = default_parameters.copy()  # Copy default parameters
        search_parameters['search_query'] = item['Interest']  # Set search query to Interest

        prefix = item['ID'].split('_')[0]

        search_parameters['ordered_keywords'] = [search_parameters['search_query']]

        if prefix == 'MSOR':
            search_parameters['ordered_keywords'] += ['Operations Research', 'Optimization', 'Mathematical Modeling', 'Simulation Modeling', 'Python', 'Risk Assessment','Statistics ','Probability','Data Analysis', 'Decision Analysis']
        elif prefix == 'MSMSE':
            search_parameters['ordered_keywords'] += ['Management Science', 'Process Optimization', 'Data Analysis', 'Project Management' , 'Risk Assessment','Resource Allocation', 'Econometrics', 'Economics', 'Management', 'Strategic Planning','Strategy']
        elif prefix == 'MSIE':
            search_parameters['ordered_keywords'] += ['Industrial Engineering', 'Optimization', 'Data Analysis', 'Simulation Modeling', 'Problem-Solving', 'Python', 'Pandas', 'Process Improvement' ,'Supply Chain', 'Operations Research', 'Risk', 'Statistics']
        elif prefix == 'MSFE':
            search_parameters['ordered_keywords'] += ['Quantitative', 'Financial Engineering', 'Financial Modeling', 'Machine Learning', 'Data Engineering', 'Statistics', 'Stochastic', 'Risk Management', 'Python', 'SQL']
        elif prefix == 'MSBA':
            search_parameters['ordered_keywords'] += ['Data Analytics', 'Business Analytics', 'Risk Management', 'Wealth Management', 'Portfolio', 'Database', 'Machine Learning', 'Statistics', 'Python', 'SQL', 'Modeling']



        # If 'Location' key exists in the item, update search parameters with it
        if 'Location' in item:
            search_parameters['location'] = item['Location']

        search_parameters_dict[item['ID']].append(search_parameters)

    return search_parameters_dict

# Assuming 'combinations' and 'default_parameters' are defined
search_parameters_dict = generate_search_parameters(combinations, default_parameters)

# Print the search parameter lists for each student
for ID, parameters_list in search_parameters_dict.items():
    print(f"Search parameters for {ID}:")
    for parameters in parameters_list:
        print(parameters)
    print('-' * 50)



Search parameters for MSBA_01:
{'search_query': 'Technology/Telecom/Internet', 'location': 'United States', 'miles': 50, 'ordered_keywords': ['Technology/Telecom/Internet', 'Data Analytics', 'Business Analytics', 'Risk Management', 'Database', 'Machine Learning', 'Statistics', 'Python', 'SQL', 'Modeling'], 'exclude_keywords': ['Recruitment', 'Headhunter', 'Manager', 'Director', 'Senior'], 'title_keywords': ['Graduate', 'Junior'], 'pages': 1}
{'search_query': 'Technology/Telecom/Internet', 'location': ' India', 'miles': 50, 'ordered_keywords': ['Technology/Telecom/Internet', 'Data Analytics', 'Business Analytics', 'Risk Management', 'Database', 'Machine Learning', 'Statistics', 'Python', 'SQL', 'Modeling'], 'exclude_keywords': ['Recruitment', 'Headhunter', 'Manager', 'Director', 'Senior'], 'title_keywords': ['Graduate', 'Junior'], 'pages': 1}
{'search_query': 'Technology/Telecom/Internet', 'location': ' United Kingdom', 'miles': 50, 'ordered_keywords': ['Technology/Telecom/Internet', 'D

In [63]:
# Adding this code will replace "technology/telecom/internet" with just technology
def replace_string(obj):
    if isinstance(obj, str):
        return obj.replace("Technology/Telecom/Internet", "Technology")
    elif isinstance(obj, list):
        return [replace_string(item) for item in obj]
    elif isinstance(obj, dict):
        return {key: replace_string(value) for key, value in obj.items()}
    else:
        return obj

In [64]:
# Replace the specified string in the dictionary values
data = replace_string(search_parameters_dict)

In [65]:
test = search_parameters_dict['MSBA_26'][0]
test

{'search_query': 'Financial Services',
 'location': 'United States - New York',
 'miles': 50,
 'ordered_keywords': ['Financial Services',
  'Data Analytics',
  'Business Analytics',
  'Risk Management',
  'Database',
  'Machine Learning',
  'Statistics',
  'Python',
  'SQL',
  'Modeling'],
 'exclude_keywords': ['Recruitment',
  'Headhunter',
  'Manager',
  'Director',
  'Senior'],
 'title_keywords': ['Graduate', 'Junior'],
 'pages': 1}

In [66]:
jobs = scrape(test)
display(jobs.head())

Page 1 completed

Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,0.128205,Financial Services Representative,Blue Ocean Wealth Solutions,Job Title: Financial Services Representative\n...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,[Financial Services],[],1
1,0.128205,Financial Services Representative,Center for Wealth Preservation,\nInterested in pursuing one of the highest-ea...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,[Financial Services],[],1
2,0.128205,Financial Analyst & Planner,Confidential,About Us: We are a boutique Wealth Management ...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,[Financial Services],[],1
3,0.0,Investment Specialist,The Spaventa Group,The Spaventa Group (“TSG”) is a leading invest...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,[],[],1
4,0.0,Client Service Associate Financial Services,The Pitti Group Wealth Management,Client Service Associate position.\nThe Pitti ...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,[],[],1


In [67]:
with pd.ExcelWriter('Excel Output.xlsx', engine='openpyxl') as writer:
    jobs.to_excel(writer, index=False)

In [68]:
'''def driver_function(search_parameters_dict):
    all_jobs = pd.DataFrame()  # Initialize an empty DataFrame to store all jobs
    
    for ID, parameters_list in search_parameters_dict.items():
        for parameters in parameters_list:
            jobs = scrape(parameters)  # Scrape jobs for the current parameters
            jobs['ID'] = ID  # Add a column for student ID
            all_jobs = pd.concat([all_jobs, jobs], ignore_index=True)  # Concatenate the current jobs DataFrame with all_jobs

        break  # Remove this line to scrape jobs for all students
            
    # Order the DataFrame by rating
    all_jobs_sorted = all_jobs.sort_values(by='Rating', ascending=False).reset_index(drop=True)
    
    return all_jobs_sorted


# Call the driver function
all_student_jobs = driver_function(search_parameters_dict)

# Print the combined DataFrame
print(all_student_jobs)'''

"def driver_function(search_parameters_dict):\n    all_jobs = pd.DataFrame()  # Initialize an empty DataFrame to store all jobs\n    \n    for ID, parameters_list in search_parameters_dict.items():\n        for parameters in parameters_list:\n            jobs = scrape(parameters)  # Scrape jobs for the current parameters\n            jobs['ID'] = ID  # Add a column for student ID\n            all_jobs = pd.concat([all_jobs, jobs], ignore_index=True)  # Concatenate the current jobs DataFrame with all_jobs\n\n        break  # Remove this line to scrape jobs for all students\n            \n    # Order the DataFrame by rating\n    all_jobs_sorted = all_jobs.sort_values(by='Rating', ascending=False).reset_index(drop=True)\n    \n    return all_jobs_sorted\n\n\n# Call the driver function\nall_student_jobs = driver_function(search_parameters_dict)\n\n# Print the combined DataFrame\nprint(all_student_jobs)"