In [1]:
# General Libraries
import numpy as np
import pandas as pd

# Web Scraping Libraries
import urllib
import requests
from bs4 import BeautifulSoup

# Regex Library
import re

# Time-related library
import time

### URL Format of Indeed.com
1. Search chemist in TX<br>
https://www.indeed.com/jobs?q=chemist&l=TX
2. Search chemist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=chemist&l=San+Antonio%2C+TX
3. Search data scientist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX
4. Search data scientist intern in San Anotnio, TX<br>
https://www.indeed.com/jobs?q=data+scientist+intern&l=San+Antonio%2C+TX
5. Sort the data scientist jobs posting by date<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX&sort=date

**Takeaways**
1. q = job title
2. l = location

### URL Format of Monster.com
https://www.monster.com/jobs/search/?q=data-scientist&where=San-Antonio__2C-TX

### Generate the URL of a Job Search at Indeed.com

In [2]:
def first_page_url_indeed(job_title, location):
    '''
    This function returns a URL of the 1st page of a job search at Indeed.com 
    based on the job title and the location.
    '''
    # Create the base URL for a job serch at Indeed.com
    base_url = 'https://www.indeed.com/jobs?'
    # Create a dictionary to map the keys to the input parameters
    dic = {'q': job_title, 'l': location, 'sort': 'date'}
    # Convert the dictionary to a query string
    relative_url = urllib.parse.urlencode(dic)
    # Generate the full URL of the first page
    url = base_url + relative_url
    return url

In [8]:
# Test the function
url = first_page_url_indeed('data scientist', 'tx')
url

'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date'

In [4]:
def urls_indeed(job_title, location):
    '''
    This function returns all the URLs in a job searching result.
    '''
    # Create a variable urls to hold the URLs of all pages
    urls = []
    # Generate the URL of the first page
    first_page_url = first_page_url_indeed(job_title, location)
    # Append the URL of the first page
    urls.append(first_page_url)
    # Generate the Soup object of the first page
    first_page_soup = first_page_soup_indeed(job_title, location)
    # Compute the total number of jobs based on the search
    num_jobs = num_jobs_indeed(first_page_soup) 
    # Estimate the total number of pages based on 15 job cards each page
    num_page = round(int(num_jobs)/15) + 1
    # For Loop through all the pages to generate their URLs
    for i in range(1, num_page+1):
        dic = {'start': i*10}
        relative_url = urllib.parse.urlencode(dic)
        url = first_page_url + '&' + relative_url
        urls.append(url)
    return urls

In [9]:
urls_indeed('data scientist', 'tx')

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Texas | Indeed.com


['https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=10',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=20',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=30',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=40',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=50',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=60',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=70',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=80',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=90',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=100',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=110',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date&start=120',
 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=

### Make the HTTP Request

In [10]:
def first_page_soup_indeed(job_title, location):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of the first page of a request for job searching at Indeed.com
    '''
    # Generate the URL of the job search based on title and location
    url = first_page_url_indeed(job_title, location)
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [11]:
first_page_soup = first_page_soup_indeed("data scientist", 'Tx')
type(first_page_soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Texas | Indeed.com


bs4.BeautifulSoup

In [12]:
# Find out the tag that contains the number of the jobs by seaching

num_jobs = first_page_soup.find('div', id='searchCountPages')
print("Data Type: ", type(num_jobs))
print("Name of the Tag: ", num_jobs.name)
print("Attributes of the Tag: ", num_jobs.attrs)
print("Text within the Tag: ")
num_jobs.text

Data Type:  <class 'bs4.element.Tag'>
Name of the Tag:  div
Attributes of the Tag:  {'id': 'searchCountPages'}
Text within the Tag: 


'\n                    Page 1 of 560 jobs'

In [13]:
# Find the number of the jobs in the text
match = re.findall(r'(\d+)', num_jobs.text)
match[1]

'560'

In [14]:
def num_jobs_indeed(first_page_soup):
    '''
    This function returns the total number of the jobs in the searching result.
    '''
    # Find out the section contains total number of jobs  
    div = first_page_soup.find('div', id='searchCountPages')
    # Extract the number
    num_jobs = re.findall(r'(\d+)', div.text)[1]
    return num_jobs

In [15]:
# Test the function num_jobs_indeed
num_jobs_indeed(first_page_soup)

'560'

In [16]:
def page_num_indeed(soup):
    '''
    This function returns the page number of job searching results. 
    '''
    # Find out the section contains total number of jobs  
    div = soup.find('div', id='searchCountPages')
    # Extract the number
    page_num = re.findall(r'(\d+)', div.text)[0]
    return page_num

In [17]:
# Test the function num_jobs_indeed
page_num_indeed(first_page_soup)

'1'

In [18]:
# Define a function to extract all job cards in a Indeed page

def job_cards_indeed(soup):
    '''
    This function accepts the Soup object of a Indeed page 
    return an iterator containing the all the job cards in this page.
    '''
    # Find the appropriate tag that contains all of the job listings in this page
    tag = soup.find('td', id="resultsCol")
    # Extract all job cards
    job_cards = tag.find_all('div', class_='jobsearch-SerpJobCard')
    return job_cards

In [19]:
job_cards = job_cards_indeed(first_page_soup)
# Print the data type of job_cards
type(job_cards)

bs4.element.ResultSet

**Quick Note**: job_cards is an iterator

In [20]:
# How many jobs listed in the 1st page? 
len(job_cards)

15

In [21]:
def job_titles_indeed(job_cards):
    '''
    This function extract the job titles from a job_cards set. 
    '''
    # Create a list to hold the job titles
    titles = []
    # For Loop throught the job cards to extract the titles
    for job in job_cards:
        title = job.find('h2', class_='title')
        title = title.text.strip()
        titles.append(title)
    return titles

In [22]:
titles = job_titles_indeed(job_cards)
titles

['Sr. Associate, Data Science - Machine Learning\nnew',
 'Intern, Data Science\nnew',
 'CIB Wholesale Payments Data and Analytics - Data Scientist L...\nnew',
 'Senior Director, Statistical Innovation & Data Science (Clin...\nnew',
 'Senior Software Engineer (Analytics and Machine Learning)\nnew',
 'Senior AI/ML Engineer\nnew',
 'Artificial Intelligence, Consultant - Applied Artificial Int...\nnew',
 'Senior Data Scientist\nnew',
 'Data Analyst Specialist (Work-From-Home)\nnew',
 'IT Data Science Analyst\nnew',
 'NLP engineer\nnew',
 'Single Family - Data Science - Associate\nnew',
 'Commercial Banking - Senior Data Scientist - VP\nnew',
 'AI Model Development Engineer\nnew',
 'Cloud Data Services Associate\nnew']

In [23]:
# Define a function to pull the company names from a set of job cards

def company_names_indeed(job_cards):
    '''
    This function extracts the company names from a set of job cards.
    '''
    # Create a list to hold the company names
    names = []
    # For loop through the job cards to pull the company names
    for job in job_cards:
        name = job.find('span', class_='company')
        name = name.text.strip()
        names.append(name)
    return names

In [24]:
# Test the function: comany_names_indeed

company_names = company_names_indeed(job_cards)
company_names

['Capital One',
 'Rockwell Automation',
 'JPMorgan Chase Bank, N.A.',
 'Cytel, Inc (USA)',
 'Cvent',
 'CAPCO',
 'Deloitte',
 'dMASS',
 'Robin Healthcare',
 'Bray International, Inc.',
 'raw',
 'Fannie Mae',
 'JPMorgan Chase Bank, N.A.',
 'Mythic-AI',
 'PwC']

In [25]:
# Define a function to pull the post ages from a set of job cards

def post_ages_indeed(job_cards):
    '''
    This function pulls the post ages from a set of job cards.
    '''
    # Create a list to hold the post ages
    ages = []
    # For loop through the job cards to pull the post ages
    for job in job_cards:
        age = job.find('span', class_='date')
        age = age.text.strip()
        ages.append(age)
    return ages

In [26]:
# Test the function: post_ages_indeed
ages = post_ages_indeed(job_cards)
ages

['Just posted',
 'Just posted',
 'Just posted',
 'Just posted',
 'Just posted',
 'Just posted',
 'Today',
 'Today',
 '1 day ago',
 '1 day ago',
 '1 day ago',
 '2 days ago',
 '2 days ago',
 '2 days ago',
 '4 days ago']

In [27]:
# Define a function to pull the location from a set of job cards

def job_locations_indeed(job_cards):
    '''
    This function pulls the job locations from a set of job cards.
    '''
    # Create a list to hold the locations
    locations = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        location = job.find('div', class_='location accessible-contrast-color-location')
        if location == None:
            location = job.find('span', class_='location accessible-contrast-color-location')
        location = location.text.strip()
        locations.append(location)
    return locations

In [28]:
locations = job_locations_indeed(job_cards)
locations

['Plano, TX 75023',
 'Austin, TX',
 'Plano, TX',
 'Texas',
 'Texas',
 'Houston, TX',
 'Austin, TX',
 'Austin, TX 78701 (Downtown area)',
 'Austin, TX',
 'Houston, TX 77041',
 'Irving, TX 75016',
 'Plano, TX',
 'Plano, TX',
 'Austin, TX',
 'San Antonio, TX 78206 (King William area)']

In [29]:
# Define a function to pull the company ratings from a set of job cards

def company_rating_indeed(job_cards):
    '''
    This function pulls the company rating from a set of job cards.
    If the rating is unavailable, it will be marked as 'missing'.
    '''
    # Create a list to hold the locations
    ratings = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        rating = job.find('span', class_='ratingsContent')
        if rating == None:
            ratings.append('missing')
            continue
        rating = rating.text.strip()
        ratings.append(rating)
    return ratings

In [30]:
ratings = company_rating_indeed(job_cards)
ratings

['3.9',
 '3.9',
 '3.9',
 '4.5',
 '3.6',
 '3.5',
 '4.0',
 'missing',
 '3.8',
 '2.6',
 'missing',
 '4.0',
 '3.9',
 'missing',
 '4.0']

In [31]:
def acuqire_indeed_job_description(url):
    '''
    This function accepts the URL of a job posting and pull its description.
    '''
    # Make the HTTP request
    request = requests.get(url)
    print("Status Code: ", request.status_code)
    # Take a break
    time.sleep(5)
    # Make a soup variable holding the response content
    soup = BeautifulSoup(request.content, "html.parser")
    if soup == None:
        description = 'error'
    else:
        # Print the page's title
        print(soup.title.string)
        # Find the section that contains job description
        description = soup.find('div', id="jobDescriptionText")
        if description == None:
            description = 'error'
        else:
            description = description.text
    return description

def job_links_and_contents_indeed(job_cards):
    '''
    This function pulls the job links and descriptions from a set of job cards.
    '''
    # Create a list to hold the links and descriptions
    links = []
    descriptions = []
    # For loop through the job cards to pull the links and descriptions
    for job in job_cards:
        link = job.find('a')['href']
        link = 'https://www.indeed.com' + link
        link = link.replace(';', '&')
        description = acuqire_indeed_job_description(link)
        links.append(link)
        descriptions.append(description)
    return links, descriptions

In [32]:
# Test the function: job_links_and_contents_indeed
links, descriptions = job_links_and_contents_indeed(job_cards)

Status Code:  200
Sr. Associate, Data Science - Machine Learning - Plano, TX 75023 - Indeed.com
Status Code:  200
Intern, Data Science - Austin, TX - Indeed.com
Status Code:  200
CIB Wholesale Payments Data and Analytics - Data Scientist Lead - Plano, TX - Indeed.com
Status Code:  200
Senior Director, Statistical Innovation & Data Science (Clinical Trial Design) - Texas - Indeed.com
Status Code:  200
Senior Software Engineer (Analytics and Machine Learning) - Texas - Indeed.com
Status Code:  200
Senior AI/ML Engineer - Houston, TX - Indeed.com
Status Code:  200
Artificial Intelligence, Consultant - Applied Artificial Intelligence - Austin, TX - Indeed.com
Status Code:  200
Senior Data Scientist - Austin, TX 78701 - Indeed.com
Status Code:  200
Data Analyst Specialist (Work-From-Home) - Austin, TX - Indeed.com
Status Code:  200
IT Data Science Analyst - Houston, TX 77041 - Indeed.com
Status Code:  200
NLP engineer - Irving, TX 75016 - Indeed.com
Status Code:  200
Single Family - Data Sc

In [33]:
links

['https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0C3j_zLGvpMLCdiZ0WC46XqVTA1VMZzOzKXPhAXwYlrNag_LLRX9rPXfbLEbI4C0P49sxzBZwM79MdEkCWfDF7fHP-DQMHOTAbkJ1h7_NaF_PepPdFItbxIPXAHIBR3Gx6Grlo30x6cANZ81229sjdTjX5u6PApWEhTl-73SQokWu9unYRlfz4xWpbZ96BdLFvZlPJUTJQi20sKQ_IJ7nK1_fNSmLkElHGXRDPaMCb-0BZTXng4KBGVdo5JGahHccSpzKQRkHqYN-w-TeCTz-dIe8Kkd4uRwzfBtMEuSv8f4mswmicl-NgPD12D4rZtuXPQkq1YwOWNco5agukPHlFLHc0lrDzGKv3iCxTe2Y1TQmECeipX7qa0&p=0&fvj=0&vjs=3',
 'https://www.indeed.com/rc/clk?jk=aad4e6fbe98bb6cc&fccid=a4b7e90c6a891db3&vjs=3',
 'https://www.indeed.com/rc/clk?jk=6dd19608daec7cdd&fccid=aaf3b433897ea465&vjs=3',
 'https://www.indeed.com/rc/clk?jk=c850aafd77bd973b&fccid=0ca0607c69909b05&vjs=3',
 'https://www.indeed.com/rc/clk?jk=48d6a46092063e0f&fccid=9e35787f589fa60f&vjs=3',
 'https://www.indeed.com/rc/clk?jk=896626f64a289715&fccid=c2a63affe8751868&vjs=3',
 'https://www.indeed.com/rc/clk?jk=c918116c06b236bc&fccid=9e215d88a6b33622&vjs=3',
 'https://www.indeed.com/company/dMASS/jobs/Senior-Data

In [34]:
descriptions[0]

"Plano 7 (31067), United States of America, Plano, Texas\nSr. Associate, Data Science - Machine Learning\nAs a Data Scientist at Capital One's Auto Finance business, you’ll be part of a high performing modeling, analytics, product development team that’s leading the next wave of disruption at a whole new scale. Our team has relentless focus on the craft of modeling and innovation, using the latest in computing and machine learning technologies and operating across billions of customer records to unlock the big opportunities that help everyday people save money, time and agony in their financial lives. You will utilize your strong technical skills to scale machine learning models and products into real-time through designing, building complex data pipelines, and deploying models in our leading-edge cloud-based technology platforms. You will also have the opportunity to develop open-source packages and tools for our community and outside Capital One.\nTeam Description:\nCapital One's Aut

In [35]:
# Define a function to create a Soup object based on a job search url

def page_soup_indeed(url):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of a page for a job searching results at Indeed.com
    '''
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [36]:
# Test the function: page_soup_indeed

url = 'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date'
soup = page_soup_indeed(url)
type(soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Texas | Indeed.com


bs4.BeautifulSoup

In [37]:
# Find out the page number
page_num_indeed(soup)

'1'

In [38]:
# Pull the job cards from the soup
type(job_cards_indeed(soup))

bs4.element.ResultSet

In [39]:
# Define a function to pull job information from a job search URL

def acquire_page_indeed(url):
    '''
    This function accepts a job search URL and returns the page number and
    a pandas dataframe containing job title, location, company, company rating, 
    post age and description. 
    '''
    # Create a Soup object based on the url
    soup = page_soup_indeed(url)
    # Pull the page number
    page_num = page_num_indeed(soup)
    # Pull the job cards
    job_cards = job_cards_indeed(soup)
    # Pull the job titles
    titles = job_titles_indeed(job_cards)   
    # Pull the names of the companies
    companies = company_names_indeed(job_cards)
    # Pull the post ages
    ages = post_ages_indeed(job_cards)
    # Pull the job locations
    locations = job_locations_indeed(job_cards)
    # Pull the company ratings
    ratings = company_rating_indeed(job_cards)
    # Pull the hyperlinks and job description
    links, descriptions = job_links_and_contents_indeed(job_cards)    
    # Create a dataframe
    d = {'title': titles,
         'locations': locations,
         'company': companies, 
         'company_rating': ratings,
         'post_age': ages, 
         'job_link': links, 
         'job_description': descriptions}
    df = pd.DataFrame(d)
    return page_num, df

In [40]:
# Test function acquire_page_indeed
page_num, df = acquire_page_indeed(url)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Texas | Indeed.com
Status Code:  200
Sr. Associate, Data Science - Machine Learning - Plano, TX 75023 - Indeed.com
Status Code:  200
Intern, Data Science - Austin, TX - Indeed.com
Status Code:  200
CIB Wholesale Payments Data and Analytics - Data Scientist Lead - Plano, TX - Indeed.com
Status Code:  200
Senior Director, Statistical Innovation & Data Science (Clinical Trial Design) - Texas - Indeed.com
Status Code:  200
Senior Software Engineer (Analytics and Machine Learning) - Texas - Indeed.com
Status Code:  200
Senior AI/ML Engineer - Houston, TX - Indeed.com
Status Code:  200
Artificial Intelligence, Consultant - Applied Artificial Intelligence - Austin, TX - Indeed.com
Status Code:  200
Senior Data Scientist - Austin, TX 78701 - Indeed.com
Status Code:  200
Data Analyst Specialist (Work-From-Home) - Austin, TX - Indeed.com
Status Code:  200
IT Data Science An

In [41]:
# Print the page number
page_num

'1'

In [42]:
df.job_description[14]

"A career in our Digital and Applications Design practice, within Application and Emerging Technology services, will provide you with a unique opportunity to help our clients identify and prioritise emerging technologies that can help solve their business problems. We help our clients design approaches to integrate new technologies, skills, and processes so they can get the most out of their technology investment and drive business results and innovation. Our team helps organisations align their business and operational requirements through the careful design of digital platforms and applications. You’ll help our clients with application optimisation, strategic integration of custom packaged solutions like Enterprise Resource Planning and Customer Relationship Management, and roadmap development.\n\nTo really stand out and make us fit for the future in a constantly changing world, each and every one of us at PwC needs to be a purpose-led and values-driven leader at every level. To help

In [43]:
def jobs_indeed(job_title, location):
    '''
    '''
    # Generate the urls based on job title and location (state)
    urls = urls_indeed(job_title, location)
    # Set up an counter
    counter = 0
    # Create an empty dataframe to hold the job information
    df_jobs = pd.DataFrame()
    # For loop through the urls to pull job information
    for url in urls:
        counter = counter+1
        page_num, df = acquire_page_indeed(url)
        print("--------------------------------")
        print("Page: ", counter)
        print("--------------------------------")
        time.sleep(180)
        if int(page_num) == counter:
            df_jobs = df_jobs.append(df)
            continue
        if int(page_num) < counter:
            break
    # Print the total number of jobs
    print(f"Total number of {job_title} positions in {location}: ", df_jobs.shape[0])
    return df_jobs

In [None]:
# Test function jobs_in_state_indeed
df = jobs_indeed('data scientist', 'tx')

In [49]:
# Print a concise summary of the df
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454 entries, 0 to 13
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            454 non-null    object
 1   locations        454 non-null    object
 2   company          454 non-null    object
 3   company_rating   454 non-null    object
 4   post_age         454 non-null    object
 5   job_link         454 non-null    object
 6   job_description  454 non-null    object
dtypes: object(7)
memory usage: 28.4+ KB
