In [87]:
# General Libraries
import numpy as np
import pandas as pd

# Web Scraping Libraries
import urllib
import requests
from bs4 import BeautifulSoup

# Regex Library
import re

from tqdm.notebook import tqdm

### URL Format of Indeed.com
1. Search chemist in TX<br>
https://www.indeed.com/jobs?q=chemist&l=TX
2. Search chemist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=chemist&l=San+Antonio%2C+TX
3. Search data scientist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX
4. Search data scientist intern in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist+intern&l=San+Antonio%2C+TX
5. Sort the data scientist jobs posting by date<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX&sort=date

**Takeaways**
1. q = job title
2. l = location
3. sort = sort order of the results (e.g. `date`, as in example 5)

### URL Format of Monster.com
https://www.monster.com/jobs/search/?q=data-scientist&where=San-Antonio__2C-TX

### Generate the URL of a Job Search at Indeed.com

In [5]:
def url_indeed(job_title, location, sort_by='date'):
    '''
    Build the URL of a job search at Indeed.com.

    Parameters
    ----------
    job_title : str
        Search keywords, e.g. 'data scientist' (sent as the `q` parameter).
    location : str
        Where to search, e.g. 'San Antonio, TX' (sent as the `l` parameter).
    sort_by : str, optional
        Value of the `sort` parameter. Defaults to 'date', which is the value
        this function always used before the parameter was added.

    Returns
    -------
    str
        The full search URL with a percent-encoded query string.
    '''
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import urlencode

    # Base URL for a job search at Indeed.com
    base_url = 'https://www.indeed.com/jobs?'
    # urlencode escapes spaces, commas, etc. and keeps the key order q, l, sort
    query_string = urlencode({'q': job_title, 'l': location, 'sort': sort_by})
    return base_url + query_string

In [72]:
# Test the function with a sample search; the last line displays the URL.
# `url` is the address requested by the HTTP request cell that follows.
url = url_indeed('data scientist', 'tx')
url

'https://www.indeed.com/jobs?q=data+scientist&l=tx&sort=date'

### Make the HTTP Request

In [73]:
# Make the HTTP request for the search URL built above
response = requests.get(url)

# Print the status code (200 means the request succeeded)
print("Status Code: ", response.status_code)
    
# Sanity check: the first 100 characters should look like an HTML document
print(response.text[:100])

Status Code:  200
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<meta http-equiv="content-type" content="text/html


In [74]:
# Parse the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(response.content, "html.parser")
print(type(soup))
# The <title> text confirms which search page was downloaded
print("Soup Title: ",soup.title.string)

<class 'bs4.BeautifulSoup'>
Soup Title:  Data Scientist Jobs, Employment in Texas | Indeed.com


In [86]:
# Find the tag that contains the number of jobs by searching for its id

num_jobs = soup.find('div', id='searchCountPages')
# Inspect the tag: its Python type, tag name and HTML attributes
print("Data Type: ", type(num_jobs))
print("Name of the Tag: ", num_jobs.name)
print("Attributes of the Tag: ", num_jobs.attrs)
print("Text within the Tag: ")
# Last expression: displays the raw text, including its leading whitespace
num_jobs.text

Data Type:  <class 'bs4.element.Tag'>
Name of the Tag:  div
Attributes of the Tag:  {'id': 'searchCountPages'}
Text within the Tag: 


'\n                    Page 1 of 578 jobs'

In [113]:
# Find the number of the jobs in the text
# The text looks like 'Page 1 of 578 jobs'. Commas are allowed inside a number
# so that a count written with a thousands separator (e.g. '1,234') is not
# split into '1' and '234'. The total is taken as the LAST number in the text
# instead of a fixed position.
# NOTE(review): assumes large counts are rendered with commas — confirm on a
# search that returns more than 999 jobs.
match = re.findall(r'\d[\d,]*', num_jobs.text)
match[-1].replace(',', '')

'578'

In [76]:
# Find out the number of HTML pages.
# Every <span class="pn"> is collected and its .string is printed, one per line.
# NOTE(review): the last span prints None in the captured output — presumably a
# "next page" control without a single text child; confirm before parsing these.
num_pages = soup.find_all('span', class_='pn')

for num in num_pages:
    print(num.string)

2
3
4
5
None


In [16]:
# Find the element with id="resultsCol", which contains all of the job listings
# in the 1st page (acquire_indeed looks up the same id on a <td> tag)
soup_jobs = soup.find(id="resultsCol")

# Display the data type of soup_jobs
type(soup_jobs)

bs4.element.Tag

In [17]:
# Find all job listings: each one is a <div class="jobsearch-SerpJobCard">
job_cards = soup_jobs.find_all('div', class_='jobsearch-SerpJobCard')

# Display the data type of job_cards
type(job_cards)

bs4.element.ResultSet

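**Quick Note**: job_cards is a `ResultSet` — a list-like iterable of tags that supports `len()` and can be looped over more than once (it is not a one-shot iterator)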

In [18]:
# How many jobs are listed in the 1st page?
len(job_cards)

13

In [19]:
# Collect the title of every job card and display the list.
# Each title is the stripped text of the card's <h2 class="title"> tag.
# NOTE(review): in the captured output some titles end in '\nnew' — presumably
# a badge inside the heading; confirm and strip it if it is unwanted.
titles = [job.find('h2', class_='title').text.strip() for job in job_cards]

titles

['Data Scientist',
 'Data Scientist',
 'Senior Statistical Programmer',
 'Data Scientist\nnew',
 'Sports Statistician\nnew',
 'Data Scientist\nnew',
 'Data Scientist - 100% Remote Available\nnew',
 'Sr Data Scientist (Big Data)\nnew',
 'Data Scientist',
 'Data Scientist - Nationwide Opportunities',
 'Decision Science Analyst I\nnew',
 'Data Scientist',
 'Senior Data Analyst']

In [20]:
# Collect the company that posted each job and display the list.
# The name is the stripped text of the card's <span class="company"> tag.
companies = [job.find('span', class_='company').text.strip() for job in job_cards]

companies

['Sourceability',
 'Alaka`ina Foundation Family of Companies',
 'Translational Drug Development (TD2)',
 'Pinnacle Inc',
 'Life Time',
 'Teledyne Brown Engineering',
 'USAA',
 'Deloitte',
 'iHeartMedia, Inc.',
 'Amazon Web Services, Inc.',
 'USAA',
 'Huntington Ingalls Industries Inc.',
 'Insight Global']

In [21]:
# Collect the posting age of each job (e.g. 'Today', '3 days ago') and display
# the list. It is the stripped text of the card's <span class="date"> tag.
dates = [job.find('span', class_='date').text.strip() for job in job_cards]

dates

['30+ days ago',
 '30+ days ago',
 '30+ days ago',
 'Today',
 'Today',
 '3 days ago',
 '1 day ago',
 '3 days ago',
 '30+ days ago',
 '30+ days ago',
 '1 day ago',
 '30+ days ago',
 '30+ days ago']

In [36]:
# Build the absolute link of each job post and display the list.
# The href of the first <a> in a card is relative, so the site root is
# prepended; every ';' in the result is then replaced by '&'.
links = [
    ('https://www.indeed.com' + job.find('a')['href']).replace(';', '&')
    for job in job_cards
]

links

['https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0ADvRi4Fhzo6Lhixq24Efpt7RwpjKDyUQihLKB9436vvhUCqFEm-nkW0la8YnoGNEFh8i3JW6ZTXhE4DkVrso5T3Vz8yXJdq2dJLsgTANS_yKbR-JYOMuO2e5aBVxks2k6rMMP9qB20LAKyRlZvk7-84WO46AyQ2moZGjxCJmldcEsitmXLLHDrDCiKyI_Um6C17Av3wxj40vj0_JVHEhvHuJp0dbnwSNg__29E9tPghf-Y_5IW9e9AJd0-mhzFvPzcDkeMl3ZM9Ja9xyEFsihlpjS3wDqtCEECp334F-mXB43RLl86zjD3QAQWOi3CJYz3noGbL3ru8v75zxKElhvk7N8U46U5eVFd6A7sC9Eod2K-_28f9C3En0J83SiY-b16xrQf1gTQaeq8ket5b_DIWDvbp3M1F3gvWslN509rFgJzpWNkOwoygydZt5s4T6ElCnHtCsoX9ZA-DE0Z9Ue1RBprcyjr2nQ=&p=0&fvj=1&vjs=3',
 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0ChVmeEeaXfnw-D8ZhSBdgPTvIDVQB7yRHpNgp4If4E6BPeUXihMwd6UnesnvFhRawTBE6lMAHmma4yWjnBU0_rsltaaHuLFeKWQ_Pt3WebnFJnjdnPIvTKvYOZRuHGkGRcT9n4y2Vywlao5_EUq4WuPRcpiP881XlBl2gEZX2clHScriIuHec2fGorjjYUT5-hhf4060StLbFG1OmWmVNtUz2MnNaJynUlAwjDvqScYjDCq_wnkWpM6VBKKLwKATo5SVvy7xScrUSKTjmyzg_uz0SUA3bR24iwro_z3wDKQ0JhOroTeB8RIajzH__V1vh1pM5gjRhxlbx6SDe0CNwrea8AFEoco1JbxsHdiBDaWBC14BA7mCDCclPU3V6UeJ6YRncCa

In [45]:
# Acquire the data from the USAA job post

# Assign the url to a new variable.
# Index 6 is the USAA posting in the results captured above; the position
# depends on what the search returned when the notebook was run.
url_usaa = links[6]

# Make the HTTP request
page_usaa = requests.get(url_usaa)

# Check the status of the request
page_usaa.status_code

200

In [47]:
# Parse the job posting page with Python's built-in HTML parser
soup_usaa = BeautifulSoup(page_usaa.content, "html.parser")

# Display the page's title to confirm the right posting was downloaded
soup_usaa.title.string

'Data Scientist - 100% Remote Available - San Antonio, TX 78288 - Indeed.com'

In [56]:
# Find the section that contains the job description: <div id="jobDescriptionText">

job_description = soup_usaa.find('div', id="jobDescriptionText")
type(job_description)

bs4.element.Tag

In [66]:
# Display the name of the tag
job_description.name

'div'

In [67]:
# Display the attributes of the tag as a dictionary
job_description.attrs

{'id': 'jobDescriptionText', 'class': ['jobsearch-jobDescriptionText']}

In [82]:
# Display the contents of the tag: a list of its direct children (markup kept)

print("Its data type is: ", type(job_description.contents))
job_description.contents

Its data type is:  <class 'list'>


[<div><h2 class="jobSectionHeader"><b>Purpose of Job</b></h2>
 We are currently seeking a talented Data Scientist - 100% Remote Available for the San Antonio Home Office I.<br/>
 <br/>
 Uses advanced techniques that integrate traditional and non-traditional datasets and method to enable analytical solutions; Applies predictive analytics, machine learning, simulation, and optimization techniques to generate management insights and enable customer-facing applications; participates in building analytical solutions leveraging internal and external applications to deliver value and create competitive advantage; Translates complex analytical and technical concepts to non-technical employees<p></p><h2 class="jobSectionHeader"><b>
 Job Requirements</b></h2><p><b>
 About USAA</b></p><p>
 USAA knows what it means to serve. We facilitate the financial security of millions of U.S. military members and their families. This singular mission requires a dedication to innovative thinking at every level

In [83]:
# Display the contents of the tag as one string, with the HTML markup removed

print("Its data type is: ", type(job_description.text))
job_description.text

Its data type is:  <class 'str'>


"Purpose of Job\nWe are currently seeking a talented Data Scientist - 100% Remote Available for the San Antonio Home Office I.\n\nUses advanced techniques that integrate traditional and non-traditional datasets and method to enable analytical solutions; Applies predictive analytics, machine learning, simulation, and optimization techniques to generate management insights and enable customer-facing applications; participates in building analytical solutions leveraging internal and external applications to deliver value and create competitive advantage; Translates complex analytical and technical concepts to non-technical employees\nJob Requirements\nAbout USAA\nUSAA knows what it means to serve. We facilitate the financial security of millions of U.S. military members and their families. This singular mission requires a dedication to innovative thinking at every level.\nIn each of the past five years, we've been a top-40 Fortune 100 Best Companies to Work For®, and we've ranked among Vi

### Build Helper Function To Acquire Job Posting Information From Indeed.com

In [74]:
def acuqire_indeed_job_description(url, timeout=30):
    '''
    Download one Indeed.com job posting and return its description text.

    The HTTP status code and the page title are printed as progress output.
    NOTE: the function name is misspelled ('acuqire'); it is kept unchanged
    because acquire_indeed calls it by this name.

    Parameters
    ----------
    url : str
        Absolute URL of the job posting.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Defaults to 30.

    Returns
    -------
    str or None
        Text of the <div id="jobDescriptionText"> element, or None when the
        page has no such element.
    '''
    # A timeout keeps one stalled connection from hanging the whole scrape
    response = requests.get(url, timeout=timeout)
    print("Status Code: ", response.status_code)

    # Make a soup variable holding the response content
    soup = BeautifulSoup(response.content, "html.parser")

    # Print the page's title; a page without <title> prints None instead of
    # raising AttributeError
    print(soup.title.string if soup.title is not None else None)

    # Find the section that contains the job description
    description = soup.find('div', id="jobDescriptionText")
    if description is None:
        # No description block on this page: report it as missing instead of
        # crashing on `None.text`
        return None

    return description.text

In [75]:
# Test function acuqire_indeed_job_description on a single posting.
# Note: this overwrites the `url` variable that held the search URL earlier.

url = 'https://www.indeed.com/company/PINNACLE-INC/jobs/Data-Scientist-bebdc92b14d2a19a?fccid=8619c9c3e5c7c61f&vjs=3'
acuqire_indeed_job_description(url)

Status Code:  200
Data Scientist - San Antonio, TX - Indeed.com


'Job Title:  Data ScientistLocation:  San Antonio, TXDuration:  FulltimeRole DescriptionCoordinate with different functional teams to implement models and monitor outcomes.Develop processes and tools to monitor and analyze model performance and data accuracy.Coding knowledge and experience with several languages: Python or any OO languageKnowledge and experience in statistical and data mining techniques: GLM/Regression, Random Forest, Boosting, Trees, text mining, social network analysis, etc.Experience querying databases SQL, etc.Experience creating and using advanced machine learning algorithms and statistics along with NLP: regression, simulation, scenario analysis, modeling, clustering, decision trees, neural networks, etc.Experience with distributed data/computing tools: Map/Reduce, Hadoop, Hive, Spark, Gurobi, MySQL, etc.Knowledge on model risk managementSupport documentation - model development, model testingJob Type: Full-timeWork Remotely:Temporarily due to COVID-19'

In [83]:
def _indeed_card_text(card, tag_name, css_class):
    '''Return the stripped text of the first matching tag in `card`, or None if it is absent.'''
    tag = card.find(tag_name, class_=css_class)
    return tag.text.strip() if tag is not None else None


def acquire_indeed(job_title, location, sort_by, timeout=30):
    '''
    Scrape the first page of an Indeed.com job search into a DataFrame.

    For every job card on the first results page, the title, company, posting
    date and link are collected, and the posting itself is downloaded with
    acuqire_indeed_job_description to get its description. Status codes, page
    titles and the number of cards are printed as progress output.

    Parameters
    ----------
    job_title : str
        Search keywords (the `q` query parameter).
    location : str
        Search location (the `l` query parameter); also stored in every row.
    sort_by : str
        Value of the `sort` query parameter, e.g. 'date'.
    timeout : float, optional
        Seconds to wait for the search page before requests raises a timeout
        error. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        One row per job card with the columns title, location, company,
        dates, link and description. A field that is missing from a card is
        None. The frame is empty (same columns) when no job cards are found.
    '''
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import urlencode

    # Create the full url from the base url and the encoded search parameters
    base_url = "https://www.indeed.com/jobs?"
    url = base_url + urlencode({'q': job_title, 'l': location, 'sort': sort_by})

    # Make the HTTP request
    first_page = requests.get(url, timeout=timeout)

    # Print the status code
    print("Status Code: ", first_page.status_code)

    # Make a soup variable holding the response content
    soup = BeautifulSoup(first_page.content, "html.parser")

    # Print the page's title (None when the page has no <title>)
    print("Page's Title: ", soup.title.string if soup.title is not None else None)

    # Find the <td> that contains all of the job listings in the 1st page.
    # If it is missing, treat the page as having no job cards instead of
    # crashing on None.
    job_results = soup.find('td', id="resultsCol")
    if job_results is None:
        job_cards = []
    else:
        job_cards = job_results.find_all('div', class_='jobsearch-SerpJobCard')

    # Print the number of jobs listed in 1st page
    print("Number of jobs in this page: ", len(job_cards))

    # Build one record per card in a single pass so the columns cannot get out
    # of step with each other
    records = []
    for job in tqdm(job_cards):
        anchor = job.find('a')
        if anchor is not None and anchor.get('href'):
            # The href is relative; prepend the site root, then replace every
            # ';' with '&' as the original code did
            link = ('https://www.indeed.com' + anchor['href']).replace(';', '&')
            description = acuqire_indeed_job_description(link)
        else:
            # A card without a usable link has no posting to download
            link = None
            description = None

        records.append({
            'title': _indeed_card_text(job, 'h2', 'title'),
            'location': location,
            'company': _indeed_card_text(job, 'span', 'company'),
            'dates': _indeed_card_text(job, 'span', 'date'),
            'link': link,
            'description': description,
        })

    # Explicit columns keep the column order and give an empty result the
    # same layout
    columns = ['title', 'location', 'company', 'dates', 'link', 'description']
    return pd.DataFrame(records, columns=columns)

In [84]:
# Test function acquire_indeed: data scientist jobs in San Antonio, TX,
# sorted by date. The resulting DataFrame is kept in `return_`.
return_ = acquire_indeed("data scientist", 'San Antonio, TX', 'date')

Status Code:  200
Page's Title:  Data Scientist Jobs, Employment in San Antonio, TX | Indeed.com
Number of jobs in this page:  15


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

Status Code:  200
Cloud Data Services Associate - San Antonio, TX 78206 - Indeed.com
Status Code:  200
Machine Learning and Artificial Intelligence Developer - San Antonio, TX 78232 - Indeed.com
Status Code:  200
Machine Learning Engineer - United States - Indeed.com
Status Code:  200
Data Scientist - San Antonio, TX - Indeed.com
Status Code:  200
Sports Statistician - San Antonio, TX - Indeed.com
Status Code:  200
Machine Learning Engineer - San Antonio, TX - Indeed.com
Status Code:  200
Data Scientist - 100% Remote Available - San Antonio, TX 78288 - Indeed.com
Status Code:  200
Decision Science Analyst I - San Antonio, TX 78288 - Indeed.com
Status Code:  200
Decision Science Analyst Senior (Remote Work Location Available) - San Antonio, TX 78288 - Indeed.com
Status Code:  200
Lead Financial Analyst - Artificial Intelligence Strategic Growth Offering (AI SGO) Finance & Investment - San Antonio, TX 78232 - Indeed.com
Status Code:  200
Data Scientist - San Antonio, TX - Indeed.com
Stat

In [79]:
# Display the scraped job postings
return_

Unnamed: 0,title,location,company,dates,link,description
0,Cloud Data Services Associate\nnew,"San Antonio, TX",PwC,Today,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,A career in our Digital and Applications Desig...
1,Machine Learning and Artificial Intelligence D...,"San Antonio, TX",Deloitte,Today,https://www.indeed.com/rc/clk?jk=de18c1f025cc4...,"Are you an analytical, data-driven professiona..."
2,Machine Learning Engineer\nnew,"San Antonio, TX",Shrimpy,1 day ago,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,About UsShrimpy is the first crypto portfolio ...
3,Data Scientist\nnew,"San Antonio, TX",Pinnacle Inc,1 day ago,https://www.indeed.com/company/PINNACLE-INC/jo...,Job Title: Data ScientistLocation: San Anton...
4,Sports Statistician\nnew,"San Antonio, TX",Life Time,1 day ago,https://www.indeed.com/rc/clk?jk=3a224d3d3228b...,Position Summary\nThe Statistician assists in ...
5,Machine Learning Engineer\nnew,"San Antonio, TX",Pinnacle Inc,1 day ago,https://www.indeed.com/company/PINNACLE-INC/jo...,Job Title: Machine Learning EngineerLocation:...
6,Data Scientist - 100% Remote Available\nnew,"San Antonio, TX",USAA,2 days ago,https://www.indeed.com/rc/clk?jk=48c1334e45d90...,Purpose of Job\nWe are currently seeking a tal...
7,Decision Science Analyst I\nnew,"San Antonio, TX",USAA,2 days ago,https://www.indeed.com/rc/clk?jk=f5777dc3d4899...,Purpose of Job\nWe are currently seeking a tal...
8,Decision Science Analyst Senior (Remote Work L...,"San Antonio, TX",USAA,2 days ago,https://www.indeed.com/rc/clk?jk=63636e1b28331...,Purpose of Job\nWe are currently seeking a tal...
9,Lead Financial Analyst - Artificial Intelligen...,"San Antonio, TX",Deloitte,2 days ago,https://www.indeed.com/rc/clk?jk=dd0f7e4f169a7...,Lead Financial Analyst-Artificial Intelligence...


In [88]:
# Display the full description text of the posting at index 7
return_.description[7]

"Purpose of Job\nWe are currently seeking a talented Decision Science Analyst I for one of the following locations: San Antonio Home Office I, Colorado Springs Campus, Phoenix Campus, Tampa Crosstown, Tampa Campus, Chesapeake or possibly Remote.\n\nDecision Science Analyst I\n\nCommunity: Data & Analytics\n\nJobs that are part of a community have unique requirements established to maintain consistent application, usage and reporting structure. Please reach out to your HR Business Partner for additional information on specific requirements prior to posting and/or employee placement into this job.\n\nProvide decision support for business areas across the enterprise. Staff in this area will be responsible for applying mathematical and statistical techniques and/or innovative /quantitative analytical approaches to draw conclusions and make 'insight to action' recommendations to answer business objectives and drive change. The essence of work performed by the Decision Science Analyst involv

In [87]:
# Display the link of the posting at index 3
return_.link[3]

'https://www.indeed.com/company/PINNACLE-INC/jobs/Data-Scientist-bebdc92b14d2a19a?fccid=8619c9c3e5c7c61f&vjs=3'