In [1]:
import numpy as np
import pandas as pd

import urllib
import requests
from bs4 import BeautifulSoup

### URL Format of Indeed.com
1. Search chemist in TX<br>
https://www.indeed.com/jobs?q=chemist&l=TX
2. Search chemist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=chemist&l=San+Antonio%2C+TX
3. Search data scientist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX
4. Search data scientist intern in San Anotnio, TX<br>
https://www.indeed.com/jobs?q=data+scientist+intern&l=San+Antonio%2C+TX
5. Sort the data scientist jobs posting by date<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX&sort=date

### URL Format of Monster.com
https://www.monster.com/jobs/search/?q=data-scientist&where=San-Antonio__2C-TX

**Takeaways**
1. q = job title
2. l = location

In [2]:
# Create the base url
base_url = 'https://www.indeed.com/jobs?'

In [3]:
# Create the relative url for searching the data scientist jobs in San Antonio, TX

ds = {'q': 'data scientist', 'l': 'San Antonio, TX'}
ds = urllib.parse.urlencode(ds)
ds

'q=data+scientist&l=San+Antonio%2C+TX'

In [4]:
# Create the full url for searching the data scientist jobs in San Antonio, TX
url = base_url + ds

# Sanity check
url

'https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX'

In [13]:
# Make the request
page = requests.get(url)

# Sanity check: to make sure what we are looking at is indeed HTML data
page.text[:400]

'<!DOCTYPE html>\n<html lang="en" dir="ltr">\n<head>\n<meta http-equiv="content-type" content="text/html;charset=UTF-8">\n<script type="text/javascript" src="//d3fw5vlhllyvee.cloudfront.net/s/ee8d2b7/en_US.js"></script>\n<link href="//d3fw5vlhllyvee.cloudfront.net/s/64feb87/jobsearch_all.css" rel="stylesheet" type="text/css">\n<link rel="alternate" type="application/rss+xml" title="Data Scientist Jobs, E'

In [14]:
# Make a soup variable holding the response content
soup = BeautifulSoup(page.content, "html.parser")

In [15]:
# Get the page's title
soup.title.string

'Data Scientist Jobs, Employment in San Antonio, TX | Indeed.com'

In [16]:
# Find the appropriate <div> that contains all of the job listings in the 1st page
soup_jobs = soup.find(id="resultsCol")

# Print the data type of soup_jobs
type(soup_jobs)

bs4.element.Tag

In [17]:
# Find all job listing
job_cards = soup_jobs.find_all('div', class_='jobsearch-SerpJobCard')

# Print the data type of job_cards
type(job_cards)

bs4.element.ResultSet

**Quick Note**: job_cards is an iterator

In [18]:
# How many jobs listed in the 1st page? 
len(job_cards)

13

In [19]:
# Print the job titles

titles = []

for job in job_cards:
    title = job.find('h2', class_='title')
    title = title.text.strip()
    titles.append(title)

titles

['Data Scientist',
 'Data Scientist',
 'Senior Statistical Programmer',
 'Data Scientist\nnew',
 'Sports Statistician\nnew',
 'Data Scientist\nnew',
 'Data Scientist - 100% Remote Available\nnew',
 'Sr Data Scientist (Big Data)\nnew',
 'Data Scientist',
 'Data Scientist - Nationwide Opportunities',
 'Decision Science Analyst I\nnew',
 'Data Scientist',
 'Senior Data Analyst']

In [20]:
# Print companies that post the jobs

companies = []

for job in job_cards:
    company = job.find('span', class_='company')
    company = company.text.strip()
    companies.append(company)

companies

['Sourceability',
 'Alaka`ina Foundation Family of Companies',
 'Translational Drug Development (TD2)',
 'Pinnacle Inc',
 'Life Time',
 'Teledyne Brown Engineering',
 'USAA',
 'Deloitte',
 'iHeartMedia, Inc.',
 'Amazon Web Services, Inc.',
 'USAA',
 'Huntington Ingalls Industries Inc.',
 'Insight Global']

In [21]:
# Print out the dates for each job post

dates = []

for job in job_cards:
    date = job.find('span', class_='date')
    date = date.text.strip()
    dates.append(date)

dates

['30+ days ago',
 '30+ days ago',
 '30+ days ago',
 'Today',
 'Today',
 '3 days ago',
 '1 day ago',
 '3 days ago',
 '30+ days ago',
 '30+ days ago',
 '1 day ago',
 '30+ days ago',
 '30+ days ago']

In [36]:
# Print out the relative links for each job post

links = []

for job in job_cards:
    link = job.find('a')['href']
    link = 'https://www.indeed.com' + link
    link = link.replace(';', '&')
    links.append(link)

links

['https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0ADvRi4Fhzo6Lhixq24Efpt7RwpjKDyUQihLKB9436vvhUCqFEm-nkW0la8YnoGNEFh8i3JW6ZTXhE4DkVrso5T3Vz8yXJdq2dJLsgTANS_yKbR-JYOMuO2e5aBVxks2k6rMMP9qB20LAKyRlZvk7-84WO46AyQ2moZGjxCJmldcEsitmXLLHDrDCiKyI_Um6C17Av3wxj40vj0_JVHEhvHuJp0dbnwSNg__29E9tPghf-Y_5IW9e9AJd0-mhzFvPzcDkeMl3ZM9Ja9xyEFsihlpjS3wDqtCEECp334F-mXB43RLl86zjD3QAQWOi3CJYz3noGbL3ru8v75zxKElhvk7N8U46U5eVFd6A7sC9Eod2K-_28f9C3En0J83SiY-b16xrQf1gTQaeq8ket5b_DIWDvbp3M1F3gvWslN509rFgJzpWNkOwoygydZt5s4T6ElCnHtCsoX9ZA-DE0Z9Ue1RBprcyjr2nQ=&p=0&fvj=1&vjs=3',
 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0ChVmeEeaXfnw-D8ZhSBdgPTvIDVQB7yRHpNgp4If4E6BPeUXihMwd6UnesnvFhRawTBE6lMAHmma4yWjnBU0_rsltaaHuLFeKWQ_Pt3WebnFJnjdnPIvTKvYOZRuHGkGRcT9n4y2Vywlao5_EUq4WuPRcpiP881XlBl2gEZX2clHScriIuHec2fGorjjYUT5-hhf4060StLbFG1OmWmVNtUz2MnNaJynUlAwjDvqScYjDCq_wnkWpM6VBKKLwKATo5SVvy7xScrUSKTjmyzg_uz0SUA3bR24iwro_z3wDKQ0JhOroTeB8RIajzH__V1vh1pM5gjRhxlbx6SDe0CNwrea8AFEoco1JbxsHdiBDaWBC14BA7mCDCclPU3V6UeJ6YRncCa