In [4]:
import numpy as np
import pandas as pd

import urllib
import requests
from bs4 import BeautifulSoup

### URL Format of Indeed.com
1. Search chemist in TX<br>
https://www.indeed.com/jobs?q=chemist&l=TX
2. Search chemist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=chemist&l=San+Antonio%2C+TX
3. Search data scientist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX
4. Search data scientist intern in San Anotnio, TX<br>
https://www.indeed.com/jobs?q=data+scientist+intern&l=San+Antonio%2C+TX
5. Sort the data scientist jobs posting by date<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX&sort=date

### URL Format of Monster.com
https://www.monster.com/jobs/search/?q=data-scientist&where=San-Antonio__2C-TX

**Takeaways**
1. q = job title
2. l = location

In [69]:
# Create the base url
base_url = 'https://www.indeed.com/jobs?'

In [70]:
# Create the relative url for searching the data scientist jobs in San Antonio, TX

ds = {'q': 'data scientist', 'l': 'San Antonio, TX'}
ds = urllib.parse.urlencode(ds)
ds

'q=data+scientist&l=San+Antonio%2C+TX'

In [71]:
# Create the full url for searching the data scientist jobs in San Antonio, TX
url = base_url + ds

# Sanity check
url

'https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX'

In [95]:
# Make the request
page = requests.get(url)

# Sanity check: to make sure what we are looking at is indeed HTML data
page.text[:400]

'<!DOCTYPE html>\n<html lang="en" dir="ltr">\n<head>\n<meta http-equiv="content-type" content="text/html;charset=UTF-8">\n<script type="text/javascript" src="//d3fw5vlhllyvee.cloudfront.net/s/ee8d2b7/en_US.js"></script>\n<link href="//d3fw5vlhllyvee.cloudfront.net/s/64feb87/jobsearch_all.css" rel="stylesheet" type="text/css">\n<link rel="alternate" type="application/rss+xml" title="Data Scientist Jobs, E'

In [96]:
# Make a soup variable holding the response content
soup = BeautifulSoup(page.content, "html.parser")

In [97]:
# Get the page's title
soup.title.string

'Data Scientist Jobs, Employment in San Antonio, TX | Indeed.com'

In [98]:
# Find the appropriate <div> that contains all of the job listings in the 1st page
soup_jobs = soup.find(id="resultsCol")

# Print the data type of soup_jobs
type(soup_jobs)

bs4.element.Tag

In [99]:
# Find all job listing
job_cards = soup_jobs.find_all('div', class_='jobsearch-SerpJobCard')

# Print the data type of job_cards
type(job_cards)

bs4.element.ResultSet

**Quick Note**: job_cards is an iterator

In [100]:
# How many jobs listed in the 1st page? 
len(job_cards)

15

In [101]:
# Print the job titles

titles = []

for job in job_cards:
    title = job.find('h2', class_='title')
    title = title.text.strip()
    titles.append(title)

titles

['Data Scientist - 100% Remote Available\nnew',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist\nnew',
 'Sports Statistician\nnew',
 'Senior Data Analyst',
 'Data Scientist\nnew',
 'Sr Data Scientist (Big Data)\nnew',
 'Data Scientist',
 'Decision Science Analyst I\nnew',
 'Data Scientist - Nationwide Opportunities',
 'Data Scientist - RWE (Ref: RM)',
 'Data Scientist',
 'Data Engineer',
 'Data Architect/Scientist']

In [102]:
# Print companies that post the jobs

companies = []

for job in job_cards:
    company = job.find('span', class_='company')
    company = company.text.strip()
    companies.append(company)

companies

['USAA',
 'Alaka`ina Foundation Family of Companies',
 'Sourceability',
 'Pinnacle Inc',
 'Life Time',
 'Insight Global',
 'Teledyne Brown Engineering',
 'Deloitte',
 'iHeartMedia, Inc.',
 'USAA',
 'Amazon Web Services, Inc.',
 'PHASTAR',
 'Huntington Ingalls Industries Inc.',
 'Verizon',
 'Wind River']

In [106]:
# Print out the relative links for each job post

dates = []

for job in job_cards:
    date = job.find('span', class_='date')
    date = date.text.strip()
    dates.append(date)

dates

['1 day ago',
 '30+ days ago',
 '30+ days ago',
 'Today',
 'Today',
 '30+ days ago',
 '3 days ago',
 '3 days ago',
 '30+ days ago',
 '1 day ago',
 '30+ days ago',
 '27 days ago',
 '30+ days ago',
 '28 days ago',
 '30+ days ago']