## Demo: Web scraping with BeautifulSoup

this demo shows how to use BeautifulSoup to crawl job listing in indeed.

In [3]:
## Import the necessary packages
from bs4 import BeautifulSoup
import urllib
import re
import pandas as pd

## 1. Retrive job listing urls on one page

use indeed mobile web version since its html is simplier

In [5]:
from urllib.request import urlopen
url = "https://www.indeed.com/m/jobs?q=data+scientist&l=Los+Angeles%2C+CA"
page = urlopen(url)
soup = BeautifulSoup(page, 'lxml')

all_matches = soup.find_all('a', attrs={'rel':['nofollow']})
for i in all_matches:
    print (i['href'])
    print ("https://www.indeed.com/m/"+i['href'])

viewjob?jk=44b28b02acd2b73e
https://www.indeed.com/m/viewjob?jk=44b28b02acd2b73e
viewjob?jk=07b8b4564a3a16e5
https://www.indeed.com/m/viewjob?jk=07b8b4564a3a16e5
viewjob?jk=4d6bbce94353ef87
https://www.indeed.com/m/viewjob?jk=4d6bbce94353ef87
viewjob?jk=eb4cf8cb4ce7d255
https://www.indeed.com/m/viewjob?jk=eb4cf8cb4ce7d255
viewjob?jk=7df01a501894fa5b
https://www.indeed.com/m/viewjob?jk=7df01a501894fa5b
viewjob?jk=6fe04e20fc6b224b
https://www.indeed.com/m/viewjob?jk=6fe04e20fc6b224b
viewjob?jk=78b9e460c7b32aee
https://www.indeed.com/m/viewjob?jk=78b9e460c7b32aee
viewjob?jk=4895acf6c7f11e31
https://www.indeed.com/m/viewjob?jk=4895acf6c7f11e31
viewjob?jk=148461916cd9132d
https://www.indeed.com/m/viewjob?jk=148461916cd9132d
viewjob?jk=d8982f145f0914f3
https://www.indeed.com/m/viewjob?jk=d8982f145f0914f3


## 2. Visit each url to fetch the title, company, location and job description for each job listing

In [None]:
title = []
company = []
location = []
jd = []
for each in all_matches:
    jd_url= 'http://www.indeed.com/m/'+each['href']
    jd_page = urlopen(jd_url)
    jd_soup = BeautifulSoup(jd_page, 'lxml')
    jd_desc = jd_soup.findAll('div',attrs={'id':['desc']}) ## find the structure like: <div id="desc"></>
    title.append(jd_soup.body.p.b.font.text)
    company.append(jd_desc[0].span.text)
    location.append(jd_soup.body.p.span.text)
    jd.append(jd_desc[0].text)
    # break # Test on one position

In [7]:
location

['Los Angeles, CA 90032']

In [None]:
## Job Description
print(jd_desc[0].text)

In [8]:
## Job Title 
print(jd_soup.body.p.b.font.text)
print(title)

DATA SCIENTIST
['DATA SCIENTIST']


In [9]:
## Company Name
print(jd_desc[0].span.text)
print(jd_soup.body.p.span.previous_sibling.split('-')[0][1:])

California State University
California State University 


In [10]:
title

['DATA SCIENTIST']

#### Save the data into Data Frame

In [13]:
job = {'title': title,
         'company': company,
         'location': location,
         'Job Description': jd}
df = pd.DataFrame.from_dict(job)

In [14]:
df

Unnamed: 0,title,company,location,Job Description
0,DATA SCIENTIST,California State University,"Los Angeles, CA 90032",Campus:\nLos Angeles\n\nJob ID: 180220\n\nJob ...
1,Data Scientist Consultant,12 days ago,"Los Angeles, CA",Consultant - Data Scientist ArchitectWe seek d...
2,Data Scientist intern.,1 day ago,"Beverly Hills, CA",findSisterhood is looking for new full stack d...
3,Cannabis Tissue Culture Technician,9 days ago,"Los Angeles, CA 90021",Cannabis Tissue Culture TechnicianJob Summary:...
4,Molecular Genetics Tech,Kaiser Permanente,"Los Angeles, CA 90039",Under immediate supervision work rotations are...
5,Data Scientist/Visualization Master,InterMedia Advertising,"Los Angeles, CA 90067",We are seeking a Manager of Business Intellige...
6,Data Scientist,Meredith Corporation,"Los Angeles, CA",Job Title\nData Scientist\nJob Description\nBi...
7,Junior Data Scientist,Ranker,"Los Angeles, CA 90048",We're looking for a Junior Data Scientist to h...
8,Data Scientist,Amazon.com,"Santa Monica, CA",A Bachelor or Masters Degree in a highly quant...
9,Data Scientist,Sony Pictures Entertainment Inc.,"Culver City, CA 90232",Data Scientist\nWhat will be your mission?\nAt...


If we don't break the loop above, we can crawl all the job information from one page.

## 3. Go to next pages

In [15]:
title = []
company = []
location = []
jd = []
url = "https://www.indeed.com/m/jobs?q=data+scientist&l=Los+Angeles%2C+CA"
for i in range(2):
    
    page = urlopen(url)
    soup = BeautifulSoup(page, 'lxml')
    all_matches = soup.findAll(attrs={'rel':['nofollow']})
    for each in all_matches:
        jd_url= 'http://www.indeed.com/m/'+each['href']
        jd_page =urlopen(jd_url)
        jd_soup = BeautifulSoup(jd_page, 'lxml')
        jd_desc = jd_soup.findAll(attrs={'id':['desc']})
        title.append(jd_soup.body.p.b.font.text)
        company.append(jd_desc[0].span.text)
        location.append(jd_soup.body.p.span.text)
        jd.append(jd_desc[0].text)
        
    ## Change the pages to Next Page
    url_all = soup.findAll(attrs={'rel':['next']})
    url = 'http://www.indeed.com/m/'+ str(url_all[0]['href'])


In [16]:
job = {'title': title,
         'company': company,
         'location': location,
         'Job Description': jd}
df = pd.DataFrame.from_dict(job)

In [17]:
df

Unnamed: 0,title,company,location,Job Description
0,Data Scientist,Amazon.com,"Santa Monica, CA",A Bachelor or Masters Degree in a highly quant...
1,Data Scientist Consultant,12 days ago,"Los Angeles, CA",Consultant - Data Scientist ArchitectWe seek d...
2,Summer 2019 Internship-Machine Learning,Automatic Data Processing,"Pasadena, CA 91101",ADP is hiring an intern. In this position you ...
3,DATA SCIENTIST,California State University,"Los Angeles, CA 90032",Campus:\nLos Angeles\n\nJob ID: 180220\n\nJob ...
4,Entry Level Big Data Engineer,AT&T,"El Segundo, CA 90245",This role will focus on Big Data while providi...
5,"Full Stack Developer, Data Science & Insights",Netflix,"Los Angeles, CA","Los Angeles, California\nData Science and Engi..."
6,"Analyst, U.S. TV Research",Sony Pictures Entertainment Inc.,"Culver City, CA 90232","Analyst, U.S. TV Research\nThe Analyst, US Res..."
7,Data Scientist/Visualization Master,InterMedia Advertising,"Los Angeles, CA 90067",We are seeking a Manager of Business Intellige...
8,Data Scientist intern.,1 day ago,"Beverly Hills, CA",findSisterhood is looking for new full stack d...
9,Data Insight Analyst,Northrop Grumman,"Redondo Beach, CA 90278","At Northrop Grumman, our employees have incred..."
