### Import packages

In [122]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

### Get webpage

In [3]:
# get webpage
def simple_get(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the body if it looks like HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The raw response content when ``is_good_resp`` accepts the response,
        otherwise None. None is also returned when the request itself fails;
        in that case the error is reported through ``log_error``.
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # the response is rejected or reading the body raises.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_resp(resp):
                return resp.content
            return None

    except RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        log_error(f'Error during requests to {url}: {str(e)}')
        return None
    
def is_good_resp(resp):
    """Return True when ``resp`` is a successful response carrying HTML.

    Args:
        resp: Response-like object exposing ``status_code`` and a ``headers``
            mapping.

    Returns:
        bool: True only if the status code is 200 and the ``Content-Type``
        header mentions ``html`` (case-insensitive). A missing or empty
        header yields False instead of raising.
    """
    # .get() + `or ''` tolerates both an absent header and a None value,
    # so .lower() is never called on something that is not a string.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

def log_error(e):
    """Report an error by writing its string form to standard output.

    Args:
        e: Message or exception to display.
    """
    message = str(e)
    print(message)
    

In [116]:
# Fetch one page of guest job-search results: keywords "data scientist",
# location "Long Beach, California, United States" (both taken from the query
# string). `start=75` is presumably the offset into the result list.
raw_html = simple_get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start=75')
# Size of the downloaded payload. NOTE(review): simple_get returns None when
# the response is rejected, in which case len() raises TypeError.
len(raw_html)

30899

### Parse and filter using BeautifulSoup

In [92]:
# Parse the downloaded markup with Python's built-in 'html.parser' backend.
html1 = BeautifulSoup(raw_html, 'html.parser')
# Uncomment to inspect the parsed tree:
#print(html1.prettify())

In [119]:
# Build (and print) the search-result page URLs needed to cover a given number of jobs.
def get_urls(num_of_jobs, page_size=25):
    """Print and return the result-page URLs covering ``num_of_jobs`` listings.

    Args:
        num_of_jobs (int): Number of job listings to cover. Values <= 0
            produce no URLs.
        page_size (int): Step between successive ``start`` offsets; the
            default of 25 matches the step the notebook has always used.

    Returns:
        list[str]: One URL per page, with ``start`` offsets
        0, page_size, 2 * page_size, ... below ``num_of_jobs``. Each URL is
        also printed, one per line, as before.
    """
    urls = []

    for list_no in range(0, num_of_jobs, page_size):
        url = f'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start={list_no}'
        print(url)
        urls.append(url)

    return urls
        
# Demo: print the page URLs needed to cover 100 results (offsets 0, 25, 50, 75).
get_urls(100)

def _text_or_none(tag):
    """Return ``tag.text``, or None when the lookup found no tag."""
    return tag.text if tag is not None else None


def sieve_page(html):
    """Extract the key fields of every job card on a parsed results page.

    Args:
        html: Parsed page (e.g. a BeautifulSoup object) exposing ``find_all``.

    Returns:
        list[list]: One ``[title, company, location, description, link]``
        list per job card, in page order. A field is None when the element
        it is read from is absent, so one incomplete card no longer aborts
        the whole page with an AttributeError.
    """
    # Job cards are selected by their full class attribute string.
    job_list = html.find_all(class_='result-card job-result-card result-card--with-hover-state')

    page_list = []

    for job in job_list:
        anchor = job.find('a')

        sieved_job = [
            _text_or_none(job.find('h3')),                                  # title
            _text_or_none(job.find('h4')),                                  # company
            _text_or_none(job.find(class_='job-result-card__location')),    # location
            _text_or_none(job.find('p')),                                   # description
            anchor.get('href') if anchor is not None else None,             # link
        ]
        page_list.append(sieved_job)

    return page_list

# Try the extractor on the page parsed into html1; the returned rows are shown as the cell output.
sieve_page(html1)

https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start=0
https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start=25
https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start=50
https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&page

[['Data Science Intern - Game Franchise Analytics',
  'Blizzard Entertainment',
  'Irvine, CA, US',
  'We’re seeking post-graduate students to help us analyze and build data-powered products and services for Blizzard. We’re geared up here...',
  'https://www.linkedin.com/jobs/view/data-science-intern-game-franchise-analytics-at-blizzard-entertainment-1505195821?refId=40d80008-1feb-4efa-a23b-3b6919388e3d&position=1&pageNum=3&trk=guest_job_search_job-result-card_result-card_full-click'],
 ['DATA SCIENCE MANAGER',
  'Harnham',
  'Los Angeles, CA, US',
  'Although academic experience is import, we are looking for those candidates who have industry experience post academia. The successful ...',
  'https://www.linkedin.com/jobs/view/data-science-manager-at-harnham-1509025635?refId=40d80008-1feb-4efa-a23b-3b6919388e3d&position=2&pageNum=3&trk=guest_job_search_job-result-card_result-card_full-click'],
 ['Data Scientist',
  'Pacific Life',
  'Newport Beach, CA, US',
  'Actual job offer and titl

### Scrape desired number of jobs

In [120]:
# let's combine the functions to do all processes in one fell swoop
# takes the total number of jobs to scrape as the argument
def sieve_all_pages(desired_num_of_jobs):
    """Download and extract job rows page by page until the target is covered.

    Pages are requested at ``start`` offsets 0, 25, 50, ... while the offset
    is below ``desired_num_of_jobs``.

    Args:
        desired_num_of_jobs (int): Number of listings to cover. Values <= 0
            fetch nothing.

    Returns:
        list[list]: The concatenated rows returned by ``sieve_page`` for
        every page that could be downloaded. Pages whose download failed are
        reported through ``log_error`` and skipped, rather than crashing the
        run and discarding the rows already collected.
    """
    sieved_list = []

    for list_pos in range(0, desired_num_of_jobs, 25):

        # page URL for the current offset
        url = f'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start={list_pos}'

        # get page
        raw_html = simple_get(url)

        # simple_get returns None for failed or non-HTML responses;
        # BeautifulSoup cannot parse None, so skip this page.
        if raw_html is None:
            log_error(f'Skipping results page at offset {list_pos}: no HTML returned')
            continue

        # parse using BeautifulSoup
        html = BeautifulSoup(raw_html, 'html.parser')

        # use sieve_page function to gather relevant data and add to main list
        sieved_list += sieve_page(html)

    return sieved_list

# Scrape enough pages to cover 50 results (offsets 0 and 25) into a list of rows.
my_jobs = sieve_all_pages(50)

In [121]:
# One row per scraped job; the column order matches the field order of the
# lists built by sieve_page.
df = pd.DataFrame(my_jobs, columns=['title', 'company', 'location', 'description', 'link'])
df


Unnamed: 0,title,company,location,description,link
0,Senior Data Scientist,Jobspring Partners,"Los Angeles, CA, US",What You Will Be Doing. What we are really loo...,https://www.linkedin.com/jobs/view/senior-data...
1,Data Scientist,Jobot,"Irvine, CA, US",What can we do for you? We are looking for…. T...,https://www.linkedin.com/jobs/view/data-scient...
2,"Data Scientist, Analytics",NEXT Trucking,"Los Angeles, CA, US",Armed with experienced professionals from Amaz...,https://www.linkedin.com/jobs/view/data-scient...
3,Lead Data Scientist,The CSI Companies,"Costa Mesa, California",Our comprehensive and employee centric trainin...,https://www.linkedin.com/jobs/view/lead-data-s...
4,SR. Data Scientist,DISYS,"Santa Monica, California",The ideal candidate is adept at using large da...,https://www.linkedin.com/jobs/view/sr-data-sci...
5,Principal Data Scientist,Harnham,Greater Los Angeles Area,YOUR SKILLS AND EXPERIENCE. As a Principal Dat...,https://www.linkedin.com/jobs/view/principal-d...
6,Senior Data Scientist (Technical Data Science),Focus Capital Markets,"Orange County, California Area",This is a once-in-a-lifetime opportunity to jo...,https://www.linkedin.com/jobs/view/senior-data...
7,Data Scientist,"V-Soft Consulting Group, Inc.","Santa Monica, California",V-Soft Consulting is currently seeking a Data ...,https://www.linkedin.com/jobs/view/data-scient...
8,Senior Data Scientist,Cox Automotive Inc.,"Irvine, CA, US","Cox Automotive, a subsidiary of Cox Enterprise...",https://www.linkedin.com/jobs/view/senior-data...
9,Product Data Scientist - Engineering,Snap Inc.,"Los Angeles, CA, US",We’re looking for a Data Scientist to join Sna...,https://www.linkedin.com/jobs/view/product-dat...


In [111]:
# (rows, columns) of the scraped table.
# NOTE(review): this cell's execution count (111) is lower than that of the
# cell that builds df (121), so the recorded output may come from an earlier
# run — re-run after a kernel restart to confirm.
df.shape

(83, 5)