### Import packages

In [132]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd

### Get webpage

In [137]:
# get webpage
def simple_get(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the body of an HTML response.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait on the server, forwarded to ``get``. Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    The response's ``content`` when ``is_good_resp`` accepts the response;
    ``None`` when it is rejected or when the request raises
    ``RequestException`` (the error text is handed to ``log_error``).
    """
    try:
        # closing() guarantees the streamed response is released even when
        # the response is rejected before its body is read.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_resp(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error(f'Error during requests to {url}: {str(e)}')
        return None
    
def is_good_resp(resp):
    """Return True when ``resp`` is a 200 response whose Content-Type mentions HTML.

    ``resp`` must expose ``status_code`` and a mapping-like ``headers``.
    A response that carries no Content-Type header is rejected (False).
    """
    # .get() rather than [] so a missing header yields None instead of
    # raising KeyError, which is what makes the None check below meaningful.
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())

def log_error(e):
    """Write the string form of ``e`` to standard output, followed by a newline."""
    message = str(e)
    print(message)
    

In [116]:
# Fetch one page of LinkedIn guest-search results: keywords "data scientist",
# location "Long Beach, California, United States"; start=75 is the listing
# offset (the same `start` parameter sieve_all_pages advances in steps of 25).
raw_html = simple_get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start=75')
# Length of the downloaded body; raises TypeError if simple_get returned None.
len(raw_html)

30899

### Parse and filter using BeautifulSoup

In [92]:
# Parse the downloaded page with BeautifulSoup using the 'html.parser' backend.
html1 = BeautifulSoup(raw_html, 'html.parser')
# Uncomment to inspect the parsed markup:
#print(html1.prettify())

In [134]:
# helper functions: build the paged search URLs, and extract the fields of every job on a parsed page
def get_urls(num_of_jobs, page_size=25):
    """Build, print and return the search-page URLs covering ``num_of_jobs`` listings.

    Parameters
    ----------
    num_of_jobs : int
        Number of listings to cover. One URL is produced per ``page_size``
        listings, with ``start`` offsets 0, page_size, 2*page_size, ...
        for every offset below ``num_of_jobs``.
    page_size : int, optional
        Step between consecutive ``start`` offsets (default 25).

    Returns
    -------
    list of str
        The URLs in increasing ``start`` order; empty when ``num_of_jobs <= 0``.

    Raises
    ------
    ValueError
        If ``page_size`` is not positive (the offset would never advance).
    """
    if page_size <= 0:
        raise ValueError('page_size must be a positive number')

    urls = []
    list_no = 0

    while list_no < num_of_jobs:
        url = f'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start={list_no}'
        # Printing is kept so cells that call this purely for inspection
        # still show the same output.
        print(url)
        urls.append(url)
        list_no += page_size

    return urls
        
#get_urls(100)

def sieve_page(html):
    """Extract one row of fields for every job card found in a parsed page.

    Parameters
    ----------
    html
        Parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns
    -------
    list of list, or None
        One ``[title, company, location, description, link]`` row per job
        card, in page order. A field is ``None`` when the card does not
        contain the corresponding element. Returns ``None`` (not an empty
        list) when the page contains no job cards.

    Side effects
    ------------
    Prints the number of rows extracted when at least one card is found.
    """
    def _text_of(node):
        # find() yields None for a missing element; propagate None instead
        # of failing on `.text` and aborting the whole scrape.
        return node.text if node is not None else None

    # find all job listing cards
    job_list = html.find_all(class_='result-card job-result-card result-card--with-hover-state')

    if not job_list:
        return None

    page_list = []
    for job in job_list:
        anchor = job.find('a')
        page_list.append([
            _text_of(job.find('h3')),                                   # title
            _text_of(job.find('h4')),                                   # company
            _text_of(job.find(class_='job-result-card__location')),     # location
            _text_of(job.find('p')),                                    # description
            anchor.get('href') if anchor is not None else None,         # link
        ])

    print(len(page_list))

    return page_list

#sieve_page(html1)

### Scrape desired number of jobs

In [None]:
# let's combine the functions to do all processes in one fell swoop
# takes the total number of jobs to scrape as the argument
def sieve_all_pages(desired_num_of_jobs):
    """Download and sieve consecutive result pages, returning all job rows found.

    Pages are requested at ``start`` offsets 0, 25, 50, ... for every offset
    below ``desired_num_of_jobs``. Paging stops early at the first page that
    cannot be downloaded or that yields no job rows.

    Parameters
    ----------
    desired_num_of_jobs : int
        Upper bound on the ``start`` offsets to request.

    Returns
    -------
    list of list
        The rows produced by ``sieve_page`` for each page, concatenated in
        page order. Empty when nothing could be collected.
    """
    list_pos = 0
    sieved_list = []

    while list_pos < desired_num_of_jobs:

        # while the position is less than number of jobs I want to scrape go to next page
        url = f'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start={list_pos}'

        # get page
        raw_html = simple_get(url)

        # simple_get returns None for a failed or rejected response; stop
        # here and keep what was collected instead of failing in the parser.
        if raw_html is None:
            break

        # parse using BeautifulSoup
        html = BeautifulSoup(raw_html, 'html.parser')

        # use sieve_page function to gather relevant data and add to main list
        new_list = sieve_page(html)

        # sieve_page returns None (not []) for a page without job cards, so
        # test truthiness rather than len(), which would raise on None.
        if not new_list:
            break

        sieved_list += new_list

        # change to new position
        list_pos += 25

    return sieved_list

# Run the scrape for start offsets below 1000 (25 apart); sieve_page prints
# the number of jobs it extracted from each page as it goes.
my_jobs = sieve_all_pages(1000)

25
25
25
17
25
14
25
17
25
22
25
24
25
21
25
16
25
24
25
25


In [136]:
# Tabulate the scraped rows; the column names follow the field order of the
# rows built by sieve_page (title, company, location, description, link).
df = pd.DataFrame(my_jobs, columns=['title', 'company', 'location', 'description', 'link'])
# Preview the first rows.
df.head()


Unnamed: 0,title,company,location,description,link
0,Senior Data Scientist,Jobspring Partners,"Los Angeles, CA, US",What You Will Be Doing. What we are really loo...,https://www.linkedin.com/jobs/view/senior-data...
1,Data Scientist,Jobot,"Irvine, CA, US",What can we do for you? We are looking for…. T...,https://www.linkedin.com/jobs/view/data-scient...
2,"Data Scientist, Analytics",NEXT Trucking,"Los Angeles, CA, US",Armed with experienced professionals from Amaz...,https://www.linkedin.com/jobs/view/data-scient...
3,Lead Data Scientist,The CSI Companies,"Costa Mesa, California",Our comprehensive and employee centric trainin...,https://www.linkedin.com/jobs/view/lead-data-s...
4,SR. Data Scientist,DISYS,"Santa Monica, California",The ideal candidate is adept at using large da...,https://www.linkedin.com/jobs/view/sr-data-sci...


In [111]:
# (rows, columns) of the scraped table.
# NOTE(review): this cell's execution count (In [111]) is lower than that of
# the cell that builds df (In [136]), so the displayed shape may come from an
# earlier run — confirm with Restart Kernel -> Run All.
df.shape

(83, 5)