### Import packages

In [162]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import pandas_profiling
import time

### Get webpage

In [137]:
# get webpage
def simple_get(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the body if it looks like HTML.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        connection would block the notebook indefinitely; a timeout raises a
        ``RequestException`` subclass, which is handled below.

    Returns
    -------
    bytes or None
        ``resp.content`` when ``is_good_resp`` accepts the response.
        ``None`` when the response is rejected or when the request raises
        ``RequestException`` (the error is reported through ``log_error``).
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # reading the body fails part-way.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            return resp.content if is_good_resp(resp) else None

    except RequestException as e:
        log_error(f'Error during requests to {url}: {str(e)}')
        return None
    
def is_good_resp(resp):
    """Return True when ``resp`` is a 200 response whose Content-Type mentions HTML.

    Parameters
    ----------
    resp : object
        Response-like object exposing ``status_code`` and a mapping ``headers``.

    Returns
    -------
    bool
        False for any non-200 status, for a missing Content-Type header, or
        for a Content-Type that does not contain ``html`` (case-insensitive).
    """
    # .get() rather than [...]: a response with no Content-Type header is
    # simply "not HTML" and must not raise KeyError.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

def log_error(e):
    """Report an error by printing ``e`` to standard output."""
    print(e, end='\n')
    

In [116]:
# Fetch one page of "data scientist" search results. The location parameter in
# the URL is Long Beach, California, and start=75 requests the results from
# offset 75 onward.
# NOTE: simple_get returns None when the request fails or the response is not
# HTML, in which case the len() call below raises TypeError.
raw_html = simple_get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start=75')
len(raw_html)

30899

### Parse and filter using BeautifulSoup

In [92]:
# parse the raw HTML fetched above into a BeautifulSoup tree
html1 = BeautifulSoup(raw_html, 'html.parser')
#print(html1.prettify())

In [180]:
# function to gather data for each job
def get_urls(num_of_jobs):
    """Print and return the search-page URLs whose offsets are below ``num_of_jobs``.

    The ``start`` query parameter begins at 0 and advances by 25 per URL.

    Parameters
    ----------
    num_of_jobs : int
        Upper bound (exclusive) on the ``start`` offset.

    Returns
    -------
    list of str
        The generated URLs in ascending ``start`` order; empty when
        ``num_of_jobs`` is zero or negative. Each URL is also printed, as before.
    """
    urls = []
    list_no = 0

    while list_no < num_of_jobs:
        url = f'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start={list_no}'
        print(url)
        urls.append(url)
        list_no += 25

    return urls
        
#get_urls(100)

def get_description(url):
    """Fetch a single job posting and return ``[level, description]``.

    Parameters
    ----------
    url : str
        Address of the job posting page.

    Returns
    -------
    list
        Two items: the text of the first element with class
        ``job-criteria__text job-criteria__text--criteria`` and the text of the
        first element with class ``description__text description__text--rich``.
        An item is ``None`` when its element is not on the page, and both are
        ``None`` when the page could not be fetched.
    """
    raw_page = simple_get(url)
    if raw_page is None:
        # simple_get returns None for a failed request or a non-HTML response;
        # keep the two-item shape so callers can still build a full row.
        return [None, None]

    job_page = BeautifulSoup(raw_page, 'html.parser')
    level_tag = job_page.find(class_='job-criteria__text job-criteria__text--criteria')
    description_tag = job_page.find(class_='description__text description__text--rich')

    # find() yields None when nothing matches, so guard before reading .text
    level = level_tag.text if level_tag is not None else None
    description = description_tag.text if description_tag is not None else None

    return [level, description]

#print(get_description('https://www.linkedin.com/jobs/view/applied-data-scientist-at-hyperloop-one-1483162328?refId=6ca3af8e-e961-4edf-99fc-5a5f97ced69e&position=1&pageNum=3&trk=guest_job_search_job-result-card_result-card_full-click'))

# running total of jobs scraped so far; sieve_page increments it and shows it
# in its progress message
COUNT = 0

def sieve_page(html):
    """Extract one row per job card from a parsed search-results page.

    Parameters
    ----------
    html : BeautifulSoup
        Parsed search-results page.

    Returns
    -------
    list of list, or None
        One row per card: ``[title, company, location, link]`` followed by the
        two items returned by ``get_description(link)``. A field is ``None``
        when its element is missing from the card. Returns ``None`` when the
        page contains no job cards.

    Side effects
    ------------
    Increments the module-level ``COUNT`` once per card and prints a
    carriage-return progress line.
    """
    # Declared once at function scope rather than inside the loop.
    global COUNT

    def text_or_none(tag):
        # find() yields None when nothing matches; avoid AttributeError on .text
        return tag.text if tag is not None else None

    # find all job listing cards
    job_list = html.find_all(class_='result-card job-result-card result-card--with-hover-state')
    if not job_list:
        return None

    page_list = []
    for job in job_list:
        COUNT += 1
        print(f'Scraping job #{COUNT}\r', end='')

        title = text_or_none(job.find('h3'))
        company = text_or_none(job.find('h4'))
        location = text_or_none(job.find(class_='job-result-card__location'))

        link_tag = job.find(class_='result-card__full-card-link')
        link = link_tag.get('href') if link_tag is not None else None

        # Without a link there is no posting page to fetch; keep the row the
        # same width so it still fits the downstream six-column table.
        description = get_description(link) if link else [None, None]

        page_list.append([title, company, location, link] + description)

    return page_list

#sieve_page(html1)

### Scrape desired number of jobs

In [181]:
# let's combine the functions to do all processes in one fell swoop
# takes the total number of jobs to scrape as the argument
def sieve_all_pages(desired_num_of_jobs):
    """Scrape search-result pages until ``desired_num_of_jobs`` offsets are covered.

    Pages are requested with a ``start`` offset of 0, 25, 50, ... while the
    offset is below ``desired_num_of_jobs``. Scraping stops early when a page
    cannot be fetched or yields no jobs.

    Parameters
    ----------
    desired_num_of_jobs : int
        Upper bound (exclusive) on the ``start`` offset. The result is not
        truncated, so it may hold more rows than this number if a page
        contributes more rows than remain to be collected.

    Returns
    -------
    list of list
        Every row returned by ``sieve_page``, in page order; empty when
        nothing could be scraped.
    """
    # Restart the progress counter so re-running the cell does not continue
    # numbering from a previous run.
    global COUNT
    COUNT = 0

    start = time.time()
    list_pos = 0
    sieved_list = []

    while list_pos < desired_num_of_jobs:

        url = f'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=data%20scientist&location=Long%20Beach%2C%20California%2C%20United%20States&trk=homepage-jobseeker_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0&start={list_pos}'

        # get page; simple_get returns None on a failed or non-HTML response
        raw_html = simple_get(url)
        if raw_html is None:
            break

        # parse, then gather the relevant data for every job on the page
        new_list = sieve_page(BeautifulSoup(raw_html, 'html.parser'))

        # sieve_page returns None for a page without jobs, so test truthiness
        # instead of calling len() on it
        if not new_list:
            break

        sieved_list += new_list

        # change to new position
        list_pos += 25

    # Single summary for both the normal and the early-exit path; the count is
    # taken from the list itself so it is defined even when nothing was scraped.
    print(f'It took {time.time() - start} seconds to scrape {len(sieved_list)} jobs.')

    return sieved_list

# run the scraper with desired_num_of_jobs=100 and keep the rows for the dataframe below
my_jobs = sieve_all_pages(100)


It took 69.15081715583801 seconds to scrape 100 jobs.


### Build dataframe

In [183]:
# one row per scraped job; column order matches the rows built by sieve_page
job_columns = ['title', 'company', 'location', 'link', 'level', 'description']
df = pd.DataFrame(data=my_jobs, columns=job_columns)
df.sample(n=5)


Unnamed: 0,title,company,location,link,level,description
74,Principal Data Scientist Job,SAP,"Newport Beach, CA, US",https://www.linkedin.com/jobs/view/principal-d...,Not Applicable,Requisition ID: 223061Work Area: Information T...
11,Senior Data Scientist,FanAI Inc.,"Santa Monica, California, United States",https://www.linkedin.com/jobs/view/senior-data...,Entry level,About FanAIFanAI (www.fanai.io) is a fan analy...
81,Lead Data Scientist,RAPP,"Los Angeles, CA, US",https://www.linkedin.com/jobs/view/lead-data-s...,Not Applicable,RAPP LA is looking for a Lead Data Scientist t...
80,"Content Data Scientist, Apple Media Products D...",Apple,"Culver City, CA, US",https://www.linkedin.com/jobs/view/content-dat...,Not Applicable,"SummaryAt Apple, new ideas have a way of becom..."
33,Senior Data Scientist,Cylance Inc.,"Irvine, CA, US",https://www.linkedin.com/jobs/view/senior-data...,Mid-Senior level,Worker Sub-TypeRegularJob DescriptionTHE POSIT...


In [184]:
# sanity check: (number of scraped jobs, 6 columns)
df.shape

(100, 6)

In [185]:
# build a profiling report of the scraped jobs
# NOTE(review): profile_report() is not a built-in DataFrame method; presumably
# it is registered by the pandas_profiling import at the top — confirm
profile = df.profile_report()

In [186]:
# bare expression as the last line of the cell so the notebook displays the report
profile

