## Tutorial

* [LinkedIn Job Scraper and Matcher](https://medium.com/@alaeddine.grine/linkedin-job-scraper-and-matcher-85d0308ef9aa)

* [How To Use BeautifulSoup's find()](https://scrapeops.io/python-web-scraping-playbook/python-beautifulsoup-find/)

Library

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import time
from bs4 import BeautifulSoup
import math
import requests
import pandas as pd
import datetime
import numpy as np

Initialize a new instance of the Chrome driver, open the LinkedIn login page, enter credentials (email address and password) and click the Login button.

In [2]:
# 1. Instantiate the Chrome service from the chromedriver binary at the given relative path
chromedriver_path = 'chromedriver-win64/chromedriver.exe'
service = Service(executable_path=chromedriver_path)

# 2. Instantiate the webdriver; the browser window is started maximized
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options, service=service)

# 3. Open the LinkedIn login page
driver.get('https://www.linkedin.com/login')
time.sleep(5) # fixed pause to give the login page time to load

# 4. Enter email address & password
# NOTE(review): "email" and "password" look like placeholders — replace them with
# real credentials before running, and avoid committing real values.
email_input = driver.find_element(By.ID, 'username')
password_input = driver.find_element(By.ID, 'password')
email_input.send_keys("email")
password_input.send_keys("password")

# 5. Submit the login form by pressing ENTER in the password field
password_input.send_keys(Keys.ENTER)

# Fixed pause after submitting (presumably to let the post-login page load).
time.sleep(10)

Scroll down to the bottom of the page to view all job postings.

In [3]:
def scroll_to_bottom(driver, sleep_time=120, poll_interval=0.5):
    """Scroll the page down repeatedly until its height stops growing.

    After every scroll the page height is polled until it grows or
    ``sleep_time`` seconds have elapsed. Waiting *before* deciding that the
    page has stopped growing gives lazily loaded content time to render;
    measuring immediately after the scroll would end the loop before any new
    content could appear.

    Args:
        driver: Object exposing ``execute_script(js)`` (a Selenium WebDriver).
        sleep_time: Maximum number of seconds to wait after each scroll for
            the page height to change.
        poll_interval: Seconds between two height measurements while waiting.

    Returns:
        None. The function is used for its side effect on the browser.
    """
    last_height = driver.execute_script('return document.body.scrollHeight')
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

        # Wait for growth, but return as soon as it is observed so that pages
        # that load quickly do not pay the full sleep_time on every scroll.
        deadline = time.monotonic() + sleep_time
        new_height = driver.execute_script('return document.body.scrollHeight')
        while new_height == last_height and time.monotonic() < deadline:
            # Never sleep past the deadline (and never pass a negative value).
            time.sleep(min(poll_interval, max(deadline - time.monotonic(), 0)))
            new_height = driver.execute_script('return document.body.scrollHeight')

        if new_height == last_height:
            # No growth within the allowed wait: the bottom has been reached.
            break
        last_height = new_height

Selenium stores the source HTML in the driver's page_source attribute. We can load the page_source into BeautifulSoup as follows:

In [4]:
# Parse the HTML currently held by the browser so it can be queried with BeautifulSoup.
soup = BeautifulSoup(driver.page_source, 'html.parser')

Scraping job IDs

In [21]:
# Search parameters; they are interpolated into the job-search URL
# (keywords=..., location=...) when the result pages are visited.
job_title = "datascientist"
location = "United States"

In [22]:
# Parse the results page currently shown in the browser.
soup = BeautifulSoup(driver.page_source, 'html.parser')
List_Job_IDs = []

# 1. Get number of jobs found and number of pages:
# The first whitespace-separated token of the subtitle's first <span> is read
# as the result count, after removing thousands separators.
try:
    div_number_of_jobs = soup.find("div",{"class":"jobs-search-results-list__subtitle"})
    number_of_jobs = int(div_number_of_jobs.find('span').get_text().strip().split()[0].replace(',', ''))
except (AttributeError, IndexError, ValueError):
    # AttributeError: subtitle <div> or its <span> is missing (find() returned None)
    # IndexError:     the <span> is empty
    # ValueError:     the first token is not a number
    # Only these are caught, so Ctrl-C and genuine bugs are no longer swallowed.
    number_of_jobs = 0

# 25 is the page size that the start= offset uses when paging through results.
number_of_pages=math.ceil(number_of_jobs/25)
print("number_of_jobs:",number_of_jobs)
print("number_of_pages:",number_of_pages)

# 2. Function to find Job Ids:
def find_Job_Ids(soup):
    """Return the job IDs of the postings listed on a parsed results page.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        list: The ``data-occludable-job-id`` value of every
        ``<li class="jobs-search-results__list-item">`` element, in document
        order. Items that lack the attribute (or carry an empty value) are
        skipped, so the result never contains ``None`` and can be used
        directly to build job URLs.
    """
    job_postings = soup.find_all('li', {'class': 'jobs-search-results__list-item'})
    candidate_ids = (job_posting.get('data-occludable-job-id') for job_posting in job_postings)
    return [job_id for job_id in candidate_ids if job_id]
  
 # 3. Get Job IDs that are on the first page:
# The first results page is already parsed: take its job IDs as-is.
Jobs_on_1st_page = find_Job_Ids(soup)
List_Job_IDs += Jobs_on_1st_page

# 4. Visit the remaining result pages one by one.
# range(1, number_of_pages) is empty when there is at most one page, so no
# extra guard is required; each page is addressed through its start= offset.
for page_num in range(1, number_of_pages):
    print(f"Scraping page: {page_num}",end="...")

    url = requests.utils.requote_uri(
        f'https://www.linkedin.com/jobs/search/?keywords={job_title}&location={location}&start={25 * page_num}'
    )
    driver.get(url)
    scroll_to_bottom(driver)

    # Re-parse the freshly loaded page and collect the job IDs found on it.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    Jobs_on_this_page = find_Job_Ids(soup)
    List_Job_IDs += Jobs_on_this_page

number_of_jobs: 3143
number_of_pages: 126
Scraping page: 1...Scraping page: 2...Scraping page: 3...Scraping page: 4...Scraping page: 5...Scraping page: 6...Scraping page: 7...Scraping page: 8...Scraping page: 9...Scraping page: 10...Scraping page: 11...Scraping page: 12...Scraping page: 13...Scraping page: 14...Scraping page: 15...Scraping page: 16...Scraping page: 17...Scraping page: 18...Scraping page: 19...Scraping page: 20...Scraping page: 21...Scraping page: 22...Scraping page: 23...Scraping page: 24...Scraping page: 25...Scraping page: 26...Scraping page: 27...Scraping page: 28...Scraping page: 29...Scraping page: 30...Scraping page: 31...Scraping page: 32...Scraping page: 33...Scraping page: 34...Scraping page: 35...Scraping page: 36...Scraping page: 37...Scraping page: 38...Scraping page: 39...Scraping page: 40...Scraping page: 41...Scraping page: 42...Scraping page: 43...Scraping page: 44...Scraping page: 45...Scraping page: 46...Scraping page: 47...Scraping page: 48...Scrapin

In [23]:
# Wrap the collected job IDs in a single-column ("Job_Id") DataFrame.
job_Id = pd.DataFrame({"Job_Id":List_Job_IDs})

Retrieve job details, such as the seniority level and the job description, by sending a simple GET request (using the Requests library) to this URL: job_url='https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_Id}'

In [25]:
def remove_tags(html):
    """Return the visible text of an HTML document as one space-joined string.

    ``<style>`` and ``<script>`` elements are dropped entirely; the remaining
    text fragments are stripped of surrounding whitespace and joined with a
    single space.
    """
    document = BeautifulSoup(html, "html.parser")

    # Drop elements whose content is code or styling rather than readable text.
    for unwanted in document.find_all(['style', 'script']):
        unwanted.decompose()

    text_fragments = document.stripped_strings
    return ' '.join(text_fragments)
  
# Guest endpoint returning the HTML of one job posting; {} is the job ID.
job_url='https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}'
list_jobs=[]

# Seconds to wait for a response; without a timeout requests.get can block forever.
REQUEST_TIMEOUT = 30


def _field_or_none(extract):
    """Return ``extract()``, or None when the expected element is missing or malformed."""
    try:
        return extract()
    except (AttributeError, IndexError, ValueError):
        # AttributeError: a find() in the chain returned None
        # IndexError / ValueError: the text did not have the expected shape
        return None


for j, job_id in enumerate(List_Job_IDs):
    print(f"{j+1} ... read jobId:{job_id}")

    try:
        resp = requests.get(job_url.format(job_id), timeout=REQUEST_TIMEOUT)
        html_text, html_bytes = resp.text, resp.content
    except requests.RequestException as err:
        # One unreachable posting must not abort the whole run (the DataFrame
        # is only built after the loop): keep the ID and leave the details empty.
        print(f"    request failed: {err}")
        html_text, html_bytes = "", None
    # NOTE(review): the HTTP status is not checked, so an error page is parsed
    # like any other response and simply yields empty fields.

    soup = BeautifulSoup(html_text, 'html.parser')

    job = {"Job_ID": job_id}

    # Full visible text of the posting.
    try:
        job["Job_txt"] = remove_tags(html_bytes) if html_bytes is not None else None
    except Exception:
        # Best effort, as before, but Ctrl-C is no longer swallowed.
        job["Job_txt"] = None

    # Company name: alt text of the logo image inside the top card's first link.
    job["company"] = _field_or_none(
        lambda: soup.find("div",{"class":"top-card-layout__card"}).find("a").find("img").get('alt'))

    job["job-title"] = _field_or_none(
        lambda: soup.find("div",{"class":"top-card-layout__entity-info"}).find("a").text.strip())

    # Seniority level: first criteria item with its "Seniority level" label removed.
    job["level"] = _field_or_none(
        lambda: soup.find("ul",{"class":"description__job-criteria-list"}).find("li").text.replace("Seniority level","").strip())

    job["location"] = _field_or_none(
        lambda: soup.find("span",{"class":"topcard__flavor topcard__flavor--bullet"}).text.strip())

    job["posted-time-ago"] = _field_or_none(
        lambda: soup.find("span",{"class":"posted-time-ago__text topcard__flavor--metadata"}).text.strip())

    # Number of applicants: first token of the caption, when it is an integer.
    job["nb_candidats"] = _field_or_none(
        lambda: int(soup.find("span",{"class":"num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet"}).text.strip().split()[0]))

    list_jobs.append(job)

# create a pandas DataFrame
jobs_df = pd.DataFrame(list_jobs)

1 ... read jobId:3942122113
2 ... read jobId:3950387116
3 ... read jobId:3954624652
4 ... read jobId:3952865091
5 ... read jobId:3868373686
6 ... read jobId:3963002079
7 ... read jobId:3956381898
8 ... read jobId:3960243065
9 ... read jobId:3934205005
10 ... read jobId:3955970131
11 ... read jobId:3958445657
12 ... read jobId:3956370183
13 ... read jobId:3909181155
14 ... read jobId:3957238965
15 ... read jobId:3950507075
16 ... read jobId:3945569122
17 ... read jobId:3954427145
18 ... read jobId:3914842822
19 ... read jobId:3947886685
20 ... read jobId:3961656570
21 ... read jobId:3945519436
22 ... read jobId:3945844555
23 ... read jobId:3938910226
24 ... read jobId:3952496801
25 ... read jobId:3953294603
26 ... read jobId:3946606287
27 ... read jobId:3907620477
28 ... read jobId:3889623739
29 ... read jobId:3962071834
30 ... read jobId:3961674741
31 ... read jobId:3907947680
32 ... read jobId:3905561130
33 ... read jobId:3934489924
34 ... read jobId:3959283433
35 ... read jobId:39524

In [26]:
# Display the scraped jobs table.
jobs_df

Unnamed: 0,Job_ID,Job_txt,company,job-title,level,location,posted-time-ago,nb_candidats
0,3942122113,Data Scientist Harmony Public Schools Greater ...,Harmony Public Schools,Data Scientist,Not Applicable,Greater Houston,3 weeks ago,
1,3950387116,Data Science Specialist Oakridge Staffing Stam...,Oakridge Staffing,Data Science Specialist,Associate,"Stamford, CT",2 weeks ago,
2,3954624652,Optimization Data Scientist Bayforce United St...,Bayforce,Optimization Data Scientist,Mid-Senior level,United States,1 week ago,
3,3952865091,"Data Scientist Ansell Iselin, NJ 1 week ago Ov...",Ansell,Data Scientist,Associate,"Iselin, NJ",1 week ago,
4,3868373686,Data Scientist I / Data Scientist II / Senior ...,PPL Corporation,Data Scientist I / Data Scientist II / Senior ...,Entry level,"Allentown, PA",1 week ago,
...,...,...,...,...,...,...,...,...
1675,3905962146,,,,,,,
1676,3910641899,,,,,,,
1677,3910647428,,,,,,,
1678,3911614655,,,,,,,


In [None]:
def get_posted_date(posted_time_ago,date_scraping):
    """Estimate the posting date from a relative age such as "3 weeks ago".

    The text is expected to start with an integer followed by a time unit.
    The age is converted to an approximate number of days (1 week = 7,
    1 month = 30, 1 year = 365; anything shorter than a day counts as 0)
    and subtracted from ``date_scraping``.

    Args:
        posted_time_ago: Relative age text, e.g. "1 week ago". ``None`` or
            any text that does not match the expected shape yields ``None``.
        date_scraping: Date (or datetime-like value supporting subtraction of
            a ``datetime.timedelta``) on which the text was scraped.

    Returns:
        ``date_scraping`` minus the estimated age, or ``None`` when the age
        cannot be parsed, the unit is unknown, or the subtraction fails.
    """
    # (unit prefix, approximate days per unit); a prefix also matches the plural.
    days_per_unit = (
        ("second", 0),
        ("minute", 0),
        ("hour", 0),
        ("day", 1),
        ("week", 7),
        ("month", 30),
        ("year", 365),
    )

    try:
        details = posted_time_ago.split()
        count = int(details[0])
        unit = details[1]
    except (AttributeError, IndexError, ValueError):
        # Not a string, fewer than two tokens, or a non-numeric first token.
        return None

    for prefix, days in days_per_unit:
        if unit.startswith(prefix):
            try:
                return date_scraping - datetime.timedelta(days=count * days)
            except (TypeError, OverflowError):
                # date_scraping does not support the subtraction, or the
                # resulting date is out of range.
                return None

    # Unknown unit.
    return None

In [None]:
# Stamp every row with today's date, converted by pd.to_datetime, as the scraping date.
jobs_df['scraping_date'] = pd.to_datetime(datetime.date.today())
# Apply get_posted_date element-wise to each (posted-time-ago, scraping_date) pair.
# NOTE(review): np.vectorize is called without otypes, so the result dtype is
# inferred from the first row and an empty DataFrame presumably fails here — confirm.
jobs_df['posted_date'] = np.vectorize(get_posted_date)(jobs_df['posted-time-ago'], jobs_df['scraping_date'])
# Remove a leftover "Employment type" label (with its trailing whitespace) from
# the level text; None values are passed through unchanged.
jobs_df.level = jobs_df.level.apply(lambda x:x.replace("Employment type\n        \n\n          ","") if x is not None else x)

In [28]:
# Write the table to jobs_df.csv, omitting the DataFrame index column.
jobs_df.to_csv('jobs_df.csv', index=False)