# Scrape LinkedIn's Job Postings

In [1]:
import sys
import os
import re
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pyautogui
import time

### Load username and password from your .env file

In [2]:
# Load environment variables
load_dotenv()
linkedin_user=os.environ['LINKEDIN_USER']
linkedin_pass=os.environ['LINKEDIN_PASS']

In [3]:
# Open browser
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser.get("https://www.linkedin.com")



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/WonderWolff/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache


### Login

In [4]:
# Log into LinkedIn.com
username = browser.find_element(By.ID, "session_key")
username.send_keys(linkedin_user)
password = browser.find_element(By.ID, "session_password")
password.send_keys(linkedin_pass)


In [5]:
# Login button
login_button = browser.find_element(By.CLASS_NAME, "sign-in-form__submit-button")
login_button.click()

### Begin looking for jobs

In [6]:
# Set search criteria
position = "data%20scientist"
location = "united%20states"

In [7]:
# Navigate to /jobs/
browser.get(f"https://www.linkedin.com/jobs/search/?keywords={position}&location={location}")

# Begin timer
t0 = time.time()

In [8]:
# Get number of job results
num_jobs_word = browser.find_element(By.CSS_SELECTOR, 'div>small').get_attribute('innerText')
num_jobs = int(num_jobs_word.strip(" results").replace(",",""))
num_jobs

145976

In [33]:
%%time
# Scroll through all job search results to "load" them
# This is necessary to grab their details below, as LinkedIn
# does not load all 25 at once, only once you scroll through the page
def load_full_page():
    i = 1
    while i < 25:
        element = browser.find_element(By.CLASS_NAME, "global-footer-compact")
        browser.execute_script("arguments[0].scrollIntoView();", element)
        time.sleep(0.1)

        job_lists = browser.find_element(By.CLASS_NAME, "jobs-search-results__list")
        jobs = job_lists.find_elements(By.CLASS_NAME, 'job-card-list__title')
        every_other_5_list = jobs[::i]
        for element in every_other_5_list:
            browser.execute_script("arguments[0].scrollIntoView();", element)
            time.sleep(0.1)

        i += 3
    return
load_full_page()

CPU times: user 82.2 ms, sys: 12.3 ms, total: 94.5 ms
Wall time: 10.9 s


### Get company name per job posting

In [34]:
# Get company name
company_names=[]
def get_company_name(browser):
    """
    Get the 25 company names listed on a LinkedIn page.
    """
    company_lists = browser.find_element(By.CLASS_NAME, "jobs-search-results__list")
    companies = company_lists.find_elements(By.CLASS_NAME, 'job-card-container__company-name')
    for i in companies:
        company_names.append(i.text)
    print("Company Names:")
    print(company_names, "\n")
    print(len(company_names))
    return 
get_company_name(browser)

Company Names:
['Western Governors University', 'Live Nation Entertainment', 'The Walt Disney Company', 'Carvana', 'Live Nation Entertainment', 'Deloitte', 'Criteria Corp', 'Meta', 'KPMG US', 'Roblox', 'ManTech', 'Zoom', 'Starbucks', 'Niantic, Inc.', 'Microsoft', 'Apple', 'Southern California Edison (SCE)', 'Autodesk', 'Deloitte', 'Zoom', 'Zoom', 'Khan Academy', 'National Research Group', 'The Guitar Center Company', 'Ropes & Gray LLP'] 

25


### Get job title

In [35]:
# Get job titles
job_title=[]
def get_job_titles(browser):
    """
    Get the 25 company names listed on a LinkedIn page.
    """
    job_lists = browser.find_element(By.CLASS_NAME, "jobs-search-results__list")
    jobs = job_lists.find_elements(By.CLASS_NAME, 'job-card-list__title')
    for i in jobs:
        job_title.append(i.text)
    print("Job Titles:")
    print(job_title, "\n")
    print(len(job_title))
    return
get_job_titles(browser)

Job Titles:
['Data Scientist-2', 'Data Scientist - Marketing', 'Senior Data Scientist', 'Senior Data Scientist, NLP / Conversational AI', 'Data Scientist - Recommendations', 'AI Data Scientist - TS Required', 'Data Scientist', 'Data Scientist, Product Analytics - VR Devices (FRL)', 'Senior Associate, Data Scientist', 'Senior Data Scientist - Collaborative Development', 'Senior Data Scientist - Cloud', 'Senior Data Scientist, Product Intelligence', 'data scientist - People Analytics', 'Data Scientist, Game Analytics', 'Data & Applied Scientist Manager', 'Sr. Data Scientist for Product Marketing and Customer Analytics', 'Data Scientist Advisor [HYBRID]', 'Senior Data Scientist / Machine Learning Engineer, eCommerce', 'AI Data Scientist - TS Required', 'Senior Data Scientist, Product Intelligence', 'Senior Data Scientist, Product Intelligence', 'Senior Data Scientist/Analyst, District Success', 'Data Scientist', 'Data Scientist II', 'Data Scientist'] 

25


### Get location of job

In [36]:
# Get locations
location=[]
def get_location(browser):
    """
    Get the 25 job locations listed on a LinkedIn page.
    """
    # Get list of job postings
    location_lists = browser.find_element(By.CLASS_NAME, "jobs-search-results__list")

    # Get each individual job posting
    each_item = location_lists.find_elements(By.CLASS_NAME, 'jobs-search-results__list-item')

    for item in each_item:
        # For each job posting, grab the first element containing
        # the class name below, which will give us location
        i = item.find_element(By.CLASS_NAME, 'job-card-container__metadata-wrapper')
        location.append(i.text)
    print("Location:")
    print(location, "\n")
    print(len(location))
    return
get_location(browser)

Location:
['Los Angeles, CA', 'West Hollywood, CA\nOn-site', 'Burbank, CA', 'Los Angeles Metropolitan Area\nOn-site', 'West Hollywood, CA\nOn-site', 'Alexandria, VA', 'West Hollywood, CA\nRemote', 'Los Angeles, CA', 'Los Angeles, CA', 'San Mateo, CA\nOn-site', 'Chantilly, VA\nOn-site', 'Arizona, United States\nRemote', 'Seattle, WA', 'Los Angeles, CA\nOn-site', 'Washington, DC', 'Cupertino, CA\nOn-site', 'Pomona, CA', 'Los Angeles, CA', 'Arlington, VA', 'Atlanta, GA\nRemote', 'Massachusetts, United States\nRemote', 'San Francisco Bay Area\nRemote', 'Los Angeles Metropolitan Area\nHybrid', 'Westlake Village, CA', 'Greater Syracuse-Auburn Area'] 

25


### Get job description

In [37]:
# Get job descriptions
description=[]
def get_descriptions(browser):
    """
    Get the 25 job descriptions listed on a LinkedIn page.
    """
    # Get job description
    description_lists = browser.find_element(By.CLASS_NAME, "jobs-search-results__list")
    job_descriptions = description_lists.find_elements(By.CLASS_NAME, 'jobs-search-results__list-item')
    for i in job_descriptions:
        i.click()
        element = browser.find_element(By.CLASS_NAME, 'jobs-description__content')
        description.append(element.get_attribute("innerText"))
    print("Description:")
    print(description, "\n")
    print(len(description))
    return
get_descriptions(browser)

Description:
["If you’re passionate about building a better future for individuals, communities, and our country—and you’re committed to working hard to play your part in building that future—consider WGU as the next step in your career.\n\nDriven by a mission to expand access to higher education through online, competency-based degree programs, WGU is also committed to being a great place to work for a diverse workforce of student-focused professionals. The university has pioneered a new way to learn in the 21st century, one that has received praise from academic, industry, government, and media leaders. Whatever your role, working for WGU gives you a part to play in helping students graduate, creating a better tomorrow for themselves and their families.\n\nThe principle function of the Data Scientist is to extract meaningful information from digital data of all types. At Western Governors University, the Data Scientist is responsible for synthesizing meaningful and actionable informa

In [38]:
# Create dataframe from results
linkedin_jobs = pd.DataFrame([company_names, job_title, location, description], 
                             index=["company","job_title","location","description"]).T
linkedin_jobs

Unnamed: 0,company,job_title,location,description
0,Western Governors University,Data Scientist-2,"Los Angeles, CA",If you’re passionate about building a better f...
1,Live Nation Entertainment,Data Scientist - Marketing,"West Hollywood, CA\nOn-site",Job Summary:\n\nLOCATION: Los Angeles / West H...
2,The Walt Disney Company,Senior Data Scientist,"Burbank, CA",We are looking for a Senior Data Scientist to ...
3,Carvana,"Senior Data Scientist, NLP / Conversational AI",Los Angeles Metropolitan Area\nOn-site,About Carvana\n\nIf you like disrupting the no...
4,Live Nation Entertainment,Data Scientist - Recommendations,"West Hollywood, CA\nOn-site",Job Summary:\n\nWHO ARE WE?\n\nLive Nation Ent...
5,Deloitte,AI Data Scientist - TS Required,"Alexandria, VA","In this age of disruption, organizations need ..."
6,Criteria Corp,Data Scientist,"West Hollywood, CA\nRemote",Criteria is a technology company that’s changi...
7,Meta,"Data Scientist, Product Analytics - VR Devices...","Los Angeles, CA",The VR Devices Data Science team’s mission is ...
8,KPMG US,"Senior Associate, Data Scientist","Los Angeles, CA",Requisition Number: 82315 - 8\n\nDescription\n...
9,Roblox,Senior Data Scientist - Collaborative Development,"San Mateo, CA\nOn-site","Every day, tens of millions of people from aro..."


In [39]:
# End timer
t1 = time.time()

# How long did this process take for one page?
total = t1-t0
print("This process took:")
print("In Seconds:", total)
print("In Minutes:", total/60)

This process took:
In Seconds: 1857.6300570964813
In Minutes: 30.960500951608022


In [41]:
### Current problem: how can we click on each job posting without loading the page?
### It happens randomly

In [40]:
# # Repeat for all 40 pages
page = 2
for i in range(2, 6):
    page = i
    browser.get(f'https://www.linkedin.com/jobs/search/?keywords={position}&location={location}&start={page}')
    time.sleep(0.5)
    load_full_page()
    get_company_name(browser)
    get_job_titles(browser)
    get_location(browser)
    get_descriptions(browser)

Company Names:
['Western Governors University', 'Live Nation Entertainment', 'The Walt Disney Company', 'Carvana', 'Live Nation Entertainment', 'Deloitte', 'Criteria Corp', 'Meta', 'KPMG US', 'Roblox', 'ManTech', 'Zoom', 'Starbucks', 'Niantic, Inc.', 'Microsoft', 'Apple', 'Southern California Edison (SCE)', 'Autodesk', 'Deloitte', 'Zoom', 'Zoom', 'Khan Academy', 'National Research Group', 'The Guitar Center Company', 'Ropes & Gray LLP', 'The Guitar Center Company', 'Edgecast', 'Live Nation Entertainment', 'Oculus VR', 'Cart.com', "Children's Hospital Los Angeles (CHLA)", 'KPMG US', 'Reddit, Inc.', 'Activision', 'Coinbase', 'Yahoo', 'Amgen', 'Deloitte', 'Insight Global', 'Degreed', 'HelpSystems', 'Microsoft', 'Amazon Web Services (AWS)', 'KPMG US', 'Reddit, Inc.', 'Live Nation Entertainment', 'Western Governors University', 'Toptal', 'ChowNow', 'Apple'] 

50
Job Titles:
['Data Scientist-2', 'Data Scientist - Marketing', 'Senior Data Scientist', 'Senior Data Scientist, NLP / Conversation

Company Names:
['Western Governors University', 'Live Nation Entertainment', 'The Walt Disney Company', 'Carvana', 'Live Nation Entertainment', 'Deloitte', 'Criteria Corp', 'Meta', 'KPMG US', 'Roblox', 'ManTech', 'Zoom', 'Starbucks', 'Niantic, Inc.', 'Microsoft', 'Apple', 'Southern California Edison (SCE)', 'Autodesk', 'Deloitte', 'Zoom', 'Zoom', 'Khan Academy', 'National Research Group', 'The Guitar Center Company', 'Ropes & Gray LLP', 'The Guitar Center Company', 'Edgecast', 'Live Nation Entertainment', 'Oculus VR', 'Cart.com', "Children's Hospital Los Angeles (CHLA)", 'KPMG US', 'Reddit, Inc.', 'Activision', 'Coinbase', 'Yahoo', 'Amgen', 'Deloitte', 'Insight Global', 'Degreed', 'HelpSystems', 'Microsoft', 'Amazon Web Services (AWS)', 'KPMG US', 'Reddit, Inc.', 'Live Nation Entertainment', 'Western Governors University', 'Toptal', 'ChowNow', 'Apple', 'Reddit, Inc.', 'Live Nation Entertainment', 'Twitch', 'Insight Global', 'Yahoo', 'CoreLogic', 'Degreed', 'Microsoft', 'Amazon Web Serv

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=99.0.4844.51)
Stacktrace:
0   chromedriver                        0x000000010cdfd159 chromedriver + 5120345
1   chromedriver                        0x000000010cd8ab13 chromedriver + 4651795
2   chromedriver                        0x000000010c97ae68 chromedriver + 392808
3   chromedriver                        0x000000010c97df17 chromedriver + 405271
4   chromedriver                        0x000000010c97ddc1 chromedriver + 404929
5   chromedriver                        0x000000010c97e97d chromedriver + 407933
6   chromedriver                        0x000000010c9e1393 chromedriver + 811923
7   chromedriver                        0x000000010c9cdb42 chromedriver + 731970
8   chromedriver                        0x000000010c9e0637 chromedriver + 808503
9   chromedriver                        0x000000010c9cda33 chromedriver + 731699
10  chromedriver                        0x000000010c9a35dd chromedriver + 558557
11  chromedriver                        0x000000010c9a44f5 chromedriver + 562421
12  chromedriver                        0x000000010cdba38d chromedriver + 4846477
13  chromedriver                        0x000000010cdd421c chromedriver + 4952604
14  chromedriver                        0x000000010cdd9a12 chromedriver + 4975122
15  chromedriver                        0x000000010cdd4b4a chromedriver + 4954954
16  chromedriver                        0x000000010cdaf5b0 chromedriver + 4801968
17  chromedriver                        0x000000010cdeef78 chromedriver + 5062520
18  chromedriver                        0x000000010cdef0ff chromedriver + 5062911
19  chromedriver                        0x000000010ce04545 chromedriver + 5150021
20  libsystem_pthread.dylib             0x00007ff8051df514 _pthread_start + 125
21  libsystem_pthread.dylib             0x00007ff8051db02f thread_start + 15


In [None]:
# Create dataframe from results
linkedin_jobs = pd.DataFrame([company_names, job_title, location, description], 
                             index=["company","job_title","location","description"]).T
linkedin_jobs