### Scrape Data Analyst (DA) job postings from LinkedIn and analyze them

#### libs and environment setup  

In [1]:
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys



#### Function definitions

In [57]:
# Sign in to LinkedIn through the login form
def loginLinkedIn(browser, email, password):
  """Log the given browser session in to LinkedIn.

  Opens the login page, types the credentials into the form and clicks the
  submit button. The function does not check whether the login succeeded.

  Args:
    browser: Selenium WebDriver instance to drive.
    email: value typed into the 'username' input.
    password: value typed into the 'password' input.

  Returns:
    The same browser object that was passed in.
  """
  login_url = 'https://www.linkedin.com/login'
  browser.get(login_url)

  # fixed 4s pause so the form has time to render before its inputs are looked up
  time.sleep(4)

  # the element ids were read off the page with the Chrome inspector
  credentials = (('username', email), ('password', password))
  for field_id, value in credentials:
    browser.find_element(By.ID, field_id).send_keys(value)

  # clicking the primary button submits the form
  submit_button = browser.find_element(By.CLASS_NAME, 'btn__primary--large')
  submit_button.click()

  return browser

# Scroll the page down one screen at a time until the bottom is passed
def scroll(browser, timeout):
  """Scroll the current page downwards in screen-height steps.

  After every step the function pauses and re-reads the document height,
  because the page can grow while it is being scrolled. It stops once the
  next scroll target would lie beyond the current document height.

  Args:
    browser: object exposing Selenium's execute_script(script).
    timeout: pause in seconds after each scroll step.

  Returns:
    None.
  """
  # height of the screen as reported by the page
  viewport = browser.execute_script("return window.screen.height;")
  step = 1
  keep_going = True

  while keep_going:
    # move to the top edge of the step-th screen
    browser.execute_script(f"window.scrollTo(0, {viewport}*{step});")
    step += 1
    time.sleep(timeout)
    # the document height may have changed after scrolling, so read it again
    page_height = browser.execute_script("return document.body.scrollHeight;")
    # continue only while the next target is still inside the document
    keep_going = viewport * step <= page_height

# Collect the job ids listed on the search result pages
def get_linkedin_job_data(browser, base_url, max_results, start_page=0):
  """Collect job ids from paginated LinkedIn job search results.

  Each result page is requested by appending '&start=<offset>' to base_url,
  where the offset is the page index times the page size (25). The value of
  the 'data-occludable-job-id' attribute is read from every list item.

  Args:
    browser: logged-in Selenium WebDriver instance.
    base_url: search URL without the '&start=' parameter.
    max_results: maximum number of job ids to collect.
    start_page: zero-based index of the first page to fetch. Pass a larger
      value to resume an interrupted scrape. Defaults to the first page.

  Returns:
    A pandas DataFrame with a single 'job_id' column holding at most
    max_results ids, in the order they were found.
  """
  page_size = 25
  job_ids = []
  counter = 0  # list items examined so far, used for progress output only

  # round up so that a trailing partial page is fetched as well
  num_pages = int(-(-max_results // page_size))

  for p in range(start_page, start_page + num_pages):
    if len(job_ids) >= max_results:
      break

    # url of each page
    url = base_url + '&start=' + str(p * page_size)

    # get each page and give the result list time to render
    browser.get(url)
    time.sleep(4)

    # read the job brief from the list
    elements = browser.find_elements(By.CLASS_NAME, "jobs-search-results__list-item")
    print("size of elements: ", len(elements))

    for e in elements:
      # check before counting, so that exactly max_results ids can be stored
      if len(job_ids) >= max_results:
        break

      counter += 1
      print(counter)

      try:
        job_id = e.get_attribute("data-occludable-job-id")
      except WebDriverException as exc:
        print("didn't find job id at ", counter, exc)
        continue

      # get_attribute returns None when the attribute is absent
      if job_id:
        job_ids.append(job_id)
      else:
        print("didn't find job id at ", counter)

  df = pd.DataFrame(job_ids, columns=['job_id'])
  return df

def test_url(browser, url):
  """Debug helper: load a search URL and print the job id of every list item.

  Args:
    browser: Selenium WebDriver instance.
    url: full search result URL to open.

  Returns:
    None. The element count, a running index and each
    'data-occludable-job-id' value are printed.
  """
  browser.get(url)
  # short pause so the result list has time to render
  time.sleep(1)

  list_items = browser.find_elements(By.CLASS_NAME, "jobs-search-results__list-item")
  print("size of elements: ", len(list_items))

  for position, item in enumerate(list_items, start=1):
    print(position)
    print(item.get_attribute("data-occludable-job-id"))

# Read one text value from the job page, falling back to '' when the lookup fails
def _read_text(browser, by, locator, field_name, job_id):
  """Return the text of the first element matching (by, locator), or ''.

  A failed lookup is reported on stdout and swallowed, so one missing field
  does not abort the extraction of the remaining fields.
  """
  try:
    return browser.find_element(by, locator).text
  except WebDriverException:
    print(field_name + " exception occurred", '', job_id)
    return ''


# Translate one top-card insight line into the output fields it carries
def _parse_job_insight(insight_type, insight_content):
  """Return a dict of the output fields described by a single insight.

  Parts that are missing from the text (no '·' separator, no space) yield
  empty strings instead of raising. Unknown insight types yield {}.
  """
  text = insight_content.strip()

  if insight_type == "people":
    # e.g. "1 connection" / "2 school alumni"
    count, _, kind = text.partition(' ')
    return {'connection_count': count.strip(), 'connection_type': kind.strip()}

  if insight_type not in ("job", "company-icon"):
    return {}

  parts = [part.strip() for part in text.split('·')]
  first = parts[0]
  second = parts[1] if len(parts) > 1 else ''

  if insight_type == "job":
    # e.g. "Full-time · Associate"
    return {'job_type': first, 'job_level': second}

  # "company-icon", e.g. "5,001-10,000 employees · Technology, Information and Internet"
  return {'company_size': first.split(' ')[0], 'company_industry': second}


# Get the job info from the job detail page
def get_job_info(browser, base_url, job_id):
  """Load one job page and extract its details.

  Every field is extracted on a best-effort basis: a field that cannot be
  found is reported on stdout and left as an empty string.

  Args:
    browser: logged-in Selenium WebDriver instance.
    base_url: URL prefix of a job page; str(job_id) is appended to it.
    job_id: id of the job to load.

  Returns:
    A list with 14 entries, in this order:
      job_id,
      job_title         (e.g. "Senior Data Analyst"),
      company_name      (e.g. "Agoda"),
      company_link      (href of the company name link),
      job_location      (workplace type, e.g. "Hybrid", "On-site"),
      publish_date      (e.g. "1 week ago"),
      applicant_count   (first word of the applicant count text),
      job_type          (e.g. "Full-time"),
      job_level         (e.g. "Associate", "Entry level"),
      company_size      (e.g. "5,001-10,000"),
      company_industry  (e.g. "Technology, Information and Internet"),
      connection_count  (e.g. "1"),
      connection_type   (e.g. "connection", "school alumni"),
      job_description   (description text with newlines replaced by spaces).
  """
  # Load the job page
  browser.get(base_url + str(job_id))
  time.sleep(1)

  # find the job title
  job_title = _read_text(browser, By.TAG_NAME, 'h1', 'job_title', job_id)

  # find the company name; it is a link whose href points at the company page
  company_name = ''
  company_link = ''
  try:
    company_anchor = browser.find_element(By.CLASS_NAME, 'jobs-unified-top-card__company-name').find_element(By.TAG_NAME, 'a')
    company_name = company_anchor.text
    company_link = company_anchor.get_attribute('href')
  except WebDriverException:
    print("company_name exception occurred", company_name, job_id)

  # find the publish_date
  publish_date = _read_text(browser, By.CLASS_NAME, 'jobs-unified-top-card__posted-date', 'publish_date', job_id)

  # find the applicant_count; only the leading word of the text is kept
  applicant_count = _read_text(browser, By.CLASS_NAME, 'jobs-unified-top-card__applicant-count', 'applicant_count', job_id).split(' ', 1)[0]

  # find all the job insights
  insight_fields = {
    'job_type': '',
    'job_level': '',
    'company_size': '',
    'company_industry': '',
    'connection_count': '',
    'connection_type': '',
  }
  try:
    job_insights = browser.find_elements(By.CLASS_NAME, 'jobs-unified-top-card__job-insight')
  except WebDriverException as exc:
    print("job_insights exception occurred", exc, job_id)
    job_insights = []

  for insight in job_insights:
    try:
      insight_type = insight.find_element(By.TAG_NAME, 'li-icon').get_attribute('type')
      insight_content = insight.find_element(By.TAG_NAME, 'span').text
    except WebDriverException:
      # skip this insight only; values from a previous iteration must not be reused
      print("insight_type exception occurred", job_id)
      continue

    insight_fields.update(_parse_job_insight(insight_type, insight_content))

  # find the job_location
  job_location = _read_text(browser, By.CLASS_NAME, 'jobs-unified-top-card__workplace-type', 'job_location', job_id).strip()

  # find the job description and flatten it to a single line
  job_description = _read_text(browser, By.CLASS_NAME, 'jobs-box__html-content', 'job_description', job_id).strip()
  job_description = job_description.replace('\n', ' ')

  return [
    job_id,
    job_title,
    company_name,
    company_link,
    job_location,
    publish_date,
    applicant_count,
    insight_fields['job_type'],
    insight_fields['job_level'],
    insight_fields['company_size'],
    insight_fields['company_industry'],
    insight_fields['connection_count'],
    insight_fields['connection_type'],
    job_description,
  ]

def get_job_page_data(browser, job_id_list_file):
  """Fetch the job page of every id listed in a CSV file.

  Args:
    browser: logged-in Selenium WebDriver instance, passed on to
      get_job_info.
    job_id_list_file: path of a CSV file with a 'job_id' column.

  Returns:
    A pandas DataFrame with one row per distinct job id and the 14 columns
    listed in `columns` below.
  """
  columns = [
    'job_id', 'job_title', 'company_name', 'company_link', 'job_location',
    'publish_date', 'applicant_count', 'job_type', 'job_level',
    'company_size', 'company_industry', 'connection_count',
    'connection_type', 'job_description',
  ]

  # read ids from csv file saved in previous step
  df = pd.read_csv(job_id_list_file)

  # blank cells are read as NaN; drop them so no '.../nan' page is requested
  job_ids = df['job_id'].dropna().drop_duplicates()
  # a column that contained NaN is parsed as float; restore integer ids so
  # the URL does not end in '.0'
  if pd.api.types.is_float_dtype(job_ids):
    job_ids = job_ids.astype('int64')

  # fetch all the job pages by job_id
  base_url = "https://www.linkedin.com/jobs/view/"
  job_data = [get_job_info(browser, base_url, job_id) for job_id in job_ids.tolist()]

  return pd.DataFrame(job_data, columns=columns)
  

#### Open the Chrome browser and log in to LinkedIn

In [4]:
import os
from getpass import getpass

from selenium.webdriver.chrome.service import Service

# Create the webdriver instance.
# Selenium 4 takes the driver path through a Service object; passing the path
# positionally is deprecated.
browser = webdriver.Chrome(service=Service("./chromedriver.exe"))

# Credentials must not be stored in the notebook. They are read from the
# environment, with an interactive prompt as fallback (the password prompt
# does not echo the input).
email = os.environ.get("LINKEDIN_EMAIL") or input("LinkedIn email: ")
password = os.environ.get("LINKEDIN_PASSWORD") or getpass("LinkedIn password: ")

browser = loginLinkedIn(browser, email, password)


  browser = webdriver.Chrome("./chromedriver.exe")


#### Scrape the jobs data
Search for the job title 'Data Analyst' with location 'Stockholm' manually before scraping the search results.
After searching manually, we find that the results are listed in the left panel, 25 items per page, and that there are only around 40 pages in total,
even though the search reports 3048 results.

The URL of the first results page looks like this:
https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true

When I click on the page "2", the URL becomes
https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true&start=25

And for the page "3", the URL is
https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true&start=50

So we can generate the URL of page n (n = 1, 2, 3, ...) by appending "&start=25*(n-1)" to the base URL; with a zero-based page index p this is simply "&start=25*p".

To see the job description of each job in the result list, we need to simulate a mouse "click" on each item. Selenium's Action Chains can help with that.


In [14]:
# Search URL for 'data analyst' jobs in Stockholm; the '&start=' offset of
# each result page is appended by get_linkedin_job_data
base_url = (
    "https://www.linkedin.com/jobs/search/"
    "?currentJobId=3185920126"
    "&geoId=100907646"
    "&keywords=data%20analyst"
    "&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden"
    "&refresh=true"
)

# Collect up to 3000 job ids and store them for the next step
df = get_linkedin_job_data(browser=browser, base_url=base_url, max_results=3000)
df.to_csv(path_or_buf='jobs.csv', index=False)

# preview the first rows
df.head(n=10)

size of elements:  8
1
2
3
4
5
6
7
8


KeyboardInterrupt: 

In [59]:
# Visit the job page of every id stored in 'jobs.copy.csv' and collect the details
df_jobs = get_job_page_data(browser=browser, job_id_list_file='jobs.copy.csv')

# df_jobs.head(10)  # uncomment to preview the first rows

# Persist the scraped job details without the index column
df_jobs.to_csv(path_or_buf='job-data.csv', index=False)
 

985
