### Scrape the job data of DA from linkedin and analyze

#### libs and environment setup  

In [102]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import re



#### Function definitions

In [103]:
# Get the linkedin login page
def loginLinkedIn(browser, email, password):
  browser.get('https://www.linkedin.com/login')

  # wait 4s until all elements are loaded.
  time.sleep(4)

  # find the element ids with the Chrome inspector
  # send in username and password the corresponding input elements
  browser.find_element(By.ID, 'username').send_keys(email)
  browser.find_element(By.ID, 'password').send_keys(password)

  # send in 'submit' event
  browser.find_element(By.CLASS_NAME, 'btn__primary--large').click()
  
  return browser;

# scroll window function
def scroll(browser, timeout):
  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
  scroll_pause_time = timeout 
  # get the screen height of the web
  screen_height = browser.execute_script("return window.screen.height;")   
  i = 1

  while True:
    # scroll one screen height each time
    browser.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
    i += 1
    time.sleep(scroll_pause_time)
    # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
    scroll_height = browser.execute_script("return document.body.scrollHeight;")  
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break

# Get the job info from the '<li>' element of search results
def get_job_info(element):
  title = ''
  company_name = ''
  company_link = ''
  location = ''
  job_type = ''
  job_link = ''

  # find the job title
  try:
    title = element.find_element(By.CLASS_NAME, 'job-card-list__title').text
    #print("title: ", title)
  except:
    print("title exception occurred", title)

  # find the company name
  try:
    company_name = element.find_element(By.CLASS_NAME, 'job-card-container__company-name').text
    #print("company name: ", company_name)
  except:
    print("company_name exception occurred", company_name)

  # find the company link
  try:
    company_link = element.find_element(By.CLASS_NAME, 'job-card-container__company-name').get_attribute('href')
    #print("company link: ", company_link)
  except:
    print("company_link exception occurred", company_link)

  # find the location
  try:
    location = element.find_element(By.CLASS_NAME, 'job-card-container__metadata-item').text.strip()
    job_type = element.find_element(By.CLASS_NAME, 'job-card-container__metadata-item--workplace-type').text.strip()
    #print("location: ", location)
    #print("job type: ", job_type)
  except:
    print("location exception occurred", location)

  # find the job link
  try:
    job_link = element.find_element(By.CLASS_NAME, 'job-card-list__title').get_attribute('href')
    #print("job link: ", job_link)
  except:
    print("job_link exception occurred", job_link)
  
  return [['title','company_name','company_link','location','job_type','job_link']]

# Get the data from the search results
def get_linkedin_job_data(browser, base_url, max_results):
  # job data in 
  job_data = [['title','company_name','company_link','location','job_type','job_link']]
  df = pd.DataFrame(job_data)
  
  job_ids = []
  counter = 0

  # traverse all required pages
  num_pages = int(max_results/25)
  for p in range(num_pages):
    if counter >= max_results:
      break;

    # url of each page
    url = base_url + '&start=' + str((p+64)*25)
    
    # get each page
    browser.get(url)
    time.sleep(4)

    # read the job brief from the list
    elements = browser.find_elements(By.CLASS_NAME, "jobs-search-results__list-item")
    print("size of elements: ", len(elements))

    for e in elements:
      counter += 1

      if counter >= max_results:
        break;

      print(counter)

      try:
        job_id = e.get_attribute("data-occludable-job-id")
        job_ids.append(job_id)
      except:
        print("didn't find job id at ", counter)

  df = pd.DataFrame(job_ids, columns = ['job_id'])
  return df

def test_url(browser, url):
  browser.get(url)
  time.sleep(1)
  elements = browser.find_elements(By.CLASS_NAME, "jobs-search-results__list-item")
  print("size of elements: ", len(elements))

  counter = 0
  for e in elements:
    counter += 1
    print(counter)
    print(e.get_attribute("data-occludable-job-id"))  

#### Open the Chrome Browers and login to Linkedin

In [104]:
# Create the webdriver instance
browser = webdriver.Chrome("./chromedriver.exe")

# Credentials
email = "jian@jian.se"
password = "Jordan23!"

browser = loginLinkedIn(browser, email, password)


  browser = webdriver.Chrome("./chromedriver.exe")


#### Scrape the jobs data
Need to search the job title 'Data Analyst' with location 'Stockholm' manually before scraping the search results.
After searching manually, we find the layout of the result listed on the left panel for every 25 items per page and there are around 40 pages in total,
even though the total search results is 3048 for this search.

The URL of the first search is like this.
https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true

When I click on the page "2", the URL becomes
https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true&start=25

And for the page "3", the URL is
https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true&start=50

So that we can generate the URL of each page n by adding "&start=25*n" at the end of base URL.

To be able to see the job description of each jobs in the result list, we need to simulate the mouse event to "Click" on each item. The Action Chains of selenium can help me with it.


In [105]:
# To fetch the job data
base_url = "https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true"

# To fetch all pages
df = get_linkedin_job_data(browser, base_url, 3000)
df.to_csv('jobs.csv', index=False)
df.head(10)

#test_url(browser, "https://www.linkedin.com/jobs/search/?currentJobId=3185920126&geoId=100907646&keywords=data%20analyst&location=Stockholm%2C%20Stockholm%20County%2C%20Sweden&refresh=true&start=9")

#scroll(browser,2)



size of elements:  8
1
2
3
4
5
6
7
8


KeyboardInterrupt: 