In [7]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import logging
import pickle
import os
import numpy as np

In [15]:
class LinkedInBot:
    def __init__(self, delay=5):
        if not os.path.exists("data"):
            os.makedirs("data")
        log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        logging.basicConfig(level=logging.INFO, format=log_fmt)
        self.delay=delay
        logging.info("Starting driver")
        self.driver = webdriver.Chrome(ChromeDriverManager().install())

    def login(self, email, password):
        """Go to linkedin and login"""
        # go to linkedin:
        logging.info("Logging in")
        self.driver.maximize_window()
        self.driver.get('https://www.linkedin.com/login')
        time.sleep(self.delay)

        self.driver.find_element('id','username').send_keys(email)
        self.driver.find_element('id','password').send_keys(password)

        self.driver.find_element('id','password').send_keys(Keys.RETURN)
        time.sleep(self.delay)

    def save_cookie(self, path):
        with open(path, 'wb') as filehandler:
            pickle.dump(self.driver.get_cookies(), filehandler)

    def load_cookie(self, path):
        with open(path, 'rb') as cookiesfile:
            cookies = pickle.load(cookiesfile)
            for cookie in cookies:
                self.driver.add_cookie(cookie)

    def search_linkedin(self, keywords, location):
        """Enter keywords into search bar
        """
        logging.info("Searching jobs page")
        self.driver.get("https://www.linkedin.com/jobs/")
        # search based on keywords and location and hit enter
        self.wait_for_element_ready(By.CLASS_NAME, 'jobs-search-box__text-input')
        time.sleep(self.delay)
        search_bars = self.driver.find_elements(By.CLASS_NAME, 'jobs-search-box__text-input')
        search_keywords = search_bars[0]
        search_keywords.send_keys(keywords)
        time.sleep(self.delay)
        search_location = search_bars[3]
        time.sleep(self.delay)
        search_location.send_keys(location)
        time.sleep(self.delay)
        search_location.send_keys(Keys.RETURN)
        logging.info("Keyword search successful")
        time.sleep(self.delay)
    
    def wait(self, t_delay=None):
        """Just easier to build this in here.
        Parameters
        ----------
        t_delay [optional] : int
            seconds to wait.
        """
        delay = self.delay if t_delay == None else t_delay
        time.sleep(delay)

    def scroll_to(self, job_list_item):
        """Just a function that will scroll to the list item in the column 
        """
        self.driver.execute_script("arguments[0].scrollIntoView();", job_list_item)
        #self.wait_for_element_ready(By.CLASS_NAME, job_list_item)
        job_list_item.click()
        time.sleep(self.delay)
    
    def get_position_data(self, job):
        """Gets the position data for a posting.
        Parameters
        ----------
        job : Selenium webelement
        Returns
        -------
        list of strings : [position, company, location, details]
        """
        try:
            position = job.text.split('\n')[0]
        except:
            position = np.nan
        try:
            company = job.text.split('\n')[1]
        except:
            company = np.nan
        try:
            location = job.text.split('\n')[2]
        except:
            location = np.nan
        #[position, company, location] = job.text.split('\n')[:3]
        try:
            details = self.driver.find_element('id', "job-details").text
        except:
            details = np.nan
        return [position, company, location, details]

    def wait_for_element_ready(self, by, text):
        try:
            WebDriverWait(self.driver, self.delay).until(EC.presence_of_element_located((by, text)))
        except TimeoutException:
            logging.debug("wait_for_element_ready TimeoutException")
            pass

    def close_session(self):
        """This function closes the actual session"""
        logging.info("Closing session")
        self.driver.close()

    def run(self, email, password, keywords, location):
        if os.path.exists("data/cookies.txt"):
            self.driver.get("https://www.linkedin.com/")
            self.load_cookie("data/cookies.txt")
            self.driver.get("https://www.linkedin.com/")
        else:
            self.login(
                email=email,
                password=password
            )
            self.save_cookie("data/cookies.txt")

        logging.info("Begin linkedin keyword search")
        self.search_linkedin(keywords, location)
        self.wait()

        # scrape pages,only do first 8 pages since after that the data isn't 
        # well suited for me anyways:  
        self.position_list = []
        self.company_list = []
        self.location_list = []
        self.details_list = []

        for page in range(2, 3):
            # get the jobs list items to scroll through:
            jobs = self.driver.find_elements(By.CLASS_NAME, "occludable-update")
            for job in jobs:
                self.scroll_to(job)
                #[position, company, location, details] = self.get_position_data(job)
                descriptions = self.get_position_data(job)    
                self.position_list.append(descriptions[0])
                self.company_list.append(descriptions[1])
                self.location_list.append(descriptions[2])
                self.details_list.append(descriptions[3])

            # go to next page:
            bot.driver.find_element(By.XPATH, f"//button[@aria-label='Page {page}']").click()
            bot.wait()
        logging.info("Done scraping.")
        logging.info("Closing DB connection.")
        bot.close_session()


In [16]:
# Declare inputs 
print('Input Email')
email = 'emilosasso@gmail.com' #input()
print('Input Password')
password = 'lost4815162342'#input()

email = email
password = password
bot = LinkedInBot()
bot.run(email, password, "Data Scientist", "Canada")

2022-08-09 22:52:45,815 - root - INFO - Starting driver
2022-08-09 22:52:45,895 - WDM - INFO - Get LATEST chromedriver version for google-chrome 104.0.5112


Input Email
Input Password


2022-08-09 22:52:46,135 - WDM - INFO - Driver [/Users/elosasso/.wdm/drivers/chromedriver/mac64_m1/104.0.5112/chromedriver] found in cache
  self.driver = webdriver.Chrome(ChromeDriverManager().install())
2022-08-09 22:52:54,844 - root - INFO - Begin linkedin keyword search
2022-08-09 22:52:54,846 - root - INFO - Searching jobs page
2022-08-09 22:53:16,492 - root - INFO - Keyword search successful


ElementNotInteractableException: Message: element not interactable: element has zero size
  (Session info: chrome=104.0.5112.79)
Stacktrace:
0   chromedriver                        0x000000010320eae0 chromedriver + 3828448
1   chromedriver                        0x00000001031a3f1c chromedriver + 3391260
2   chromedriver                        0x0000000102e9cfcc chromedriver + 217036
3   chromedriver                        0x0000000102ecdbc8 chromedriver + 416712
4   chromedriver                        0x0000000102ec391c chromedriver + 375068
5   chromedriver                        0x0000000102ec3308 chromedriver + 373512
6   chromedriver                        0x0000000102ef5b2c chromedriver + 580396
7   chromedriver                        0x0000000102ec2010 chromedriver + 368656
8   chromedriver                        0x00000001031e439c chromedriver + 3654556
9   chromedriver                        0x00000001031e7c4c chromedriver + 3669068
10  chromedriver                        0x00000001031ec14c chromedriver + 3686732
11  chromedriver                        0x00000001031e8654 chromedriver + 3671636
12  chromedriver                        0x00000001031c6b40 chromedriver + 3533632
13  chromedriver                        0x0000000103200414 chromedriver + 3769364
14  chromedriver                        0x0000000103200578 chromedriver + 3769720
15  chromedriver                        0x00000001032150f0 chromedriver + 3854576
16  libsystem_pthread.dylib             0x000000018a9bf878 _pthread_start + 320
17  libsystem_pthread.dylib             0x000000018a9ba5e0 thread_start + 8


In [6]:
len(bot.details_list)

25