# The Scraper Bot

In this notebook, we create a bot that will automatically collect job postings from LinkedIn.

Shoutout to Matan Freedman for the inspiration and the initial code structure:
<a href="https://medium.com/nerd-for-tech/linked-in-web-scraper-using-selenium-15189959b3ba">article here.</a>


In [1]:
import sys
sys.path.append('..')

import time
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from src.bot.Scraper import Scraper
from src.database.csv import save_to_csv
from src.utils.utils import flatten_jobs_array

## Initialisation

In [2]:
RAW_FOLDER = '../data/raw/'

In [3]:
# Create a scraper object
scraper = Scraper(delay=1)

# Login to LinkedIn
scraper.login()

Logging in...


## Search jobs

### Debugging

In [4]:
# Search jobs by title and location
job_title = 'data scientist'
job_location = 'canada'
scraper.search_jobs(job_title, job_location)

# Get the  pagination buttons
time.sleep(2)
pagination_buttons = scraper.get_pagination_buttons()

Searching for jobs: data scientist in canada
Found 5 search bars
Getting pagination buttons...
Found 4 pagination buttons


In [5]:
current_page_jobs = scraper.get_current_page_jobs()
current_page_jobs

Getting current page jobs...
Scrolling all job listings...
Reached bottom of job list.
Scrolling all job listings...
Reached bottom of job list.
Found 12 job cards.
Getting job skills...


KeyboardInterrupt: 

In [None]:
pagination_buttons = scraper.get_pagination_buttons()
pagination_buttons[1].click()

### Collect Job Data

Search for jobs and loop through all pages to get the data. 

In [5]:
def collect_job_data(job_title, job_location, delay=3):
    
    # Search jobs by title and location
    scraper.search_jobs(job_title, job_location)
    time.sleep(delay)

    # Get the  pagination buttons
    pagination_buttons = scraper.get_pagination_buttons()
    print(f"{len(pagination_buttons)} pages found")

    # Loop through each page
    # [:-1] is to ignore the last page as we click on the i+1 button
    all_jobs = []
    for i, button in enumerate(pagination_buttons[:-1]):
        time.sleep(delay)
        
        try:
            print("button", button)
            # get the jobs of the current page
            current_page_jobs = scraper.get_current_page_jobs()
            all_jobs.append(current_page_jobs)

            # navigate to the next page
            # button.click()
            # Get the buttons again to avoid a StaleElementReferenceException
            pagination_buttons = scraper.get_pagination_buttons()
            pagination_buttons[i+1].click()

        except StaleElementReferenceException as e:
            print('The was an error...')
            print(e)
        except Exception as e:
            print('The was an error...')
            print(e)

        return all_jobs

### Run the search

In [9]:
# # Search jobs by title and location
# scraper.search_jobs(job_title, job_location)
# time.sleep(3)


# get the jobs of the current page
current_page_jobs = scraper.get_current_page_jobs()


Getting current page jobs...
Scrolling all job listings...
Reached bottom of job list.
Found 20 job cards.
Getting job skills...


KeyboardInterrupt: 

In [7]:
job_titles = ["data scientist", "llm engineer", "ai engineer", "machine learning engineer", "mlops engineer", "ai developer", "generative ai engineer"]
job_location = "canada"
all_jobs = []

In [8]:
# all_jobs = []
# current_page_jobs = scraper.get_current_page_jobs()
jobs_list = scraper.driver.find_elements(By.CLASS_NAME, "job-card-list")
len(jobs_list)

20

In [8]:
for title in job_titles:
    try:
        print(f"Collecting data for {title}...")
        all_jobs = collect_job_data(title, job_location, delay=3)
        flattened_jobs = flatten_jobs_array(all_jobs)
        columns = ["Job Title", "Company", "City", "Work Mode", "Description"]

        save_to_csv(
            data=flattened_jobs, 
            folder=RAW_FOLDER, 
            filename=title,
            colnames=columns
        )
    except Exception as e:
        print(f"An error occurred while collecting data for {title}:")
        print(e)


Collecting data for data scientist...
Searching for jobs: data scientist in canada
Found 5 search bars
Getting pagination buttons...
Found 4 pagination buttons
4 pages found
button <selenium.webdriver.remote.webelement.WebElement (session="830b1fbe909244dcbf608038bee0eae9", element="f.7F345624673A49DA6E3FBFEC1A6BABA4.d.973EDE23C0A85AEA2B34D32DF4AE0463.e.366")>
Getting current page jobs...
Scrolling all job listings...
Reached bottom of job list.
Found 12 job cards.
The was an error...
'int' object is not iterable
An error occurred while collecting data for data scientist:
Shape of passed values is (0, 1), indices imply (0, 5)
Collecting data for llm engineer...
Searching for jobs: llm engineer in canada
Found 5 search bars
Getting pagination buttons...
Found 4 pagination buttons
4 pages found
button <selenium.webdriver.remote.webelement.WebElement (session="830b1fbe909244dcbf608038bee0eae9", element="f.7F345624673A49DA6E3FBFEC1A6BABA4.d.A41F99DF6D45476E2E54A61B027702A9.e.542")>
Getting

KeyboardInterrupt: 

In [None]:
scraper.close_driver()