# Scraping Job info from LinkedIn

<a href="https://medium.com/nerd-for-tech/linked-in-web-scraper-using-selenium-15189959b3ba">Tutorial here.</a>


In [None]:
import sys
sys.path.append('..')

import time
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from bot.Scraper import Scraper
from src.database.csv import save_to_csv

## Initialisation

In [None]:
# Folder passed to save_to_csv as the destination for the scraped data
RAW_FOLDER = "../data/raw/"

In [2]:
# Create the scraper object used by the cells below (constructed with delay=1)
scraper = Scraper(delay=1)

# Log in to LinkedIn; no credentials are passed here, so Scraper.login()
# has to obtain them on its own
scraper.login()

## Search jobs

### Debugging

In [None]:
# Debug cell: run a single search by title and location, then inspect the
# pagination by hand.
job_title = 'data scientist'
job_location = 'canada'
scraper.search_jobs(job_title, job_location)

# Pause for two seconds before asking the scraper for the pagination buttons
# (presumably to let the results page finish rendering -- TODO confirm).
time.sleep(2)
pagination_buttons = scraper.get_pagination_buttons()

jobs = scraper.get_current_page_jobs()
# Look the buttons up again straight from the driver; this overwrites the
# `pagination_buttons` value obtained from the scraper helper above.
pagination = scraper.driver.find_element(By.CLASS_NAME, "jobs-search-pagination__pages")
pagination_buttons = pagination.find_elements(By.XPATH, './/button')
# Click the first button found inside the pagination container.
pagination_buttons[0].click()

### Collect Job Data

Search for jobs and loop through all pages to get the data. 

In [None]:
def collect_job_data(job_title, job_location, delay=1):
    """Search for a job title and scrape every page of the results.

    Uses the notebook-level ``scraper`` object for all browser interaction.

    Args:
        job_title: Title to search for, passed to ``scraper.search_jobs``.
        job_location: Location to search in, passed to ``scraper.search_jobs``.
        delay: Seconds to sleep after submitting the search, before the
            pagination buttons are looked up.

    Returns:
        list: One entry per scraped page, each being whatever
        ``scraper.get_current_page_jobs()`` returned for that page. The
        result is therefore nested (pages -> jobs), not flat. Errors are
        printed rather than raised, so the list may be shorter than the
        number of pages found.
    """
    # Search jobs by title and location
    scraper.search_jobs(job_title, job_location)
    time.sleep(delay)

    # The button count is treated as the page count (one button per page
    # is assumed -- verify against Scraper.get_pagination_buttons).
    page_count = len(scraper.get_pagination_buttons())
    print(f"{page_count} pages found")

    all_jobs = []
    # When no pagination buttons are found, the page currently shown is
    # still scraped once, hence max(..., 1).
    for page_index in range(max(page_count, 1)):
        try:
            # get the jobs of the current page
            all_jobs.append(scraper.get_current_page_jobs())
        except Exception as e:
            print('The was an error...')
            print(e)

        # The last page has been handled; there is no next button to click.
        if page_index + 1 >= page_count:
            break

        try:
            # Get the buttons again on every page to avoid a
            # StaleElementReferenceException, then move to the next page.
            scraper.get_pagination_buttons()[page_index + 1].click()
        except Exception as e:
            print('The was an error...')
            print(e)
            # Navigation failed: stop here instead of re-scraping the same
            # page, and return what has been collected so far.
            break

    return all_jobs

In [None]:
# def flatten_jobs_array(nested_job_data):
#     flattened_jobs = []

#     for job_group in nested_job_data:
#         if job_group is None:
#             flattened_jobs.append([None, None, None, None])
#             continue

#         for job_entry in job_group:
#             if job_entry is None:
#                 flattened_jobs.append([None, None, None, None])
#                 continue

#             job_details = []

#             for field in job_entry:
#                 try:
#                     if isinstance(field, WebElement):
#                         job_details.append(field)  # Extract and clean text
#                     else:
#                         job_details.append(field)
#                 except Exception:
#                     job_details.append(field)

#             flattened_jobs.append(job_details)

#     return flattened_jobs


from src.utils.utils import flatten_jobs_array

### Run the search

In [None]:
# Job titles to scrape; each one is searched separately
job_titles = [
    "data scientist",
    "llm engineer",
    "ai engineer",
    "machine learning engineer",
    "mlops engineer",
    "ai developer",
    "generative ai engineer",
]
# Location used for every search
job_location = "canada"
all_jobs = list()

In [None]:
# Scrape each job title in turn and save its results under RAW_FOLDER,
# using the title as the file name.
for title in job_titles:
    try:
        print(f"Collecting data for {title}...")

        # Scraped results for this title, then passed through flatten_jobs_array
        all_jobs = collect_job_data(title, job_location, delay=2)
        flattened_jobs = flatten_jobs_array(all_jobs)

        columns = ["Job Title", "Company", "City", "Work Mode", "Description"]
        save_to_csv(data=flattened_jobs, folder=RAW_FOLDER, filename=title, colnames=columns)
    except Exception as err:
        # Best effort: report the failure and carry on with the next title
        print(f"An error occurred while collecting data for {title}:", err, sep="\n")


Collecting data for generative ai engineer...
Found 5 search bars
Found 4 pagination buttons
4 pages found
button <selenium.webdriver.remote.webelement.WebElement (session="bb6a9bbcff1162c4a4dd5ee791e377ff", element="f.FBBCE7F329BA04BFEEDAF0097FBB2F05.d.FE0F3635C8E071E5B9AAC0AD5CAECEDB.e.3198")>
Found 4 pagination buttons
The was an error...
Message: element not interactable
  (Session info: chrome=137.0.7151.122)
Stacktrace:
	GetHandleVerifier [0x0x7ff6a6e4cda5+78885]
	GetHandleVerifier [0x0x7ff6a6e4ce00+78976]
	(No symbol) [0x0x7ff6a6c099fc]
	(No symbol) [0x0x7ff6a6c61c64]
	(No symbol) [0x0x7ff6a6c53654]
	(No symbol) [0x0x7ff6a6c88b8a]
	(No symbol) [0x0x7ff6a6c52f06]
	(No symbol) [0x0x7ff6a6c88da0]
	(No symbol) [0x0x7ff6a6cb122f]
	(No symbol) [0x0x7ff6a6c88963]
	(No symbol) [0x0x7ff6a6c516b1]
	(No symbol) [0x0x7ff6a6c52443]
	GetHandleVerifier [0x0x7ff6a7124eed+3061101]
	GetHandleVerifier [0x0x7ff6a711f33d+3037629]
	GetHandleVerifier [0x0x7ff6a713e592+3165202]
	GetHandleVerifier [0x0x

In [None]:
# Close the scraper's driver now that scraping is done
# (presumably this ends the browser session -- confirm in Scraper.close_driver)
scraper.close_driver()