In [2]:
pip install selenium




In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Read the LinkedIn credentials BEFORE launching the browser, so that a
# missing or malformed file fails fast without leaving an orphaned
# Chrome/chromedriver process behind.
# Expected file layout: line 1 = user name, line 2 = password.
with open('user_credentials.txt', 'r', encoding="utf-8") as file:
    user_credentials = [line.strip() for line in file]

if len(user_credentials) < 2 or not all(user_credentials[:2]):
    raise ValueError(
        "user_credentials.txt must contain the user name on the first line "
        "and the password on the second line"
    )

user_name = user_credentials[0]  # First line
password = user_credentials[1]   # Second line

# Define the path to the Chrome driver executable
path = r"C:\Users\Dell\OneDrive\Downloads\chromedriver.exe"

# Create a Service object with the driver path
service = Service(path)

# Initialize the Chrome WebDriver with the Service object
driver = webdriver.Chrome(service=service)

try:
    # Maximize the browser window
    driver.maximize_window()

    # Set an implicit wait for elements
    driver.implicitly_wait(10)

    # Enter the site
    driver.get("https://www.linkedin.com/login")
    time.sleep(2)

    # Wait for the username and password fields to be present, then fill them
    username_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#username'))
    )
    username_field.send_keys(user_name)

    password_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#password'))
    )
    password_field.send_keys(password)

    # Click the login button
    login_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[type="submit"]'))
    )
    login_button.click()

    # Navigate to the Jobs search page directly
    driver.get("https://www.linkedin.com/jobs/search/?keywords=Data%20Analyst&location=Canada")
    time.sleep(10)  # Increase wait for the page to load
except Exception:
    # Login or navigation failed: close the browser instead of leaking it,
    # then let the original error surface unchanged.
    driver.quit()
    raise

# Initialize lists to store job details.
# `links` is filled while paging through the search results; the remaining
# lists are filled in lockstep (one entry per successfully scraped posting).
links = []
companies = []
job_titles = []
locations = []
post_dates = []
workplace_types = []
job_types = []
position_types = []

# Walk the paginated search results, harvesting job-view URLs page by page.
# The loop ends when the "Next" button cannot be clicked or any page-level
# error occurs.
while True:
    try:
        # The results pane has to be in the DOM before its cards can be read
        results_pane = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'scaffold-layout__list-container')
            )
        )
        print("Jobs block found")

        # Every list item inside the pane is one job offer card
        cards = results_pane.find_elements(
            By.CSS_SELECTOR, '.jobs-search-results__list-item'
        )
        print(f"Found {len(cards)} job listings")

        for card in cards:
            # Bring the card into view before looking for its title link
            driver.execute_script("arguments[0].scrollIntoView();", card)

            # The wait is scoped to the card, so the selector is resolved
            # relative to this card rather than the whole page
            title_anchor = WebDriverWait(card, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'a.job-card-list__title')
                )
            )
            href = title_anchor.get_attribute('href')

            # Keep only real job-view URLs
            if not href or not href.startswith('https://www.linkedin.com/jobs/view'):
                continue
            links.append(href)
            print(f"Collected link: {href}")

        # Advance to the next results page; any failure here is treated as
        # "no more pages" and ends the crawl
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'button.jobs-search-pagination__button--next')
                )
            ).click()
            print("Navigating to the next page")
            time.sleep(5)  # Give the next page time to load
        except Exception:
            print("No more pages found or navigation to the next page failed.")
            break

    except Exception as page_error:
        print(f"Error on page: {page_error}")
        break  # A page-level error stops the crawl

# Show everything that was collected
print("Collected job links:")
for collected_link in links:
    print(collected_link)

# Helper: turn raw job-insight strings into de-duplicated category labels
def _classify_insights(insight_texts):
    """Map job-insight strings to workplace, job and position labels.

    Args:
        insight_texts: Iterable of the visible text of each job-insight
            element on a posting.

    Returns:
        A ``(workplace_types, job_types, position_types)`` tuple of lists.
        Each list holds unique labels in first-seen order, so the joined
        output is the same on every run (a ``set`` would make the order
        depend on string hash randomisation).
    """
    workplace_labels = ("Remote", "Hybrid", "On-site")
    # (lower-case needle, output label), checked in priority order.
    # Matching is case-insensitive because the seniority text appears to be
    # rendered as "Mid-Senior level" on LinkedIn, which a case-sensitive
    # "Mid-senior level" test would never match. NOTE(review): confirm against
    # the live page.
    position_labels = (
        ("associate", "Associate"),
        ("entry level", "Entry Level"),
        ("mid-senior level", "Mid-Senior Level"),
    )

    # Dicts serve as insertion-ordered sets
    workplace, job, position = {}, {}, {}

    for text in insight_texts:
        # A workplace type only counts when exactly one of the three labels
        # is mentioned in this insight
        mentioned = [label for label in workplace_labels if label in text]
        if len(mentioned) == 1:
            workplace[mentioned[0]] = None

        if "Full-time" in text:
            job["Full-time"] = None
        elif "Part-time" in text:
            job["Part-time"] = None

        lowered = text.lower()
        for needle, label in position_labels:
            if needle in lowered:
                position[label] = None
                break  # first match wins, like an if/elif chain

    return list(workplace), list(job), list(position)


# Function to extract job details from each job link
def extract_job_details(job_url):
    """Open one job posting and append its details to the module-level lists.

    On success exactly one entry is appended to each of ``companies``,
    ``job_titles``, ``locations``, ``post_dates``, ``workplace_types``,
    ``job_types`` and ``position_types``. If any element cannot be read, the
    error is printed and nothing is appended, so the lists stay aligned.

    Args:
        job_url: URL of the job posting to load in the shared ``driver``.

    Returns:
        None.
    """
    driver.get(job_url)
    time.sleep(5)  # Allow the page to load

    try:
        # Extract the company name
        company_name_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.job-details-jobs-unified-top-card__company-name a'))
        )
        company_name = company_name_element.text

        # Extract the job title
        job_title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.job-details-jobs-unified-top-card__job-title h1'))
        )
        job_title = job_title_element.text

        # Extract location and post date: the primary description is a
        # '·'-separated line whose first two parts are used here
        primary_description_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.job-details-jobs-unified-top-card__primary-description-container'))
        )
        description_parts = primary_description_element.text.split('·')

        if len(description_parts) >= 2:
            location = description_parts[0].strip()   # The location
            post_date = description_parts[1].strip()  # The post date
        else:
            location = None
            post_date = None

        # Extract workplace, job type, and position type
        job_insights_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.job-details-jobs-unified-top-card__job-insight'))
        )
        workplace_type, job_type, position_type = _classify_insights(
            insight.text for insight in job_insights_elements
        )

        # Store the job details. All appends happen together at the end so a
        # failure above never leaves the lists with different lengths.
        companies.append(company_name)
        job_titles.append(job_title)
        locations.append(location)
        post_dates.append(post_date)
        workplace_types.append(', '.join(workplace_type))
        job_types.append(', '.join(job_type))
        position_types.append(', '.join(position_type))

    except Exception as e:
        # Best effort: report the problem and skip this posting
        print(f"Error extracting job details: {str(e)}")

# Extract details for each collected job link.
# The try/finally guarantees that whatever was scraped so far is written to
# disk and the browser is closed even if a page load raises or the run is
# interrupted part-way through.
try:
    for link in links:
        extract_job_details(link)
finally:
    try:
        # Create a DataFrame from the collected data
        job_data = {
            "Company": companies,
            "Job Title": job_titles,
            "Location": locations,
            "Post Date": post_dates,
            "Workplace Type": workplace_types,
            "Job Type": job_types,
            "Position Type": position_types,
        }
        job_df = pd.DataFrame(job_data)

        # Save the DataFrame to a CSV file
        job_df.to_csv('job_offers.csv', index=False)
    finally:
        # Close the browser after scraping, even if the export itself failed
        driver.quit()


Jobs block found
Found 25 job listings
Collected link: https://www.linkedin.com/jobs/view/4052387360/?eBP=CwEAAAGS2Mt9ApEvUBT4jVH-jw5_klZR71pY-qgQ_MFBckMfCPypgLzz_CYFKvcoTm74V_Smi711mz_vF4IbfYFT4Z1lCtEg29oaLElmGPB5yVJdcHUbmDNVxTvy48Nc0LByQa3I_OFEOxjZCGv04MbS6TcfvZ6MO5p-bPtPDBbXJVgWolMAICzfUjZKNbM1IG8xydI9KTo5L5f_yb92K71ENQW_UBY8EGIL1ca9iy6dlom3BtONibuAFUTtEOgqI_Pegd9FFGThD5zd_cpUrfaHnWBFP2T58FmXjMLaEveAYTk3cgqEHIJoMPqtjv-_sHxuA7PhP0bsBB_XJYixwzTQEaGYewHWnUeCFUYa9o7uR68uNx-ClNE5MXRt1MJdJNcmAZWudSUSuxzvcGaZZdUmU_kZp7oZXAkRmjx1Yb0umRWLdsLo4gX4bG4Z3px-XfZIpipHyMHx-dUclbi4uBGhLQwsiQ-z9hIQqg&refId=51FKY9ihS0J3DFatsJb54Q%3D%3D&trackingId=xCW%2BIljiM2bpE%2F%2Bqs3TwFg%3D%3D&trk=flagship3_search_srp_jobs
Collected link: https://www.linkedin.com/jobs/view/4061297966/?eBP=CwEAAAGS2Mt9AiINR55sMI0HLGtgv9ncn4G9-OSyhHiaI7PQOAnLWbfb_Js3zSF2evEQu_jrL_DhA4zrSQAaZ_YEAKV0CazOvCW2CNvC_75_Z2Shxp58ctkl3qEn854DbQn7V63PS4H40PLNXSw4jRwxlZjlZuSbwOqktC7_ECSyPye56xozKQOmZoM0WB2dgRTjcsO2H-9ZhWM_4Lxf6G4XrgErnGHVctNxp