## Required Packages

In [2]:
from enum import Enum
from math import ceil
from typing import Optional
import os, time, logging, pandas as pd, re
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException,
    NoSuchWindowException, WebDriverException,
    ElementClickInterceptedException, StaleElementReferenceException
)

from IPython.display import display

# Configure logging once for the whole notebook: INFO level, "LEVEL: message" line format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")



class ExperienceFilter(Enum):
    """Experience-level choices for a job search.

    Each member's value is the string that build_job_search_url() places in the
    ``f_E`` query parameter; the empty value means the parameter is omitted.
    """

    # No experience filter: nothing is added to the URL.
    ALL = ""
    # Codes "1" through "6", from least to most senior.
    INTERNSHIP = "1"
    ENTRY_LEVEL = "2"
    ASSOCIATE = "3"
    MID_SENIOR_LEVEL = "4"
    DIRECTOR = "5"
    EXECUTIVE = "6"

class JobPostTime(Enum):
    """Posting-age choices for a job search.

    Each member's value is the string that build_job_search_url() places in the
    ``f_TPR`` query parameter; the empty value means the parameter is omitted.
    """

    # No posting-time filter: nothing is added to the URL.
    ANY_TIME = ""
    # "r" followed by a window length in seconds.
    PAST_24_HOURS = "r86400"    # 24 * 60 * 60
    PAST_WEEK = "r604800"       # 7 * 86400
    PAST_MONTH = "r2592000"     # 30 * 86400




## Use Chrome driver to search for job postings

In [3]:
# Selenium is an automation tool that controls a web browser. It can perform human actions on web pages, such as clicking, scrolling, and entering text, which can be used to bypass some anti-scraping mechanisms.

# Shared browser handle used by the helpers below; stays None until ensure_window() creates a Chrome driver.
driver = None

def make_driver(headless: bool = False):
    """Start a new Chrome WebDriver session.

    Args:
        headless: When True, Chrome is launched with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance whose options always include ``--lang=en-US``.
    """
    chrome_flags = ["--lang=en-US"]
    if headless:
        # The headless flag is registered before the language flag.
        chrome_flags.insert(0, "--headless=new")

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def ensure_window(drv, headless: bool = False):
    """Return a usable driver, creating a fresh browser when ``drv`` is missing or dead.

    Args:
        drv: An existing WebDriver, or None.
        headless: Passed to make_driver() when a new browser has to be started.

    Returns:
        ``drv`` itself when it still answers and has at least one window;
        otherwise a newly created driver, which is also stored in the
        module-level ``driver``.
    """
    global driver
    if drv is not None:
        try:
            # Reading current_url fails when the session is gone.
            _ = drv.current_url
            if not drv.window_handles:
                raise NoSuchWindowException("No window handles")
        except Exception:
            logging.warning("Driver/window lost. Recreating browser...")
            try:
                drv.quit()
            except Exception:
                # The old session may already be unusable; nothing more to clean up.
                pass
        else:
            return drv

    driver = make_driver(headless=headless)
    return driver

def safe_get(drv, url: str, retries: int = 1, headless: bool = False):
    """Navigate to ``url``, re-validating the browser after each failed attempt.

    Args:
        drv: WebDriver to navigate with.
        url: Address to load.
        retries: Number of extra attempts after the first one.
        headless: Passed to ensure_window() if the browser has to be recreated.

    Returns:
        The driver that is current after the last attempt. Note that it is
        returned even when every attempt raised ``WebDriverException``.
    """
    global driver
    attempts_left = retries + 1
    while attempts_left > 0:
        attempts_left -= 1
        try:
            drv.get(url)
        except WebDriverException as e:
            logging.warning(f"driver.get failed ({e.__class__.__name__}); recreating…")
            drv = ensure_window(drv, headless=headless)
            driver = drv
        else:
            return drv
    return drv


In [4]:
from getpass import getpass

def try_login(drv, email: str, password: str) -> None:
    """Submit the LinkedIn login form and wait for a post-login signal.

    Updates the module-level ``driver`` with the browser that was actually used.

    Args:
        drv: Existing WebDriver (or None; ensure_window() creates one).
        email: Value typed into the ``username`` field.
        password: Value typed into the ``password`` field, followed by RETURN.

    Raises:
        TimeoutException: if the form does not appear within 20 s, or none of
            the post-login signals shows up within 25 s.
    """
    global driver
    driver = ensure_window(drv)
    driver = safe_get(driver, "https://www.linkedin.com/login")

    form_ready = EC.presence_of_element_located((By.ID, "username"))
    WebDriverWait(driver, 20).until(form_ready)

    user_box = driver.find_element(By.ID, "username")
    user_box.clear()
    user_box.send_keys(email)

    pass_box = driver.find_element(By.ID, "password")
    pass_box.clear()
    pass_box.send_keys(password)
    pass_box.send_keys(Keys.RETURN)

    # Any one of these means the form was submitted: the search box of the
    # logged-in page, a verification-PIN field, or a "checkpoint" URL.
    post_login_signals = (
        EC.presence_of_element_located((By.ID, "global-nav-search")),
        EC.presence_of_element_located((By.ID, "input__email_verification_pin")),
        EC.url_contains("checkpoint"),
    )
    WebDriverWait(driver, 25).until(EC.any_of(*post_login_signals))

def _maybe_accept_cookies_and_check_login(drv) -> bool:
    """Click any visible cookie-consent buttons, then report whether the page is usable.

    Returns:
        False when the current URL contains "login" or "checkpoint", or when no
        ``main``/``body`` element appears within 6 s; True otherwise.
    """
    consent_button_css = (
        "button[aria-label*='Accept']",
        "button[data-test-global-nav-cookie-banner-accept]",
        ".artdeco-global-alert-action button",
    )
    for css in consent_button_css:
        # Best effort: a failure on one selector abandons that selector only.
        try:
            for button in drv.find_elements(By.CSS_SELECTOR, css):
                if not (button.is_displayed() and button.is_enabled()):
                    continue
                button.click()
        except Exception:
            pass

    current = drv.current_url
    if any(marker in current for marker in ("login", "checkpoint")):
        return False

    try:
        WebDriverWait(drv, 6).until(EC.presence_of_element_located((By.CSS_SELECTOR, "main, body")))
    except Exception:
        return False
    return True

def login_wall_present(drv):
    """Return True if any modal/dialog element on the page is currently displayed.

    Any error while querying the page is treated as "no wall" (returns False).
    """
    try:
        for modal in drv.find_elements(By.CSS_SELECTOR, '.artdeco-modal, [role="dialog"]'):
            if modal.is_displayed():
                return True
        return False
    except Exception:
        return False


# LinkedIn has many different languages, so the dismiss button could be in different languages. Here we try to cover some common ones.
def dismiss_login_wall(drv, timeout=3) -> bool:
    """Try to close a blocking modal, first via its close button, then via ESC.

    Args:
        drv: WebDriver showing the page.
        timeout: Seconds to wait for each candidate close button to become clickable.

    Returns:
        True once the modal is no longer present after a click or an ESC key
        press; False if every attempt failed.
    """
    modal_locator = (By.CSS_SELECTOR, '.artdeco-modal, [role="dialog"]')
    close_button_css = (
        'button.artdeco-modal__dismiss',
        'button[aria-label*="Dismiss"]',
        'button[aria-label*="Close"]',
        'button[aria-label*="关闭"]',
        'button[aria-label*="關閉"]',
    )

    for css in close_button_css:
        try:
            button = WebDriverWait(drv, timeout).until(EC.element_to_be_clickable((By.CSS_SELECTOR, css)))
            # JavaScript click, so an overlay cannot intercept it.
            drv.execute_script("arguments[0].click();", button)
            WebDriverWait(drv, 5).until_not(EC.presence_of_element_located(modal_locator))
            return True
        except Exception:
            # This candidate did not work; try the next one.
            pass

    # Last resort: send ESC to the page body.
    try:
        drv.find_element(By.TAG_NAME, "body").send_keys(Keys.ESCAPE)
        WebDriverWait(drv, 2).until_not(EC.presence_of_element_located(modal_locator))
    except Exception:
        return False
    return True

# Credentials
# Do not keep account identifiers in the notebook: read the e-mail from the
# LI_EMAIL environment variable and fall back to an interactive prompt.
EMAIL = os.getenv("LI_EMAIL") or input("Enter your LinkedIn email: ").strip()

# Protect your password: taken from LI_PASSWORD, otherwise prompted without echo.
PASSWORD = os.getenv("LI_PASSWORD") or getpass("Enter your LinkedIn password (hidden): ")


In [5]:
# CSS selectors used to locate the job-results container and the individual job cards.
# scrape_jobs_from_page() tries CONTAINER_SELECTORS in order and keeps the first element found
# (presumably one entry per LinkedIn page layout — verify against the live site).
CONTAINER_SELECTORS = [
    "div.jobs-search-results-list",
    ".scaffold-layout__list > div",
    "div.jobs-search-two-pane__results",
    "div.jobs-search-results-list__container",
    "section.two-pane-serp-page__results-list",
]

# Candidate selectors for a single job card. wait_for_job_items() joins them into one
# comma-separated selector; scrape_jobs_from_page() tries them one by one, in order.
ITEM_SELECTORS = [
    "li[data-occludable-job-id]",
    "ul.jobs-search-results__list > li",
    "li.jobs-search-results__list-item",
    "div.jobs-search-results__list-item",
    "ul.jobs-search__results-list li",
    "div.base-card, div.job-card-container",
]


def wait_for_job_items(drv, timeout=20):
    """Block until at least one job card matching any ITEM_SELECTORS entry is present.

    Raises:
        TimeoutException: if no card appears within ``timeout`` seconds.
    """
    combined_css = ", ".join(ITEM_SELECTORS)
    card_present = EC.presence_of_element_located((By.CSS_SELECTOR, combined_css))
    WebDriverWait(drv, timeout).until(card_present)

def safe_text(el):
    """Return the element's stripped ``text``, or "" when ``el`` is falsy (e.g. None)."""
    if not el:
        return ""
    return el.text.strip()

def find_one(drv, *locator, timeout=8, visibility=True):
    """Wait for a single element and return it, or None on any failure.

    Args:
        drv: WebDriver (or element) to search from.
        *locator: The locator parts, e.g. ``By.CSS_SELECTOR, "h1"``; they are
            passed to the expected condition as one tuple.
        timeout: Seconds to wait.
        visibility: When True wait for the element to be visible, otherwise
            only for it to be present in the DOM.
    """
    if visibility:
        condition_factory = EC.visibility_of_element_located
    else:
        condition_factory = EC.presence_of_element_located

    try:
        return WebDriverWait(drv, timeout).until(condition_factory(locator))
    except Exception:
        return None

def get_title_company_location(drv):
    """Read job title, company name and location from an open job-detail page.

    Returns:
        A ``(title, company, location)`` tuple of strings; each part is "" when
        nothing suitable was found. A location that is only "United States" or
        "USA" is reported as "".
    """
    import re

    def first_nonempty_text(css_options):
        # Text of the first match of the first selector that yields non-blank text.
        for css in css_options:
            matches = drv.find_elements(By.CSS_SELECTOR, css)
            if matches:
                text = matches[0].text.strip()
                if text:
                    return text
        return ""

    title = first_nonempty_text(
        [".job-view-layout h1", "h1.jobs-unified-top-card__job-title", "h1"]
    )
    company = first_nonempty_text([
        "[data-view-name='job-details-about-company-name-link']",
        "a.jobs-unified-top-card__company-name",
        ".jobs-unified-top-card__company-name a",
        ".jobs-unified-top-card__company-name",
    ])

    # Location: keep Remote/Hybrid/On-site and a specific city/region; a bare country is dropped.
    location_css = [
        ".jobs-unified-top-card__primary-description-container li",
        ".jobs-unified-top-card__primary-description-container span",
        ".job-details-jobs-unified-top-card__primary-description-container li",
        ".job-details-jobs-unified-top-card__primary-description-container span",
        ".jobs-unified-top-card__subtitle-primary-grouping span",
        ".jobs-unified-top-card__subtitle-primary-grouping",
        ".jobs-unified-top-card__bullet",
        ".jobs-unified-top-card__workplace-type",
    ]
    candidates = [
        text
        for css in location_css
        for text in (node.text.strip() for node in drv.find_elements(By.CSS_SELECTOR, css))
        if text
    ]

    # First preference: a candidate that names the workplace type.
    workplace_words = ["remote", "hybrid", "on-site", "onsite"]
    location = next(
        (text for text in candidates if any(word in text.lower() for word in workplace_words)),
        "",
    )
    # Second preference: something shaped like "City, Region".
    if not location:
        location = next((text for text in candidates if re.search(r".+,\s+.+", text)), "")

    if location and location.strip().lower() in {"united states", "usa"}:
        location = ""

    return title, company, location


In [6]:
# Most job descriptions are partially hidden and need to be expanded by clicking "See more" or similar buttons. This function tries to find and click those buttons to reveal the full job description.
def expand_all_in(drv, root=None, max_clicks=10):
    """Click "See more"-style buttons so collapsed content becomes visible.

    The page is scanned in rounds because expanding one section can reveal
    further buttons. Each button is clicked at most once per call, and controls
    that already report ``aria-expanded="true"`` are skipped. Without these two
    guards a later round would click the same toggle again (or its "show less"
    counterpart) and collapse what was just expanded.

    Args:
        drv: WebDriver used to run the scroll/click scripts.
        root: Optional element to search inside; the whole page when None.
        max_clicks: Maximum number of scan rounds.

    Returns:
        True if at least one button was clicked, otherwise False.
    """
    selectors = [
        "button.show-more-less-html__button",
        "button[aria-label*='See more']",
        "button[aria-label*='Show more']",
        "button[aria-controls*='description']",
        "button[aria-expanded='false']",
        "button[aria-label*='显示更多']",
        "button[aria-label*='顯示更多']",
    ]
    scope = root if root is not None else drv
    already_clicked = set()  # WebElement ids handled in an earlier round/selector
    did = False
    for _ in range(max_clicks):
        clicked_this_round = False
        for sel in selectors:
            for b in scope.find_elements(By.CSS_SELECTOR, sel):
                try:
                    if b.id in already_clicked:
                        continue
                    if not b.is_displayed() or not b.is_enabled():
                        continue
                    # A control that is already expanded must not be toggled back.
                    if (b.get_attribute("aria-expanded") or "").lower() == "true":
                        continue
                    drv.execute_script("arguments[0].scrollIntoView({block:'center'});", b)
                    time.sleep(0.15)
                    drv.execute_script("arguments[0].click();", b)
                    already_clicked.add(b.id)
                    did = True
                    clicked_this_round = True
                    time.sleep(0.2)
                except Exception:
                    # Best effort: stale or detached buttons are simply skipped.
                    pass
        if not clicked_this_round:
            break
    return did

def scroll_until_all_jobs_load(drv, container, pause=0.8, max_tries=30, item_selector="li.jobs-search-results__list-item"):
    """Scroll ``container`` to the bottom until its height stops growing, then collect cards.

    Args:
        drv: WebDriver used to run the scroll scripts.
        container: Scrollable element holding the results list.
        pause: Seconds to sleep after each scroll.
        max_tries: Upper bound on the number of scroll attempts.
        item_selector: CSS selector tried first when collecting the cards.

    Returns:
        Elements matching ``item_selector``; if there are none, elements
        matching a fixed fallback selector (possibly an empty list).
    """
    height_script = "return arguments[0].scrollHeight"
    previous_height = drv.execute_script(height_script, container)
    for _ in range(max_tries):
        drv.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", container)
        time.sleep(pause)
        current_height = drv.execute_script(height_script, container)
        if current_height == previous_height:
            # Nothing new was appended by this scroll.
            break
        previous_height = current_height

    return (
        drv.find_elements(By.CSS_SELECTOR, item_selector)
        or drv.find_elements(By.CSS_SELECTOR, "ul.jobs-search-results__list > li, li[data-occludable-job-id], ul.jobs-search__results-list li")
    )


In [7]:
def _first_text(root, selectors):
    """Return the first non-blank text found by trying ``selectors`` in order.

    For each selector only its first matching element is inspected. Returns ""
    when no selector yields non-blank text.
    """
    for css in selectors:
        hits = root.find_elements(By.CSS_SELECTOR, css)
        text = hits[0].text.strip() if hits else ""
        if text:
            return text
    return ""

def _first_el(root, selectors):
    """Return the first element matched by the first selector that matches anything, else None."""
    # Lazy generator: selectors after the first successful one are never queried.
    hit_lists = (root.find_elements(By.CSS_SELECTOR, css) for css in selectors)
    return next((hits[0] for hits in hit_lists if hits), None)

def _first_attr(root, selectors, attr):
    """Return attribute ``attr`` of the element chosen by _first_el(), or "" if none matched."""
    element = _first_el(root, selectors)
    if not element:
        return ""
    return element.get_attribute(attr)


In [8]:
# Extract the jobs data from the job listings
def populate_jobs_data_minimal(jobs, nbOfJobs):
    """Open each job card's detail page in a new tab and collect its fields.

    Uses the module-level ``driver``. For every card the link is opened in a
    new tab, the description is expanded, the fields are read, and the tab is
    closed again. A card that fails is logged and skipped.

    Args:
        jobs: Job-card elements from the results list.
        nbOfJobs: Maximum number of cards to process.

    Returns:
        DataFrame with columns Title, Company, Location, Link, Description,
        one row per successfully scraped job.
    """
    job_data = {"Title": [], "Company": [], "Location": [], "Link": [], "Description": []}
    totalJobs = min(nbOfJobs, len(jobs))
    list_handle = driver.current_window_handle

    for idx, job in enumerate(jobs[:totalJobs]):
        try:
            link_el = _first_el(job, ["a.base-card__full-link", "a.job-card-container__link", "a[href*='/jobs/view/']"])
            link = (link_el.get_attribute("href") or "") if link_el else ""
            if not link:
                # Nothing to open; avoids spawning a blank tab and waiting for it to time out.
                continue

            # Identify the new tab by comparing handle sets: the order of
            # window_handles is not guaranteed, so [-1] may be the wrong window.
            known_handles = set(driver.window_handles)
            driver.execute_script("window.open(arguments[0], '_blank');", link)
            WebDriverWait(driver, 10).until(
                lambda d: any(h not in known_handles for h in d.window_handles)
            )
            detail_handle = next(h for h in driver.window_handles if h not in known_handles)
            driver.switch_to.window(detail_handle)

            WebDriverWait(driver, 12).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".jobs-unified-top-card, .job-view-layout, h1")))
            expand_all_in(driver, None, max_clicks=8)

            title, company, location = get_title_company_location(driver)

            desc_root = None
            for sel in ["#job-details", ".jobs-description__content", ".jobs-box__html-content", ".show-more-less-html__markup"]:
                found = driver.find_elements(By.CSS_SELECTOR, sel)
                if found:
                    desc_root = found[0]
                    break
            if desc_root:
                expand_all_in(driver, desc_root, max_clicks=5)

            description = desc_root.text.strip() if desc_root else ""

            job_data["Company"].append(company)
            job_data["Title"].append(title)
            job_data["Location"].append(location)
            job_data["Link"].append(link)
            job_data["Description"].append(description)

        except Exception as e:
            logging.warning(f"[details-minimal] job {idx} skipped: {e}")
        finally:
            # Close every window except the results list. Closing "the current
            # window" is unsafe: if the switch to the new tab never happened,
            # the current window is still the list itself.
            extra_handles = [h for h in driver.window_handles if h != list_handle]
            for handle in extra_handles:
                driver.switch_to.window(handle)
                driver.close()
            if extra_handles:
                driver.switch_to.window(list_handle)
                if login_wall_present(driver):
                    dismiss_login_wall(driver)

    return pd.DataFrame(job_data)


# read data from the job cards in the listing page, without visiting each job's detail page
def extract_from_list_cards_min(max_items: int = 25) -> pd.DataFrame:
    """Read title, company, location and link straight from the result cards.

    Uses the module-level ``driver`` and does not open any detail page, so the
    Description column is always "".

    Args:
        max_items: Maximum number of cards to read.

    Returns:
        DataFrame with columns Title, Company, Location, Link, Description.
        Cards with no title, company or link are left out.
    """
    card_css_fallbacks = (
        "ul.jobs-search__results-list li",
        "ul.jobs-search-results__list li",
        "div.base-card, div.job-card-container",
    )
    cards = []
    for css in card_css_fallbacks:
        cards = driver.find_elements(By.CSS_SELECTOR, css)
        if cards:
            break

    columns = ("Title", "Company", "Location", "Link", "Description")
    data = {name: [] for name in columns}

    for card in cards[:max_items]:
        try:
            title = _first_text(card, [".base-search-card__title", ".job-card-list__title", "h3"])
            company = _first_text(card, [
                ".base-search-card__subtitle a", ".base-search-card__subtitle",
                ".job-card-container__company-name", ".job-search-card__subtitle a",
                ".job-search-card__subtitle"
            ])
            location = _first_text(card, [
                ".job-search-card__location", "span.job-search-card__location",
                "li.job-card-container__metadata-item"
            ])
            link = _first_attr(card, ["a.base-card__full-link", "a.job-card-container__link", "a[href*='/jobs/view/']"], "href")
        except Exception:
            # A card that cannot be read is skipped entirely.
            continue

        if not (title or company or link):
            continue
        for name, value in zip(columns, (title, company, location, link, "")):
            data[name].append(value)

    return pd.DataFrame(data)


In [9]:
# Scrape jobs from the current page
def scrape_jobs_from_page(nb_to_scrape: int) -> pd.DataFrame:
    """Scrape up to ``nb_to_scrape`` jobs from the results page currently shown.

    Uses the module-level ``driver``. Detail pages are visited first
    (populate_jobs_data_minimal); if that yields nothing, the data is read from
    the list cards instead (extract_from_list_cards_min).

    Args:
        nb_to_scrape: Maximum number of jobs to return from this page.

    Returns:
        DataFrame of scraped jobs; an empty DataFrame when a login wall cannot
        be dismissed, no cards appear within 20 s, or nothing could be read.
    """
    if login_wall_present(driver):
        dismissed = dismiss_login_wall(driver)
        if not dismissed:
            print("Login wall present; cannot dismiss. Try logging in.")
            return pd.DataFrame()

    try:
        wait_for_job_items(driver, timeout=20)
    except TimeoutException:
        print("⚠️ No job cards visible (timeout).")
        return pd.DataFrame()

    # First container selector that matches wins.
    container = None
    for sel in CONTAINER_SELECTORS:
        try:
            container = driver.find_element(By.CSS_SELECTOR, sel)
            break
        except Exception:
            continue

    # Scroll the window, then the list container, to trigger loading of more cards.
    for _ in range(2):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.4)
    if container:
        for _ in range(2):
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", container)
            time.sleep(0.4)

    items = []
    for sel in ITEM_SELECTORS:
        found = driver.find_elements(By.CSS_SELECTOR, sel)
        if found:
            items = found
            break

    if container:
        try:
            scrolled_items = scroll_until_all_jobs_load(driver, container)
        except Exception as e:
            logging.warning(
                f"Scrolling the results list failed ({e.__class__.__name__}); using the cards already found."
            )
        else:
            # scroll_until_all_jobs_load() knows fewer selectors than ITEM_SELECTORS,
            # so an empty result must not discard the cards found above.
            if scrolled_items:
                items = scrolled_items

    if items:
        df = populate_jobs_data_minimal(items, nb_to_scrape)
        if not df.empty:
            return df

    df2 = extract_from_list_cards_min(max_items=nb_to_scrape)
    if not df2.empty:
        return df2

    return pd.DataFrame()


# In LinkedIn, one page contains 25 jobs by default. This function tries to navigate to the next page by clicking the "Next" button or a specific page number button.
def move_to_next_page(target_page_num: Optional[int] = None, timeout: int = 12) -> bool:
    """Advance the results list by clicking a page-number button or the "Next" button.

    Uses the module-level ``driver``.

    Args:
        target_page_num: Page number whose button should be clicked; when None,
            or when no such button becomes clickable, the "Next" button is used.
        timeout: Seconds for each explicit wait.

    Returns:
        True after a button was clicked and a results list is present; False
        when no usable "Next" button exists or a timeout / stale-element error
        occurred.
    """
    def scroll_and_click(button):
        # Centre the button, click it, and fall back to a JavaScript click when
        # the native click is intercepted or the reference went stale.
        driver.execute_script('arguments[0].scrollIntoView({block:"center"});', button)
        time.sleep(0.2)
        try:
            button.click()
        except (ElementClickInterceptedException, StaleElementReferenceException):
            driver.execute_script("arguments[0].click();", button)

    try:
        wait = WebDriverWait(driver, timeout)
        # The pagination controls are at the bottom of the page.
        for _ in range(2):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.3)

        clicked = False
        if target_page_num is not None:
            page_button_xpaths = (
                f'//button[normalize-space()="{target_page_num}"]',
                f'//button[.//span[normalize-space()="{target_page_num}"]]',
            )
            for xpath in page_button_xpaths:
                try:
                    page_button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
                    scroll_and_click(page_button)
                except TimeoutException:
                    continue
                clicked = True
                break

        if not clicked:
            candidates = (
                driver.find_elements(By.CSS_SELECTOR, 'button[aria-label*="Next"], button[aria-label*="next"]')
                or driver.find_elements(By.XPATH, '//button[normalize-space()="Next" or .//span[normalize-space()="Next"]]')
            )
            usable = [b for b in candidates if b.is_displayed() and b.is_enabled()]
            if not usable:
                return False
            scroll_and_click(usable[0])

        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "ul.jobs-search-results__list, .scaffold-layout__list, ul.jobs-search__results-list")))
        time.sleep(0.4)
        if login_wall_present(driver):
            dismiss_login_wall(driver)
        return True
    except (TimeoutException, StaleElementReferenceException):
        return False


# Default number of jobs per results page assumed by scrape_linkedin_jobs().
JOBS_PER_PAGE = 25


def build_job_search_url(
    keywords: str = "Computer Science",
    location: str = "United States",
    experience: ExperienceFilter = ExperienceFilter.ALL,
    posted: JobPostTime = JobPostTime.ANY_TIME
) -> str:
    """Build the LinkedIn job-search URL for the given filters.

    Args:
        keywords: Search terms; URL-encoded. Omitted from the URL when empty.
        location: Location text; URL-encoded. Omitted from the URL when empty.
        experience: Experience filter; its value becomes ``f_E`` unless empty.
        posted: Posting-time filter; its value becomes ``f_TPR`` unless empty.

    Returns:
        The search URL, always ending with ``position=1&pageNum=0``.
    """
    base = "https://www.linkedin.com/jobs/search"
    # (query name, raw value, whether the value needs URL-encoding)
    optional_fields = (
        ("keywords", keywords, True),
        ("location", location, True),
        ("f_E", experience.value, False),
        ("f_TPR", posted.value, False),
    )
    query_parts = []
    for name, raw, needs_encoding in optional_fields:
        if not raw:
            continue
        value = quote_plus(raw) if needs_encoding else raw
        query_parts.append(f"{name}={value}")
    query_parts += ["position=1", "pageNum=0"]
    return base + "?" + "&".join(query_parts)


# Main function to scrape LinkedIn jobs, it could get the specified number of job postings by navigating through multiple pages if necessary.
def scrape_linkedin_jobs(nb_jobs: int, url: str, jobs_per_page: Optional[int] = None, max_total: int = 1000) -> pd.DataFrame:
    """Scrape job postings from a LinkedIn search, paging through the results.

    Uses and updates the module-level ``driver``. If a login wall cannot be
    dismissed, a login with EMAIL / PASSWORD is attempted.

    Args:
        nb_jobs: Number of jobs wanted.
        url: Search URL to start from (see build_job_search_url()).
        jobs_per_page: Jobs expected per results page; defaults to JOBS_PER_PAGE.
        max_total: Hard upper bound on the number of rows returned.

    Returns:
        DataFrame of scraped jobs with at most ``min(nb_jobs, max_total)`` rows.
        Rows sharing the same non-empty Link are de-duplicated.

    Raises:
        TimeoutException: if a login wall blocks the page and no credentials
            are configured, or the page body never appears.
    """
    global driver
    if jobs_per_page is None:
        jobs_per_page = JOBS_PER_PAGE
    # One target for every stop condition, so max_total really is a hard cap.
    target_total = min(nb_jobs, max_total)

    driver = ensure_window(driver)
    driver = safe_get(driver, url)

    _maybe_accept_cookies_and_check_login(driver)
    if login_wall_present(driver):
        dismissed = dismiss_login_wall(driver)
        if not dismissed:
            if EMAIL and PASSWORD:
                try_login(driver, EMAIL, PASSWORD)
                driver = safe_get(driver, url)
                _maybe_accept_cookies_and_check_login(driver)
                if login_wall_present(driver):
                    dismiss_login_wall(driver)
            else:
                raise TimeoutException("Blocked by login wall and no credentials provided.")

    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
    try:
        wait_for_job_items(driver, timeout=20)
    except TimeoutException:
        # scrape_jobs_from_page() waits again and reports the problem itself.
        pass

    total_pages_to_scrape = ceil(target_total / jobs_per_page)
    current_page = 1
    total_df = pd.DataFrame()

    while True:
        remaining = target_total - len(total_df)
        if remaining <= 0:
            break
        nb_this_page = min(remaining, jobs_per_page)
        print(f"Scraping page {current_page} (need {nb_this_page} jobs from this page)")
        try:
            page_df = scrape_jobs_from_page(nb_this_page)
        except TimeoutException:
            # One retry after a refresh; any further failure yields an empty page.
            try:
                driver.refresh()
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
                page_df = scrape_jobs_from_page(nb_this_page)
            except Exception:
                page_df = pd.DataFrame()
        if not page_df.empty:
            total_df = pd.concat([total_df, page_df], ignore_index=True)
            if "Link" in total_df.columns:
                # De-duplicate on Link, but keep every row whose link is missing:
                # those are different jobs that merely share an empty key.
                has_link = total_df["Link"].fillna("").astype(str).ne("")
                repeated = has_link & total_df["Link"].duplicated(keep="first")
                total_df = total_df[~repeated].reset_index(drop=True)
        if current_page >= total_pages_to_scrape or len(total_df) >= target_total:
            break
        moved = move_to_next_page(current_page + 1)
        if not moved:
            print("No further pages found; stopping.")
            break
        current_page += 1
        time.sleep(0.9)

    total_df.reset_index(drop=True, inplace=True)
    return total_df


In [10]:
# Maps a lower-cased section heading to the bucket its following lines belong to:
# "resp" (responsibilities), "req" (required qualifications),
# "pref" (preferred qualifications), "genreq" (general requirements).
_HEADING_MAP = {
    "responsibilities": "resp",
    "what you will do": "resp",
    "what you'll do": "resp",
    "what you do": "resp",
    "duties": "resp",
    "key duties": "resp",
    "role & responsibilities": "resp",
    "role and responsibilities": "resp",
    "requirements": "genreq",
    "must have": "genreq",
    "you have": "genreq",
    "required qualifications": "req",
    "minimum qualifications": "req",
    "basic qualifications": "req",
    "qualifications": "req",
    "preferred qualifications": "pref",
    "preferred": "pref",
    "nice to have": "pref",
    "bonus": "pref",
    "plus": "pref",
}
# Matches a line that starts with one of the headings. Alternatives are ordered
# longest first so "preferred qualifications" wins over "preferred", and group 1
# captures the heading that actually matched.
_heading_regex = re.compile(
    r"^\s*(" +
    r"|".join(re.escape(k) for k in sorted(_HEADING_MAP, key=len, reverse=True)) +
    r")\s*:?.*$",
    re.IGNORECASE
)

def _normalize_bullets(text: str) -> str:
    """Normalise line endings and bullet markers to one ``- item`` per line.

    Bullet glyphs (• · ▪ ● ◦) start a new ``- `` line wherever they occur.
    Dashes (- – —) count as bullets only at the start of a line, so hyphens
    inside words or ranges ("full-time", "3-5 years") are left alone.

    Args:
        text: Raw description text; None is treated as "".

    Returns:
        The normalised text without blank lines or surrounding whitespace.
    """
    t = re.sub(r"\r\n?", "\n", text or "")
    t = re.sub(r"[•·▪●◦]\s*", "\n- ", t)
    t = re.sub(r"(?m)^[ \t]*[–—\-]\s*", "- ", t)
    t = re.sub(r"\n{2,}", "\n", t)
    return t.strip()

def parse_description_sections(text: str):
    """Split a job description into responsibility / qualification buckets.

    A line that starts with a known heading (see _HEADING_MAP) switches the
    current bucket; every following non-empty line is added to that bucket
    (at most 60 items per bucket). Lines before the first heading are ignored.
    If no bucket received anything, up to 30 bullet lines are returned as "req".

    Args:
        text: Raw description text.

    Returns:
        Dict with the keys "resp", "req", "pref" and "genreq", each a list of strings.
    """
    text = _normalize_bullets(text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    buckets = {"resp": [], "req": [], "pref": [], "genreq": []}
    current = None
    for ln in lines:
        # Headings are often typed with a curly apostrophe ("What you’ll do").
        heading = _heading_regex.match(ln.lower().replace("’", "'"))
        if heading:
            # Use the heading the line starts with. A substring scan over the map
            # would file "preferred qualifications" under "qualifications" (req).
            current = _HEADING_MAP.get(heading.group(1), current)
            continue
        item = ln.lstrip("- ").strip()
        if not item:
            continue
        if current is not None and len(buckets[current]) < 60:
            buckets[current].append(item)
    if not any(buckets.values()):
        bullets = [ln.lstrip("- ").strip() for ln in lines if ln.startswith("- ")]
        buckets["req"] = bullets[:30]
    return buckets

# Apply the parsing function to each job description in the DataFrame and create new columns for each section.
def split_description_sections(df: pd.DataFrame) -> pd.DataFrame:
    """Add the parsed description sections to a copy of ``df``.

    Each row's "Description" is run through parse_description_sections() and
    the resulting lists are stored in four new columns: Responsibilities,
    QualificationsRequired, QualificationsPreferred and Requirements.

    Args:
        df: DataFrame with a "Description" column (missing values count as "").

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    out = df.copy()
    parsed = [parse_description_sections(desc) for desc in out["Description"].fillna("")]

    column_to_bucket = {
        "Responsibilities": "resp",
        "QualificationsRequired": "req",
        "QualificationsPreferred": "pref",
        "Requirements": "genreq",
    }
    for column, bucket in column_to_bucket.items():
        # Build each column in one go, aligned on out.index. Writing cells with
        # .at[i, ...] would treat the positional counter as an index label and
        # go wrong on any frame whose index is not 0..n-1.
        out[column] = pd.Series([sections[bucket] for sections in parsed], index=out.index, dtype=object)
    return out

# A convenience function that combines building the search URL, scraping the jobs, and parsing the descriptions into a single DataFrame.
def scrape_and_parse_linkedin_jobs(nb_jobs: int,
                                   keywords: str = "Computer Science",
                                   location: str = "United States",
                                   experience: ExperienceFilter = ExperienceFilter.ALL,
                                   posted: JobPostTime = JobPostTime.ANY_TIME) -> pd.DataFrame:
    """Build the search URL, scrape the jobs, and parse their descriptions.

    Args:
        nb_jobs: Number of jobs wanted.
        keywords, location, experience, posted: Search filters, passed to
            build_job_search_url().

    Returns:
        One DataFrame with Title, Company, Location, Link, Description and the
        parsed section columns first, followed by any other columns. When
        nothing was scraped, the empty result is returned as is.
    """
    search_url = build_job_search_url(keywords=keywords, location=location, experience=experience, posted=posted)
    scraped = scrape_linkedin_jobs(nb_jobs=nb_jobs, url=search_url)
    if scraped.empty:
        return scraped

    parsed = split_description_sections(scraped)
    # Put the well-known columns first, keep everything else in its existing order.
    preferred_order = ["Title","Company","Location","Link","Description",
                       "Responsibilities","QualificationsRequired","QualificationsPreferred","Requirements"]
    leading = [name for name in preferred_order if name in parsed.columns]
    trailing = [name for name in parsed.columns if name not in preferred_order]
    return parsed[leading + trailing]


In [11]:

# --- Step 1: Augment each scraped job with Salary, EmploymentType, and AboutTheCompany (Applicants removed) ---
import re, time
from pathlib import Path
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, ElementClickInterceptedException

_META_JOBTYPE_PATTERNS = [
    r"full[\s\-]?time", r"part[\s\-]?time", r"contract(?!or)", r"contractor", r"temporary",
    r"intern(ship)?", r"freelance", r"seasonal"
]

def _all_texts(drv, css_list=None, xp_list=None):
    """Collect the non-blank texts of all elements matched by the given selectors.

    Args:
        drv: WebDriver (or element) to search from.
        css_list: CSS selectors, processed first, in order.
        xp_list: XPath expressions, processed after the CSS selectors, in order.

    Returns:
        List of stripped, non-empty element texts in lookup order.
    """
    locators = [(By.CSS_SELECTOR, css) for css in (css_list or [])]
    locators += [(By.XPATH, xpath) for xpath in (xp_list or [])]

    collected = []
    for strategy, expression in locators:
        for node in drv.find_elements(strategy, expression):
            text = (node.text or "").strip()
            if text:
                collected.append(text)
    return collected

def _click_show_more_if_present(drv):
    """Best-effort click on "show more" / "see more" style buttons.

    For each XPath variant at most the first four matching buttons are clicked.
    Every failure is ignored; nothing is returned.
    """
    show_more_xpaths = (
        '//button[contains(@aria-label, "Show more")]',
        '//button[.//span[contains(translate(., "SHOWMORE", "showmore"), "show more")]]',
        '//button[contains(translate(., "SHOW", "show"), "show more")]',
        '//button[contains(translate(., "SEE", "see"), "see more")]',
        '//button[contains(@class, "show-more-less-html__button")]',
        '//button[contains(@class, "artdeco-button") and .//span[contains(translate(., "MORE", "more"), "more")]]',
    )

    def press(button):
        # Native click first; JavaScript click when it is intercepted or stale.
        drv.execute_script('arguments[0].scrollIntoView({block:"center"});', button)
        time.sleep(0.1)
        try:
            button.click()
        except (ElementClickInterceptedException, StaleElementReferenceException):
            drv.execute_script("arguments[0].click();", button)

    for xpath in show_more_xpaths:
        try:
            buttons = drv.find_elements(By.XPATH, xpath)[:4]
        except Exception:
            continue
        for button in buttons:
            try:
                press(button)
            except Exception:
                pass

def _page_text(drv) -> str:
    try:
        return drv.execute_script("return document.body.innerText || '';") or ""
    except Exception:
        try:
            return drv.find_element(By.TAG_NAME, "body").text
        except Exception:
            return ""

def _extract_employment_type(drv, txt=None):
    """Find an employment-type keyword on the job page.

    The texts of several job-detail containers are searched first; ``txt``
    (e.g. the full page text) is searched only if they yield nothing. Patterns
    are tried in _META_JOBTYPE_PATTERNS order.

    Returns:
        The matched keyword with hyphens replaced by spaces, title-cased
        (e.g. "Full Time"), or "" when nothing matched.
    """
    container_css = [
        ".job-details__content",
        ".jobs-description__container",
        ".jobs-unified-top-card__primary-description-container",
        ".jobs-unified-top-card__subtitle-primary-grouping",
        "section.jobs-box",
    ]
    page_blob = " | ".join(_all_texts(drv, css_list=container_css)).lower()

    haystacks = [page_blob]
    if txt:
        haystacks.append(txt)

    for haystack in haystacks:
        for pattern in _META_JOBTYPE_PATTERNS:
            hit = re.search(pattern, haystack, flags=re.I)
            if hit:
                return hit.group(0).replace("-", " ").title()
    return ""

_SAL_PATTERNS = [
    re.compile(r"(\$|£|€)\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:[KkMm])?\s*(?:[-–—]\s*(?:\$|£|€)?\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*(?:[KkMm])?)?\s*(?:/|per\s+)?(year|yr|month|mo|hour|hr|week|wk|day|annum)\b", re.I),
    re.compile(r"(?:salary|pay|compensation)\s*[:\-]?\s*(\$|£|€)\s?\d{1,3}(?:,\d{3})*(?:\.\d+)?(?:\s*[Kk])?(?:\s*[-–—]\s*(?:\$|£|€)?\s?\d{1,3}(?:,\d{3})*)?", re.I),
    re.compile(r"\$?\s?\d{2,3}\s?(?:-\s?\$?\s?\d{2,3})\s?(?:per\s+hour|/hour|hr)\b", re.I),
]

def _extract_salary_from_text(txt: str) -> str:
    for pat in _SAL_PATTERNS:
        m = pat.search(txt)
        if m:
            return m.group(0).strip()
    return ""

def _extract_salary(drv, txt=None):
    """Find a salary snippet on the job page.

    The texts of several job-detail containers are searched block by block;
    ``txt`` (e.g. the full page text) is used only if none of them contains a
    salary.

    Returns:
        The first salary snippet found, or "".
    """
    salary_css = [
        ".job-details__content", ".jobs-description__container",
        ".jobs-unified-top-card__job-insight", ".jobs-box__html-content",
        ".jobs-box--expandable-content", ".jobs-salary", ".jobs-pay"
    ]
    for block_text in _all_texts(drv, css_list=salary_css):
        found = _extract_salary_from_text(block_text)
        if found:
            return found
    return _extract_salary_from_text(txt) if txt else ""


def _clean_about(text: str, max_len=1500) -> str:
    t = re.sub(r"\n{2,}", "\n", text or "").strip()
    # soft-trim boilerplate tails if they appear
    t = re.split(r"(?:Equal Opportunity Employer|EEO|Pay Transparency|Benefits include)[:\s]", t, maxsplit=1)[0].strip()
    return t[:max_len]


def _extract_about_company_section(drv, txt=None, max_len=1500) -> str:
    """
    Best-effort extractor for the 'About the company' section on LinkedIn job pages.

    Strategy (tried in order; the first one that produces text returns):
      1) CSS container hunt for known company blocks (jobs-company / about IDs).
      2) XPath header match on H2/H3 ('About the company' / 'About company' /
         'About us' / 'Company overview'), then collect the text of p/li/div/span
         descendants of the header's ancestor <section>.
      3) JavaScript sibling-walk from the last matching H2/H3 heading.
      4) Whole-page text fallback starting at the header text found in `txt`.

    Args:
        drv: Browser driver; only `find_elements` and `execute_script` are used here.
        txt: Optional full page text, used only by the final fallback.
        max_len: Maximum number of characters in the returned text.

    Returns:
        The cleaned section text, or "" when no strategy finds anything.
        Exceptions raised by the driver in steps 1-3 are swallowed.
    """
    # Local cleaner; shadows the module-level `_clean_about` inside this function
    # and caps the result at this call's `max_len`.
    def _clean_about(text: str) -> str:
        t = re.sub(r"\n{2,}", "\n", text or "").strip()
        # Remove obvious boilerplate tails
        t = re.split(r"(?:Interested in working|Members who share|Learn more\b|I['’]m interested|Equal Opportunity Employer|EEO|Pay Transparency)[:\s]", t, maxsplit=1)[0].strip()
        return t[:max_len]

    # 1) Known containers: try to read the block even if heading markup changes
    container_css = [
        "section.jobs-company", "div.jobs-company", "section.jobs-company__card",
        "div.jobs-company__box", "section.jobs-box--company",
        "section[data-test-id*='about']", "div[data-test-id*='about']",
        "section#about-company", "div#about-company",
    ]
    for sel in container_css:
        try:
            blocks = drv.find_elements(By.CSS_SELECTOR, sel)
            for b in blocks[::-1]:  # prefer later blocks
                txt_block = (b.text or "").strip()
                if not txt_block:
                    continue
                # Only accept if it actually contains a clue word.
                # NOTE(review): neither search uses re.M/re.S, so 'about ... company' must sit on
                # one line and `^follow$` only matches when the whole block text is "follow".
                if re.search(r"\babout\b.*\bcompany\b", txt_block, re.I) or re.search(r"\bfollowers\b|^follow$", txt_block, re.I):
                    # Strip the header portion if present
                    txt_block = re.sub(r"^.*?(about\s+the\s+company|about\s+company|about\s+us|company\s+overview)\s*:?[\n]+", "", txt_block, flags=re.I|re.S)
                    cleaned = _clean_about(txt_block)
                    # Require more than five words so bare labels are not returned.
                    if len(cleaned.split()) > 5:
                        return cleaned
        except Exception:
            # Best effort: a failing selector just moves on to the next one.
            pass

    # 2) XPath header hunt + text collection in Python.
    # translate() lower-cases only the letters that occur in each phrase, which is
    # enough for a case-insensitive contains() on that phrase.
    header_xps = [
        '//h2[contains(translate(., "ABOUT THE COMPANY", "about the company"), "about the company")]',
        '//h2[contains(translate(., "ABOUT COMPANY", "about company"), "about company")]',
        '//h2[contains(translate(., "ABOUT US", "about us"), "about us")]',
        '//h2[contains(translate(., "COMPANY OVERVIEW", "company overview"), "company overview")]',
        '//h3[contains(translate(., "ABOUT THE COMPANY", "about the company"), "about the company")]',
        '//h3[contains(translate(., "ABOUT COMPANY", "about company"), "about company")]',
        '//h3[contains(translate(., "ABOUT US", "about us"), "about us")]',
        '//h3[contains(translate(., "COMPANY OVERVIEW", "company overview"), "company overview")]',
    ]
    def _sibling_collect_from(el):
        # Despite the name, this does not walk siblings: it gathers the text of every
        # p/li/div/span descendant of a <section> ancestor of `el` that has more than
        # four words, and joins them with spaces. Returns "" if there is no such
        # ancestor or nothing qualifies.
        # NOTE(review): nested matches (e.g. a div wrapping a p) may contribute the
        # same text more than once.
        para = []
        node = el  # NOTE(review): never read afterwards
        # Locate the enclosing section, if any
        try:
            sec = el.find_element(By.XPATH, "ancestor::section")
        except Exception:
            sec = None
        if sec:
            kids = sec.find_elements(By.XPATH, ".//*[self::p or self::li or self::div or self::span]")
            for k in kids:
                try:
                    t = (k.text or "").strip()
                    if t and len(t.split()) > 4:
                        para.append(t)
                except Exception:
                    # Skip elements whose text cannot be read.
                    continue
        if para:
            return " ".join(para).strip()
        return ""

    for xp in header_xps:
        try:
            hdrs = drv.find_elements(By.XPATH, xp)
            if hdrs:
                # Use the last matching header on the page.
                text = _sibling_collect_from(hdrs[-1])
                if text:
                    return _clean_about(text)
        except Exception:
            continue

    # 3) JavaScript sibling-walk from the last matching H2/H3.
    # The script clicks any button inside the header's section whose text contains
    # 'more', then collects the text of the header's following siblings until the
    # next h2/h3. If fewer than 60 characters were collected it returns the section's
    # innerText with everything up to and including the header phrase removed.
    js = r"""
    const hdrs = Array.from(document.querySelectorAll('h2, h3')).filter(h => /about (the )?company|about us|company overview/i.test(h.innerText.trim()));
    if (!hdrs.length) return '';
    const h = hdrs[hdrs.length-1];
    const section = h.closest('section') || h.parentElement;
    if (section){
      Array.from(section.querySelectorAll('button')).forEach(b => {
        const t = (b.innerText || '').toLowerCase();
        if (t.includes('show more') || t.includes('see more') || t.includes('more')) { try{ b.click(); }catch(e){} }
      });
    }
    let texts = [];
    let node = h.nextSibling;
    const isHeaderEl = el => el && el.matches && el.matches('h2,h3');
    while(node){
      if (node.nodeType === Node.ELEMENT_NODE){
        const el = node;
        if (isHeaderEl(el)) break;
        const t = (el.innerText || '').trim();
        if (t) texts.push(t);
      } else if (node.nodeType === Node.TEXT_NODE){
        const t = (node.textContent || '').trim();
        if (t) texts.push(t);
      }
      node = node.nextSibling;
    }
    if (texts.join(' ').trim().length < 60 && section){
      let whole = (section.innerText || '').trim();
      whole = whole.replace(/^[\s\S]*?(about (the )?company|about us|company overview)\s*:?\s*/i,'');
      return whole;
    }
    return texts.join('\n').trim();
    """
    try:
        seg = drv.execute_script(js)
        # Same "more than five words" threshold as step 1.
        if seg and len(seg.split()) > 5:
            return _clean_about(seg)
    except Exception:
        pass

    # 4) Whole-page text fallback
    if txt:
        low = txt.lower()
        for key in ["about the company", "about company", "about us", "company overview"]:
            pos = low.find(key)
            if pos != -1:
                # Take a window of twice max_len starting at the header phrase.
                seg = txt[pos: pos + max_len*2]
                lines = [ln.strip() for ln in seg.splitlines() if ln.strip()]
                stop_tokens = ["about the team", "job details", "responsibilities", "qualifications", "benefits", "pay", "salary"]
                # Cut at the first line (after the first three) that contains a stop token.
                for j, ln in enumerate(lines):
                    if any(tok in ln.lower() for tok in stop_tokens) and j > 2:
                        lines = lines[:j]; break
                return _clean_about(" ".join(lines))
    return ""



def _robust_scroll(drv):
    try:
        drv.execute_script("window.scrollTo(0, 0);"); time.sleep(0.1)
        h_last = 0
        for _ in range(12):
            drv.execute_script("window.scrollBy(0, window.innerHeight*0.85);"); time.sleep(0.2)
            h = drv.execute_script("return document.documentElement.scrollTop || document.body.scrollTop || 0;")
            if h == h_last: break
            h_last = h
    except Exception:
        pass


# Regexes for workplace wording, tried in this order.
_WORKPLACE_PATTERNS = [
    r"\bremote\b", r"\bhybrid\b", r"\bon[\-\s]?site\b", r"\bonsite\b"
]

def _extract_workplace_type(drv, txt=None):
    """Classify a job as "Remote", "Hybrid" or "On-site".

    The top-card texts returned by ``_all_texts`` are searched first; the
    optional page text ``txt`` is consulted only when they give no answer.
    Patterns are tried in ``_WORKPLACE_PATTERNS`` order, so "remote" wins over
    "hybrid" when both occur. Returns "" when nothing matches.
    """
    def _to_label(fragment: str) -> str:
        # Map a matched fragment onto its canonical label.
        lowered = fragment.lower()
        if "remote" in lowered:
            return "Remote"
        if "hybrid" in lowered:
            return "Hybrid"
        if any(form in lowered for form in ("on-site", "onsite", "on site")):
            return "On-site"
        return ""

    def _first_label(haystack: str) -> str:
        # First pattern (in list order) that matches and maps to a label.
        for pattern in _WORKPLACE_PATTERNS:
            hit = re.search(pattern, haystack, flags=re.I)
            if hit:
                label = _to_label(hit.group(0))
                if label:
                    return label
        return ""

    # Prefer the explicit workplace-type badge/label in the top card.
    top_card_texts = _all_texts(drv,
        css_list=[
            ".jobs-unified-top-card__workplace-type",
            ".jobs-unified-top-card__primary-description-container li",
            ".jobs-unified-top-card__primary-description-container span",
            ".jobs-unified-top-card__subtitle-primary-grouping span",
            ".jobs-unified-top-card__subtitle-primary-grouping",
        ])
    from_card = _first_label(" | ".join(top_card_texts))
    if from_card:
        return from_card
    if txt:
        return _first_label(txt)
    return ""


def augment_with_job_meta(df, per_job_delay=0.6, limit=None, dump_debug=False):
    """Visit each job link and fill in salary, employment type, company blurb and workplace type.

    Args:
        df: Table of scraped jobs; must contain a "Link" column to be processed.
        per_job_delay: Seconds to pause after each visited job. Values below 0.8
            are raised to 0.8.
        limit: Process only the first ``limit`` rows (``None`` = all rows).
        dump_debug: Accepted for backward compatibility; currently unused.

    Returns:
        ``df`` itself when it is empty or has no "Link" column; otherwise a copy
        with "Salary", "EmploymentType", "AboutTheCompany" and "WorkplaceType"
        columns added where missing (default "") and any "Applicants" column
        removed. Rows whose link is not a non-empty string are left untouched.
        A failure on one row is printed and does not stop the loop.
    """
    global driver
    if df.empty or "Link" not in df.columns:
        return df

    out = df.copy()
    # Ensure target columns exist
    meta_cols = ("Salary", "EmploymentType", "AboutTheCompany", "WorkplaceType")
    for col in meta_cols:
        if col not in out.columns:
            out[col] = ""
    # Drop 'Applicants' if present
    out = out.drop(columns=["Applicants"], errors="ignore")

    row_count = len(out) if limit is None else max(0, min(limit, len(out)))
    # Address cells by position so the frame does not need a default 0..n-1 index.
    col_at = {name: out.columns.get_loc(name) for name in ("Link",) + meta_cols}

    for pos in range(row_count):
        link = out.iat[pos, col_at["Link"]]
        # Type check first so NaN / pd.NA never reach a truthiness test.
        if not isinstance(link, str) or not link:
            continue
        try:
            driver = ensure_window(driver)
            driver = safe_get(driver, link)
            try:
                # Build the wait from the current driver: ensure_window/safe_get may
                # have replaced it, and a wait bound to the old one would fail.
                WebDriverWait(driver, 15).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".jobs-unified-top-card"))
                )
            except TimeoutException:
                # Top card never appeared; still try to extract from what loaded.
                pass

            _click_show_more_if_present(driver)
            _robust_scroll(driver)
            page_text = _page_text(driver)

            extracted = (
                ("EmploymentType", _extract_employment_type(driver, page_text)),
                ("Salary", _extract_salary(driver, page_text)),
                ("AboutTheCompany", _extract_about_company_section(driver, page_text)),
                ("WorkplaceType", _extract_workplace_type(driver, page_text)),
            )
            # Only overwrite a cell when something was actually found.
            for name, value in extracted:
                if value:
                    out.iat[pos, col_at[name]] = value

        except Exception as e:
            print(f"⚠️ Meta extraction failed for row {pos}: {e}")
        finally:
            time.sleep(max(per_job_delay, 0.8))

    return out


In [None]:
# Make sure a browser window is available, then load the LinkedIn jobs page through safe_get.
driver = ensure_window(driver)
driver = safe_get(driver, "https://www.linkedin.com/jobs/")

# Run the search; tune the arguments below as needed.
final_df = scrape_and_parse_linkedin_jobs(
    nb_jobs=40000,                      # number of jobs to scrape
    keywords="Computer Science",        # search keywords
    location="United States",           # search location
    experience=ExperienceFilter.ALL,    # experience-level filter
    posted=JobPostTime.ANY_TIME,        # posting-time filter
)

Scraping page 1 (need 10 jobs from this page)


In [13]:
# Visit every job link (limit=None processes all rows) and add the Salary,
# EmploymentType, AboutTheCompany and WorkplaceType columns.
final_df = augment_with_job_meta(final_df, per_job_delay=0.8, limit=None, dump_debug=False)

In [14]:
import re
from pandas.tseries.offsets import DateOffset

# The scraped "Location" cell mixes "<place> · <posted time> · <applicants>".
# Split it into a clean Location plus separate Applicants and Posted_text columns.
_APPLICANTS_RE = re.compile(
    r'(?i)(?:over\s+)?\d[\d,]*\s+(?:people\s+(?:clicked\s+apply|appl(?:y|ied))|applicants?)'
)
# Whole-word match: a plain substring test for "ago" would also hit place names
# such as "Chicago" or "Santiago" and return the location as the posting time.
_POSTED_RE = re.compile(r"\bago\b|\bjust now\b", re.I)


def _first_applicants(fragments):
    """Return the first applicant-count phrase among ``fragments``, or None."""
    if not isinstance(fragments, list):  # e.g. NaN left by a non-string Location
        return None
    for frag in fragments:
        hit = _APPLICANTS_RE.search(frag.strip())
        if hit:
            return hit.group(0).strip()
    return None


def _first_posted(fragments):
    """Return the first fragment that reads like a relative posting time, or ""."""
    if not isinstance(fragments, list):
        return ""
    for frag in fragments:
        if _POSTED_RE.search(frag):
            return frag.strip()
    return ""


parts = final_df["Location"].fillna("").str.replace("\n", " · ", regex=False).str.split("·")
final_df["Location"] = parts.str[0].str.strip()
final_df["Applicants"] = parts.apply(_first_applicants)
# regex=False: the prefix is a literal, and the default for `regex` differs between pandas versions.
final_df['Description'] = final_df['Description'].str.replace('About the job\n', '', regex=False).str.strip()
# Strip leading "follow"/"followers" lines and an "... employees ... on LinkedIn" line from the blurb.
final_df['AboutTheCompany'] = final_df['AboutTheCompany'].str.replace(r'^\s*(?:.*\b(?:follow|followers)\b.*\n)*\s*(?:.*\bemployees\b.*\bon\s+LinkedIn\b.*\n)?', '', regex=True).str.strip()

final_df["Posted_text"] = parts.apply(_first_posted)

# LinkedIn shows posting times as relative text such as "3 days ago" or "2 months ago", so they must be resolved against today's date to obtain a specific date.
def to_today_minus(s: str) -> str:
    """Turn relative posting text into a "today - N days" style label.

    Returns "today" for "just now", "today - <n>day" / "today - <n>days"
    (no space before the unit) when the text contains a number and the word
    "day", and "" for everything else, including empty or None input.
    """
    text = (s or "").lower().strip()
    if "just now" in text:
        return "today"
    number = re.search(r"(\d+)", text)
    if number is None or "day" not in text:
        return ""
    count = int(number.group(1))
    unit = "day" if count == 1 else "days"
    return f"today - {count}{unit}"

# Helper column holding the "today - N days" label; it is dropped again by the
# cell that computes Posted_date.
final_df["Posted_relative"] = final_df["Posted_text"].apply(to_today_minus)


# This function uses Chicago time (by default) to convert the relative posted text into a calendar date formatted as "yyyy/mm/dd".
def to_posted_date(s: str, now=None, tz: str = "America/Chicago"):
    """Resolve relative posting text ("3 days ago") to a "YYYY/MM/DD" string.

    Args:
        s: Relative text; it is passed through ``str()`` before parsing.
        now: Reference timestamp. Defaults to the current time in ``tz``.
        tz: Time zone used only when ``now`` is not supplied.

    Returns:
        The date as "YYYY/MM/DD", or ``pd.NaT`` when the text is empty or does
        not contain "<n> minute|hour|day|week|month|year(s) ago" / "just now".
    """
    date_format = "%Y/%m/%d"
    if now is None:
        now = pd.Timestamp.now(tz)  # timezone-aware current time

    text = (str(s) or "").lower().strip()
    if not text:
        return pd.NaT
    if "just now" in text:
        return now.normalize().strftime(date_format)

    found = re.search(r"(\d+)\s*(minute|hour|day|week|month|year)s?\s*ago", text)
    if not found:
        return pd.NaT

    amount = int(found.group(1))
    unit = found.group(2)

    # Fixed-length units are plain time deltas; months and years need calendar arithmetic.
    fixed_units = {"minute": "minutes", "hour": "hours", "day": "days", "week": "weeks"}
    calendar_units = {"month": "months", "year": "years"}
    if unit in fixed_units:
        span = pd.Timedelta(**{fixed_units[unit]: amount})
    elif unit in calendar_units:
        span = DateOffset(**{calendar_units[unit]: amount})
    else:
        return pd.NaT
    return (now - span).normalize().strftime(date_format)

In [15]:
# Resolve the relative posting text to calendar dates, then remove the helper columns.
# Guarded so the cell can be re-run: after the first run "Posted_text" is gone, and an
# unguarded re-run would fail with KeyError.
if "Posted_text" in final_df.columns:
    final_df["Posted_date"] = final_df["Posted_text"].apply(to_posted_date)
final_df.drop(columns=["Posted_text", "Posted_relative"], inplace=True, errors="ignore")

In [None]:
# Write the final table to "linkedin_jobs.csv" (relative path), omitting the index column.
final_df.to_csv("linkedin_jobs.csv", index=False)