In [85]:
import random
import re
import time
import urllib.parse
from getpass import getpass

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [86]:
# NOTE(review): the account e-mail is hard-coded here; consider reading it from
# an environment variable or prompting for it before sharing this notebook.
USERNAME = "zh272@georgetown.edu"
# The password is prompted for interactively so it never appears in the notebook.
PASSWORD = getpass("🔐 Enter LinkedIn Password: ")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Go to LinkedIn login page
driver.get("https://www.linkedin.com/login")

# Wait for the login form to be present instead of sleeping a fixed 2 seconds:
# a slow page load would otherwise make the lookup below fail.
username_box = WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.ID, "username"))
)

# Enter email and password (RETURN submits the form)
username_box.send_keys(USERNAME)
driver.find_element(By.ID, "password").send_keys(PASSWORD + Keys.RETURN)

# Verify login: poll the URL until it looks like a post-login page rather than
# checking once after a fixed 5-second sleep.
try:
    WebDriverWait(driver, 20).until(
        lambda d: "feed" in d.current_url or "jobs" in d.current_url
    )
    print("✅ Login successful!")
except TimeoutException:
    print("❌ Login may have failed.")

✅ Login successful!


In [87]:
import itertools

# --- FILTER OPTIONS ---
# Each table maps a human-readable label to the value that the URL-building
# cell places in the corresponding search query parameter.

# Values used for the "f_E" parameter.
experience_levels = {
    "Internship": "1",
    "Entry level": "2",
    "Associate": "3",
    "Mid-Senior": "4",
    "Director": "5",
    "Executive": "6",
}

# Values used for the "f_WT" parameter.
remote_types = {
    "On-site": "1",
    "Hybrid": "2",
    "Remote": "3",
}

# Values used for the "f_TPR" parameter; "Any time" carries no value (None).
date_posted = {
    "Any time": None,
    "Past week": "r604800",
    "Past month": "r2592000",
}

# Values used for the "f_SB2" parameter.
salary_ranges = {
    "$40K+": "1",
    "$60K+": "2",
    "$80K+": "3",
    "$100K+": "4",
    "$120K+": "5",
}

# --- BASE URL ---
base_url = "https://www.linkedin.com/jobs/search/?"

# --- FIXED PARAMETERS ---
# Query parameters shared by every generated search URL.
fixed_params = {
    "keywords": "data center",
    "location": "United States",
    "sortBy": "R",
}

# Cartesian product of all filter tables. Each entry is a 4-tuple of
# (label, value) pairs in the order: experience, remote type, date, salary.
combinations = list(
    itertools.product(
        *(
            table.items()
            for table in (experience_levels, remote_types, date_posted, salary_ranges)
        )
    )
)

print(f"Generating {len(combinations)} combinations...")

Generating 270 combinations...


In [88]:
import urllib.parse

# One search URL per filter combination, in the same order as `combinations`.
generated_urls = []

for exp, remote, date, salary in combinations:
    # Each item is a (label, value) pair; only the values go into the URL.
    exp_label, exp_val = exp
    remote_label, remote_val = remote
    date_label, date_val = date
    salary_label, salary_val = salary
    query_params = fixed_params.copy()
    query_params.update({
        "f_E": exp_val,
        "f_WT": remote_val,
        "f_TPR": date_val,
        "f_SB2": salary_val
    })
    # Drop filters that have no value (e.g. "Any time" -> None). urlencode()
    # stringifies values, so a None would be sent as the literal "f_TPR=None".
    query_params = {key: val for key, val in query_params.items() if val is not None}
    final_url = base_url + urllib.parse.urlencode(query_params)
    generated_urls.append(final_url)


print(f"Successfully generated {len(generated_urls)} URLs.")

Successfully generated 270 URLs.


In [89]:
# CSS selectors for the search-results container. wait_results_loaded() waits
# for these to decide whether a results page has loaded.
CONTAINER_SELECTORS = [
    "ul.scaffold-layout__list-container",
    "div.jobs-search-results-list__container",
    "div.jobs-search-two-pane__results",
    "section.two-pane-serp-page__results-list",
]
# CSS selectors for individual job cards. scrape_search_url_to_rows() tries them
# in order and uses the first one that matches anything.
ITEM_SELECTORS = [
    "li.scaffold-layout__list-item",
    "li[data-occludable-job-id]",
    "ul.jobs-search-results__list > li",
    "div.jobs-search-results-list__list-item",
]

def _sleep(a=0.35, b=0.85):
    """Pause for a random duration drawn uniformly from the range [a, b] seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)

def _first_el(root, selectors, by_css=True, timeout=0):
    """Return the first element found for any of ``selectors``, or None.

    Selectors are tried in the order given and the first one that yields an
    element wins.

    Args:
        root: Object to search from (a driver or an element).
        selectors: Iterable of selector strings.
        by_css: Treat selectors as CSS when true, as XPath otherwise.
        timeout: When truthy, wait up to this many seconds per selector for an
            element to be present; only TimeoutException is suppressed. When
            falsy, look up immediately and ignore any exception raised by an
            individual lookup.
    """
    strategy = By.CSS_SELECTOR if by_css else By.XPATH

    if not timeout:
        # Immediate lookup: a failing selector is skipped, not fatal.
        for selector in selectors:
            try:
                matches = root.find_elements(strategy, selector)
            except Exception:
                continue
            if matches:
                return matches[0]
        return None

    # Waiting lookup: each selector gets its own wait of up to `timeout` seconds.
    for selector in selectors:
        try:
            return WebDriverWait(root, timeout).until(
                EC.presence_of_element_located((strategy, selector))
            )
        except TimeoutException:
            pass
    return None

def _first_text(root, selectors):
    """Return the stripped text of the first element matching ``selectors``.

    Selectors are CSS and are looked up immediately (no waiting) via
    ``_first_el``. Returns "" when nothing matches or the text cannot be read.
    """
    el = _first_el(root, selectors, by_css=True, timeout=0)
    if el is None:
        return ""
    try:
        return (el.text or "").strip()
    except Exception:
        # Best effort: reading .text can fail; a bare `except:` is avoided so
        # that KeyboardInterrupt / SystemExit still propagate.
        return ""

def _first_attr(root, selectors, attr):
    """Return attribute ``attr`` of the first element matching ``selectors``.

    Selectors are CSS and are looked up immediately (no waiting) via
    ``_first_el``. Returns "" when nothing matches, the attribute is missing or
    empty, or it cannot be read.
    """
    el = _first_el(root, selectors, by_css=True, timeout=0)
    if el is None:
        return ""
    try:
        return el.get_attribute(attr) or ""
    except Exception:
        # Best effort: the read can fail; a bare `except:` is avoided so that
        # KeyboardInterrupt / SystemExit still propagate.
        return ""

def wait_results_loaded(driver, timeout=12):
    """Wait until any known search-results container is present on the page.

    All entries of ``CONTAINER_SELECTORS`` are combined into one CSS selector
    group, so a single wait covers every layout variant and ``timeout`` bounds
    the total wait. Waiting on each selector in turn would let a page with no
    results block for ``len(CONTAINER_SELECTORS) * timeout`` seconds.

    Args:
        driver: Selenium driver showing the search page.
        timeout: Maximum number of seconds to wait in total.

    Returns:
        True if a container appeared within ``timeout`` seconds, else False.
    """
    any_container = ", ".join(CONTAINER_SELECTORS)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, any_container))
        )
    except TimeoutException:
        return False
    return True

def expand_all_in(driver, root=None, max_clicks=8):
    """Click "See more"-style buttons inside ``root`` (or the whole page).

    Repeatedly looks up the first matching button and clicks it, up to
    ``max_clicks`` times, stopping early when no button is found. If a normal
    click is intercepted or the element is stale, a JavaScript click is tried
    instead; if that also fails the loop stops.

    Relies on ``ElementClickInterceptedException`` being imported at the top of
    the notebook. Without that import, any click failure surfaces as a
    NameError when the ``except`` clause is evaluated.

    Args:
        driver: Selenium driver, used for the JavaScript click fallback.
        root: Optional element to search within; defaults to ``driver``.
        max_clicks: Upper bound on the number of clicks performed.
    """
    # Explicit None check so a supplied root is never replaced by mistake.
    scope = driver if root is None else root
    # Both the old & new “See more” buttons
    sels = [
        "button.show-more-less-html__button--more",
        "button.show-more-less-html__button",
        "button[aria-expanded='false']",
    ]
    for _click in range(max_clicks):
        btn = _first_el(scope, sels, by_css=True, timeout=0)
        if not btn:
            break
        try:
            btn.click()
        except (ElementClickInterceptedException, StaleElementReferenceException):
            # Native click failed; fall back to clicking from JavaScript.
            try:
                driver.execute_script("arguments[0].click();", btn)
            except Exception:
                break
        # Short pause between clicks.
        _sleep(0.2, 0.4)

def get_title_company(driver):
    """Read the job title and company name from the current job page.

    The title is waited for (up to 8 seconds); a timeout leaves it empty. The
    company is looked up immediately with ``_first_el``; if no element matches
    or its text cannot be read, the company is left empty.

    Returns:
        A ``(title, company)`` tuple of stripped strings, each "" when missing.
    """
    title = ""
    company = ""
    try:
        title = WebDriverWait(driver, 8).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.top-card-layout__title, h1.topcard__title"))
        ).text.strip()
    except TimeoutException:
        pass
    comp_el = _first_el(driver, ["a.topcard__org-name-link", "span.topcard__flavor"], timeout=0)
    if comp_el:
        try:
            company = comp_el.text.strip()
        except Exception:
            # Best effort: keep the empty company. A bare `except:` is avoided
            # so that KeyboardInterrupt / SystemExit still propagate.
            pass
    return title, company

def get_description_root(driver):
    """Return the first element matching a known description selector, or None.

    Candidate CSS selectors are tried in order; the first one with at least one
    match supplies the returned element.
    """
    candidates = (
        "#job-details",
        ".jobs-description__content",
        ".jobs-box__html-content",
        ".show-more-less-html__markup",
    )
    for css in candidates:
        elements = driver.find_elements(By.CSS_SELECTOR, css)
        if not elements:
            continue
        return elements[0]
    return None

def get_salary(driver):
    """Heuristically extract a salary / pay snippet from the current job page.

    Scans every descendant of a few top-card areas, then falls back to a
    page-wide XPath query. A text "mentions pay" when it contains "$", "/hr",
    "per year", "per hour" or (case-insensitively) "salary".

    Descendants come back with wrappers before the elements they contain, and a
    wrapper's text includes its children's text. Returning the first match
    therefore yields the whole card rather than the pay line. This picks the
    most specific match instead: texts containing a digit are preferred, then
    the shortest text wins.

    Returns:
        The selected text, stripped, or "" when nothing mentions pay.
    """
    def _mentions_pay(text):
        return (
            "$" in text
            or "/hr" in text
            or "per year" in text
            or "per hour" in text
            or "salary" in text.lower()
        )

    def _most_specific(elements):
        # Collect every pay-mentioning text, skipping unreadable elements.
        candidates = []
        for el in elements:
            try:
                text = (el.text or "").strip()
            except Exception:
                continue
            if text and _mentions_pay(text):
                candidates.append(text)
        if not candidates:
            return ""
        # Prefer texts with a digit (an amount rather than a bare "Salary"
        # label), then the shortest one (the innermost element).
        return min(candidates, key=lambda t: (not any(ch.isdigit() for ch in t), len(t)))

    # Heuristic scan near the top card area first, then global
    areas = [
        ".jobs-unified-top-card", ".job-view-layout", "section.jobs-unified-top-card__primary-description"
    ]
    for area in areas:
        found = _most_specific(driver.find_elements(By.CSS_SELECTOR, area + " *"))
        if found:
            return found
    # fallback
    try:
        return _most_specific(driver.find_elements(By.XPATH, "//*[self::span or self::div][contains(text(),'$') or contains(.,'/hr') or contains(.,'per year') or contains(.,'per hour') or contains(translate(., 'SALARY', 'salary'),'salary')]"))
    except Exception:
        return ""

# Split to requested sections
# Each entry is (output column, header phrases). Phrases are tried in order and
# the first one found in the description marks where that column's section starts.
SECTION_HEADERS = [
    ("Responsibilities", ["responsibilities","duties","what you will do","what you’ll do"]),
    ("QualificationsRequired", ["required qualifications","basic qualifications","minimum qualifications","requirements"]),
    ("QualificationsPreferred", ["preferred qualifications","nice to have","preferred"]),
    ("Requirements", ["requirements"]),  # catch-all
]
def split_sections(desc):
    """Split a job description into sections keyed by output column.

    For each column in ``SECTION_HEADERS`` the first header phrase found in the
    text (case-insensitively) marks the section start. A section runs up to the
    start of the next detected section, or to the end of the text. The header
    phrase and an optional colon are stripped from the front of each section.

    Header positions are located in the original text, not in a lower-cased
    copy: ``str.lower()`` can change the length of some strings (for example
    "İ"), which would shift every offset after such a character.

    Args:
        desc: Description text; None or "" yields all-empty sections.

    Returns:
        Dict with keys "Description" (the full text), "Responsibilities",
        "QualificationsRequired", "QualificationsPreferred" and "Requirements".
        Sections that are not detected are "".
    """
    txt = desc or ""
    out = {"Description": txt, "Responsibilities":"", "QualificationsRequired":"", "QualificationsPreferred":"", "Requirements":""}
    hits = []
    for col, keys in SECTION_HEADERS:
        for k in keys:
            m = re.search(re.escape(k), txt, flags=re.IGNORECASE)
            if m:
                hits.append((m.start(), col, k))
                break
    if not hits:
        return out
    # Stable sort: columns that share a start offset keep SECTION_HEADERS order,
    # so the earlier one gets an empty slice and the later one gets the text.
    hits.sort(key=lambda hit: hit[0])
    for idx, (start, col, key) in enumerate(hits):
        end = hits[idx+1][0] if idx+1 < len(hits) else len(txt)
        seg = txt[start:end].strip()
        # Drop the header phrase itself (plus an optional colon) from the front.
        seg = re.sub(rf"(?is)^{re.escape(key)}\s*:?\s*", "", seg, count=1)
        out[col] = seg.strip()
    return out

In [90]:
def _return_to_list_tab(driver, list_handle):
    """Close the focused tab if it is not the results tab, then refocus the results tab (best effort)."""
    try:
        if driver.current_window_handle != list_handle:
            driver.close()
    except Exception:
        pass  # the tab may already be gone; still try to refocus below
    try:
        driver.switch_to.window(list_handle)
    except Exception:
        pass


def scrape_search_url_to_rows(driver, url, max_pages=40, per_page_limit=None):
    """Scrape job postings reachable from one search URL.

    Pages through the results by advancing the ``start`` query parameter in
    steps of 25. Each job card with a link is opened in a new tab, its fields
    are read, and the tab is closed again.

    Paging stops when the results container does not load, when no job cards
    are found, when a page has fewer than 10 cards, or after ``max_pages``.

    A job that fails is skipped. Failures are counted and reported in one
    summary line per results page rather than silently discarded.

    Args:
        driver: Selenium driver to navigate with.
        url: Search URL to paginate.
        max_pages: Maximum number of results pages to visit.
        per_page_limit: Optional cap on jobs scraped per page.

    Returns:
        List of dicts with keys JobTitle, Company, JobURL, SearchURL,
        Description, Responsibilities, QualificationsRequired,
        QualificationsPreferred, Requirements and Salary.
    """
    rows = []

    # The search URL is parsed once; only the "start" parameter changes per page.
    u = urllib.parse.urlparse(url)
    q = dict(urllib.parse.parse_qsl(u.query, keep_blank_values=True))

    for start in range(0, max_pages*25, 25):
        # build paginated URL
        q["start"] = str(start)
        page_url = urllib.parse.urlunparse((u.scheme, u.netloc, u.path, u.params,
                                            urllib.parse.urlencode(q, doseq=True), u.fragment))
        driver.get(page_url)

        if not wait_results_loaded(driver, timeout=12):
            break

        # find job cards
        cards = []
        for sel in ITEM_SELECTORS:
            cards = driver.find_elements(By.CSS_SELECTOR, sel)
            if cards:
                break
        if not cards:
            break

        list_handle = driver.current_window_handle
        taken = 0
        failed = 0
        last_error = None

        for job in cards:
            scraped = False
            try:
                link_el = _first_el(job, [
                    "a.base-card__full-link",
                    "a.job-card-container__link",
                    "a[href*='/jobs/view/']"
                ], timeout=0)
                if not link_el:
                    continue

                link = link_el.get_attribute("href") or ""
                if not link:
                    continue

                # 🟢 open full job in a new tab
                driver.execute_script("window.open(arguments[0], '_blank');", link)
                WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
                driver.switch_to.window(driver.window_handles[-1])

                WebDriverWait(driver, 12).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".jobs-unified-top-card, h1.top-card-layout__title"))
                )

                expand_all_in(driver, None, max_clicks=6)

                # Strip the query string from the job URL.
                joburl = (driver.current_url or "").split("?")[0]
                title, company = get_title_company(driver)
                desc_root = get_description_root(driver)
                desc = desc_root.text.strip() if desc_root else ""
                salary = get_salary(driver)

                secs = split_sections(desc)

                rows.append({
                    "JobTitle": title,
                    "Company": company,
                    "JobURL": joburl,
                    "SearchURL": url,
                    "Description": secs.get("Description",""),
                    "Responsibilities": secs.get("Responsibilities",""),
                    "QualificationsRequired": secs.get("QualificationsRequired",""),
                    "QualificationsPreferred": secs.get("QualificationsPreferred",""),
                    "Requirements": secs.get("Requirements",""),
                    "Salary": salary,
                })
                scraped = True

            except Exception as exc:
                # Skip this job but remember the failure for the page summary.
                failed += 1
                last_error = exc
            finally:
                # close tab, return to results (on success and on failure alike)
                _return_to_list_tab(driver, list_handle)

            if not scraped:
                continue

            _sleep()

            taken += 1
            if per_page_limit and taken >= per_page_limit:
                break

        if failed:
            print(f"⚠️ {failed} of {len(cards)} job cards failed at start={start} "
                  f"(last error: {type(last_error).__name__})")

        # A page with fewer than 10 cards ends pagination (presumably the last page).
        if len(cards) < 10:
            break
        _sleep(0.9, 1.6)
    return rows


In [91]:
def scrape_and_collect(driver, urls, max_pages=40, per_page_limit=None):
    """Scrape every search URL in ``urls`` and gather the rows in one DataFrame.

    Each URL is handed to ``scrape_search_url_to_rows`` with the given
    ``max_pages`` and ``per_page_limit``, followed by a short random pause.
    When the result has a "JobURL" column, duplicate job URLs are dropped,
    keeping the first occurrence. A one-line summary is printed.

    Nothing is written to disk; export the returned frame separately.

    Returns:
        pandas.DataFrame of the collected rows.
    """
    collected = []
    for search_url in urls:
        collected += scrape_search_url_to_rows(
            driver,
            search_url,
            max_pages=max_pages,
            per_page_limit=per_page_limit,
        )
        _sleep(1.0, 2.0)

    frame = pd.DataFrame(collected)
    has_job_url = "JobURL" in frame.columns
    if has_job_url:
        frame = frame.drop_duplicates(subset=["JobURL"], keep="first")

    print(f"Collected {len(frame)} rows from {len(urls)} search URLs")
    return frame

In [92]:
# Scrape only the first generated search URL, limited to one results page.
df = scrape_and_collect(driver, [generated_urls[0]], max_pages=1)

Collected 0 rows from 1 search URLs


In [93]:
# Display the collected DataFrame as the cell output.
df