# LinkedIn job search (guest endpoint)

This notebook uses LinkedIn’s public “guest” search endpoint to collect job listings. Automated access to LinkedIn may be restricted by their Terms of Service—use responsibly, keep request rates low, and prefer official APIs or approved data sources whenever possible.

In [1]:
# --- What to search for -------------------------------------------------
# One search is run per entry in job_titles.
job_titles = [
    "data scientist",
    "machine learning engineer",
]
# Skill and industry terms are appended to every search query.
skills = [
    "python",
    "git",
]
industry = [
    "energy",
    "oil & gas",
]

# --- How recent ---------------------------------------------------------
# Accepted values: "past 24 hours", "past week", "past month", "any time"
job_post_date = "past 24 hours"

# --- Optional filters and run settings ----------------------------------
# Change to your preferred location.
location = "United States"
# Cap on the number of listings collected per job title.
max_results_per_title = 50
# Polite delay (seconds) between requests.
pause_seconds = 0.6
# Set True to also fetch each job's description (slower).
include_description = False

In [2]:
# If imports fail with ModuleNotFoundError, install the dependencies first
# (in a notebook cell: %pip install requests beautifulsoup4 pandas)

from __future__ import annotations

import re
import time
from typing import Iterable, List, Dict
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Request headers sent with every HTTP call; the User-Agent is a desktop
# Chrome-on-macOS browser string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 "
        "(Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 "
        "(KHTML, like Gecko) "
        "Chrome/120.0 "
        "Safari/537.36"
    ),
}

# Maps the human-readable "date posted" label to the value sent as the
# f_TPR query parameter. An empty value means no date filter is sent.
# NOTE(review): the numeric part looks like a window length in seconds
# (86400 = 24 h, 604800 = 7 days, 2592000 = 30 days) - confirm.
TIME_FILTERS = {
    "any time": "",
    "past 24 hours": "r86400",
    "past week": "r604800",
    "past month": "r2592000",
}


def normalize(text: str) -> str:
    """Return *text* lower-cased with whitespace collapsed and trimmed.

    Every run of whitespace becomes a single space and leading/trailing
    whitespace is removed. Falsy input (``None`` or ``""``) yields ``""``.
    """
    raw = text if text else ""
    single_spaced = re.sub(r"\s+", " ", raw)
    trimmed = single_spaced.strip()
    return trimmed.lower()


def build_query(job_title: str, skills: Iterable[str], industries: Iterable[str]) -> str:
    """Combine a job title with skill and industry terms into one keyword string.

    Args:
        job_title: The job title to search for.
        skills: Skill terms to append. May be any iterable of strings
            (list, tuple, generator), a single string, or ``None``.
        industries: Industry terms to append; same accepted forms as
            ``skills``.

    Returns:
        The non-empty terms joined by single spaces, in the order
        title, skills, industries, with outer whitespace stripped.
    """

    def as_terms(value) -> list:
        # None / empty -> no terms. A bare string is one term, not a
        # sequence of characters.
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    # Unpacking (rather than list "+") keeps tuples and other iterables
    # from raising TypeError.
    parts = [job_title, *as_terms(skills), *as_terms(industries)]
    return " ".join(p for p in parts if p).strip()


def build_url(query: str, location: str, start: int, time_filter: str) -> str:
    """Build the guest job-search URL for one page of results.

    Args:
        query: Keyword string sent as the ``keywords`` parameter.
        location: Sent as the ``location`` parameter.
        start: Result offset sent as the ``start`` parameter.
        time_filter: A label from ``TIME_FILTERS`` (e.g. ``"past week"``).
            Matching ignores case and surrounding/repeated whitespace.
            ``None``, an empty string, or an unrecognized label means no
            date filter is added.

    Returns:
        The full request URL with URL-encoded query parameters.
    """
    params = {
        "keywords": query,
        "location": location,
        "start": start,
    }

    # Normalize the label so None or stray whitespace neither crashes
    # nor silently misses a valid TIME_FILTERS key.
    label = " ".join((time_filter or "").split()).lower()
    tpr = TIME_FILTERS.get(label, "")
    if tpr:
        params["f_TPR"] = tpr

    return "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)


def fetch_jobs_page(url: str) -> str:
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a
    20-second timeout. ``raise_for_status()`` is called on the response,
    so an HTTP error status raises instead of returning a body.
    """
    response = requests.get(url, timeout=20, headers=HEADERS)
    response.raise_for_status()
    page_html = response.text
    return page_html


def parse_jobs(html: str) -> List[Dict[str, str]]:
    """Extract job listings from one page of search-result HTML.

    Every ``<li>`` element is treated as a candidate job card. Cards with
    neither a title (``<h3>``) nor a company (``<h4>``) are skipped.

    Args:
        html: Raw HTML of a results page.

    Returns:
        One dict per card with the keys ``job_id``, ``title``,
        ``company``, ``location``, ``date_posted`` and ``link``. A field
        that cannot be found is an empty string.
    """
    soup = BeautifulSoup(html, "html.parser")
    results: List[Dict[str, str]] = []

    def text_of(node) -> str:
        # Stripped text of an element, or "" when the element is missing.
        return node.get_text(strip=True) if node else ""

    for card in soup.select("li"):
        title = text_of(card.select_one("h3"))
        company = text_of(card.select_one("h4"))

        # Not a job card (e.g. an unrelated list item): skip it.
        if not title and not company:
            continue

        # The URN attribute may sit on the <li> itself or on a wrapper
        # element nested inside it, so fall back to the first descendant
        # that carries it.
        urn = card.get("data-entity-urn", "")
        if not urn:
            urn_el = card.select_one("[data-entity-urn]")
            urn = urn_el.get("data-entity-urn", "") if urn_el else ""

        link_el = card.select_one("a")

        results.append(
            {
                # The id is the last colon-separated segment of the URN.
                "job_id": urn.split(":")[-1] if urn else "",
                "title": title,
                "company": company,
                "location": text_of(card.select_one(".job-search-card__location")),
                "date_posted": text_of(card.select_one("time")),
                "link": link_el.get("href", "") if link_el else "",
            }
        )

    return results


def fetch_description(job_url: str) -> str:
    """Fetch a job page and return its description text.

    Returns ``""`` without making a request when *job_url* is empty, and
    also when the page has no element matching the description selectors.
    An HTTP error status raises via ``raise_for_status()``.
    """
    if job_url:
        response = requests.get(job_url, timeout=20, headers=HEADERS)
        response.raise_for_status()

        page = BeautifulSoup(response.text, "html.parser")
        node = page.select_one(".description__text, .show-more-less-html__markup")
        if node:
            # Join text fragments with a space so adjacent blocks don't fuse.
            return node.get_text(" ", strip=True)

    return ""

ModuleNotFoundError: No module named 'pandas'

In [None]:
def search_linkedin_jobs(
    job_titles: List[str],
    skills: List[str],
    industries: List[str],
    job_post_date: str,
    location: str,
    max_results_per_title: int = 50,
    pause_seconds: float = 0.6,
    include_description: bool = False,
) -> pd.DataFrame:
    """Search for each job title and return the matching listings.

    For every title, result pages are fetched until a page yields no
    listings or ``max_results_per_title`` listings have been collected.
    The combined rows are then kept only if their text contains at least
    one skill term AND at least one industry term (plain substring match
    on normalized text; an empty term list imposes no restriction).

    Args:
        job_titles: Titles to search for, one search per title.
        skills: Skill terms added to the query and used for filtering.
            ``None`` is treated as empty.
        industries: Industry terms added to the query and used for
            filtering. ``None`` is treated as empty.
        job_post_date: Date-posted label passed to ``build_url``.
        location: Location passed to ``build_url``.
        max_results_per_title: Maximum listings kept per job title.
        pause_seconds: Delay between consecutive HTTP requests.
        include_description: When True, fetch each job's description and
            include it in the text that is filtered.

    Returns:
        A DataFrame of de-duplicated listings; empty if nothing was found.

    Note:
        Without descriptions the keyword filter only sees title, company
        and location, so it can be considerably stricter.
    """
    all_results: List[Dict[str, str]] = []

    for job_title in job_titles or []:
        query = build_query(job_title, skills, industries)
        collected = 0
        start = 0

        while collected < max_results_per_title:
            url = build_url(query=query, location=location, start=start, time_filter=job_post_date)
            batch = parse_jobs(fetch_jobs_page(url))

            if not batch:
                break

            # Advance the offset by the full page size, then trim the page
            # so the per-title cap is never exceeded (and no descriptions
            # are fetched for rows that would be discarded).
            start += len(batch)
            batch = batch[: max_results_per_title - collected]

            if include_description:
                for row in batch:
                    # Pause before every description request too, not just
                    # between result pages.
                    time.sleep(pause_seconds)
                    row["description"] = fetch_description(row.get("link", ""))

            all_results.extend(batch)
            collected += len(batch)

            time.sleep(pause_seconds)

    df = pd.DataFrame(all_results)

    if df.empty:
        return df

    def match_any(haystack: str, needles: Iterable[str]) -> bool:
        # No usable terms means "no restriction".
        terms = [n for n in (needles or []) if n]
        if not terms:
            return True
        return any(term in haystack for term in terms)

    # "description" exists only when include_description was requested, so
    # build the searchable text from the columns that are actually present.
    text_cols = [
        col
        for col in ("title", "company", "location", "description")
        if col in df.columns
    ]
    haystack = (
        df[text_cols]
        .fillna("")
        .astype(str)
        .agg(" ".join, axis=1)
        .map(normalize)
    )

    skill_terms = [normalize(s) for s in (skills or [])]
    industry_terms = [normalize(i) for i in (industries or [])]

    has_skill = haystack.map(lambda text: match_any(text, skill_terms))
    has_industry = haystack.map(lambda text: match_any(text, industry_terms))
    df = df[has_skill & has_industry]

    return df.drop_duplicates(subset=["job_id", "link", "title", "company"])


# Run the search with the parameters defined in the first cell. The five
# required arguments are passed positionally, in signature order.
results = search_linkedin_jobs(
    job_titles,
    skills,
    industry,
    job_post_date,
    location,
    max_results_per_title=max_results_per_title,
    pause_seconds=pause_seconds,
    include_description=include_description,
)

# Preview the first ten rows.
results.head(10)

In [None]:
# Write the results to a CSV file (relative path, so it lands in the
# current working directory); the DataFrame index is not written.
output_path = "linkedin_jobs.csv"
results.to_csv(path_or_buf=output_path, index=False)

print(f"Saved {len(results)} rows to {output_path}")