In [None]:
import urllib.parse
import requests
from bs4 import BeautifulSoup
import re
import time

In [None]:
def build_linkedin_search_url(job_title, location, start=0):
    """
    Build the guest job-search URL for a title/location query.

    Args:
        job_title: Search keywords, e.g. "Software Engineer".
        location: Location text, e.g. "San Francisco, CA" or "remote".
        start: Pagination offset sent as the ``start`` query parameter.

    Returns:
        The search endpoint followed by the URL-encoded query string.
    """
    endpoint = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"

    # Pairs are encoded in this order: keywords, location, start.
    query_pairs = [
        ("keywords", job_title),
        ("location", location),
        ("start", start),
    ]

    return endpoint + "?" + urllib.parse.urlencode(query_pairs)


def test_linkedin_connection(url):
    """
    Issue one GET request against ``url`` and report what came back.

    Args:
        url: The URL to request.

    Returns:
        ``(status_code, number_of_characters_in_body)`` on a completed
        request, or ``(None, 0)`` when requests raises a RequestException.
    """
    browser_like_headers = dict([
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ("Referer", "https://www.linkedin.com/"),
    ])

    try:
        reply = requests.get(url, headers=browser_like_headers, timeout=10)
        body_length = len(reply.text)
        return reply.status_code, body_length
    except requests.exceptions.RequestException as e:
        # Connection problems, timeouts, malformed URLs, etc.
        print(f"Error connecting to LinkedIn: {e}")
        return None, 0


In [None]:
# Smoke test: build a sample search URL and check that it can be fetched.
url = build_linkedin_search_url("Data Scientist", "remote")
print(f"Generated URL: {url}\n")

status, length = test_linkedin_connection(url)

if not status:
    # test_linkedin_connection returns (None, 0) when the request raised
    print("✗ Connection failed")
else:
    print(f"Status Code: {status}")
    print(f"Response Length: {length} characters")

    # Pass only on HTTP 200 with a body longer than 1000 characters
    if status != 200 or length <= 1000:
        print(f"\n✗ Connection test failed - Status: {status}, Length: {length}")
    else:
        print("\n✓ Connection test passed!")


In [None]:
def parse_job_cards(html_content):
    """
    Extract job listings from LinkedIn job-search HTML.

    Every ``<li>`` element is inspected; an item is kept only when both a
    title and a link were found in it.

    Args:
        html_content: HTML string from a LinkedIn job search response.

    Returns:
        List of dicts with the keys ``job_id``, ``job_title``, ``company``,
        ``location``, ``posted_ago`` and ``job_url``. An empty list is
        returned when the document as a whole cannot be processed.
    """
    def _text_or_na(node):
        # Missing elements are reported with the "N/A" placeholder
        return node.get_text(strip=True) if node else "N/A"

    collected = []

    try:
        document = BeautifulSoup(html_content, 'html.parser')

        for item in document.find_all('li'):
            try:
                title_text = _text_or_na(item.find(class_='base-search-card__title'))
                company_text = _text_or_na(item.find(class_='base-search-card__subtitle'))
                location_text = _text_or_na(item.find(class_='job-search-card__location'))
                posted_text = _text_or_na(item.find(class_='job-search-card__listdate'))

                anchor = item.find('a', class_='base-card__full-link')
                href = anchor.get('href', 'N/A') if anchor else "N/A"

                # The numeric ID is the digit run following /jobs/view/ in the link
                numeric_id = None
                if href != "N/A":
                    id_match = re.search(r'/jobs/view/(\d+)', href)
                    if id_match:
                        numeric_id = id_match.group(1)

                # Items lacking a title or a link are not treated as job cards
                if title_text == "N/A" or href == "N/A":
                    continue

                collected.append({
                    'job_id': numeric_id,
                    'job_title': title_text,
                    'company': company_text,
                    'location': location_text,
                    'posted_ago': posted_text,
                    'job_url': href
                })

            except Exception as e:
                # One broken card must not abort the rest of the page
                print(f"Warning: Failed to parse a job card: {e}")
                continue

    except Exception as e:
        print(f"Error parsing HTML content: {e}")
        return []

    return collected

In [None]:
def fetch_jobs(title: str, preview_count: int = 3):
    """
    Fetch the first page of remote postings for ``title`` and print a report.

    Requests the URL produced by build_linkedin_search_url(), parses the body
    with parse_job_cards(), prints a preview of the first ``preview_count``
    jobs, then checks that every parsed job has both a URL and a job ID and
    reports the outcome.

    Args:
        title: Job title to search for (the location is fixed to "remote").
        preview_count: Number of parsed jobs to print in the preview.

    Returns:
        None. Everything is reported through print().
    """
    test_url = build_linkedin_search_url(title, "remote")
    print(f"Fetching jobs from: {test_url}\n")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.linkedin.com/"
    }

    try:
        response = requests.get(test_url, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"✗ Failed to fetch data (Status: {response.status_code})")
            return

        print(f"✓ Successfully fetched data (Status: {response.status_code})\n")

        jobs = parse_job_cards(response.text)
        shown = max(preview_count, 0)

        print(f"Found {len(jobs)} jobs\n")
        print("=" * 80)
        print(f"First {shown} jobs:")
        print("=" * 80)

        # Preview only the requested number of jobs, as the header announces
        for i, job in enumerate(jobs[:shown], 1):
            job_url = job['job_url']
            # Long URLs are cut at 80 characters and marked with an ellipsis
            url_text = job_url if len(job_url) <= 80 else job_url[:80] + "..."

            print(f"\n{i}. {job['job_title']}")
            print(f"   Company: {job['company']}")
            print(f"   Location: {job['location']}")
            print(f"   Posted: {job['posted_ago']}")
            print(f"   Job ID: {job['job_id']}")
            print(f"   URL: {url_text}")

        # Validate every job (not just the previewed ones) and collect the
        # failures instead of discarding them.
        problems = []
        for i, job in enumerate(jobs, 1):
            if job.get('job_url') is None or job.get('job_url') == "N/A":
                problems.append(f"Job {i} missing URL")
            if job.get('job_id') is None:
                problems.append(f"Job {i} missing ID")

        print("\n" + "=" * 80)
        if problems:
            print(f"✗ {len(problems)} validation problem(s) found:")
            for problem in problems:
                print(f"   - {problem}")
        elif jobs:
            print("✓ All tests passed! All jobs have valid URLs and IDs.")
        else:
            print("✗ No jobs were parsed, so there was nothing to validate.")

    except Exception as e:
        # Best-effort demo helper: report the failure rather than raising
        print(f"✗ Error during test: {e}")

In [None]:
# Run the fetch-and-report helper for a sample title (sends a live HTTP request).
fetch_jobs("Machine Learning Engineer")

In [None]:
class JobStorage:
    """
    List-backed, in-memory container for job dictionaries.

    Each stored job carries a ``description`` entry: ``None`` means no
    description has been recorded yet, and the string "ERROR" is counted
    separately by get_stats().
    """

    def __init__(self):
        self.jobs = []

    def add_jobs(self, job_list):
        """
        Append every job in ``job_list`` to the store.

        The dictionaries are stored as-is (not copied); a ``description``
        key set to ``None`` is added to any job that lacks one.

        Args:
            job_list: Iterable of job dictionaries.
        """
        for entry in job_list:
            entry.setdefault('description', None)
            self.jobs.append(entry)

    def get_all_jobs(self):
        """
        Return the internal list of stored jobs.

        Returns:
            The list object held by this store.
        """
        return self.jobs

    def get_jobs_without_description(self):
        """
        Collect the jobs whose description is still ``None``.

        Returns:
            New list containing the matching job dictionaries.
        """
        pending = []
        for entry in self.jobs:
            if entry.get('description') is None:
                pending.append(entry)
        return pending

    def update_job_description(self, job_id, description):
        """
        Set the description of the first stored job whose ID equals ``job_id``.

        Args:
            job_id: Job ID to look for.
            description: Description text to store.

        Returns:
            True when a job was updated, False when no job has that ID.
        """
        target = next(
            (entry for entry in self.jobs if entry.get('job_id') == job_id),
            None,
        )
        if target is None:
            return False
        target['description'] = description
        return True

    def get_stats(self):
        """
        Summarize how many stored jobs are described, pending, or failed.

        Returns:
            Dictionary with the keys ``total_jobs``,
            ``jobs_with_descriptions``, ``jobs_without_descriptions`` and
            ``jobs_with_errors``.
        """
        described = 0
        pending = 0
        failed = 0

        for entry in self.jobs:
            text = entry.get('description')
            if text is None:
                pending += 1
            elif text == "ERROR":
                failed += 1
            elif text:
                described += 1

        return {
            'total_jobs': len(self.jobs),
            'jobs_with_descriptions': described,
            'jobs_without_descriptions': pending,
            'jobs_with_errors': failed
        }

In [None]:
# Display the current value of the notebook-level `url` variable.
url

In [None]:
# Scratch cell: request the page that the notebook-level `url` points at and
# try a series of selectors to locate the job-description element.
# NOTE(review): `url` is set in other cells; confirm it holds a job-posting URL
# (rather than the search URL) before running this.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "https://www.linkedin.com/jobs/"
}

try:
    session = requests.Session()
    response = session.get(url, headers=headers, timeout=10)
    print(f"Status: {response.status_code}")

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # The `or` chain keeps the first lookup that finds an element. The last
        # two lookups match on a class-name substring and a case-insensitive regex.
        desc_elem = (
            soup.find('div', class_='show-more-less-html__markup') or
            soup.find('div', class_='description__text') or
            soup.find('section', class_='description') or
            soup.find('div', class_='description') or
            soup.find('article', class_='jobs-description__container') or
            soup.find('div', class_=lambda x: x and 'jobs-description__content' in x) or
            soup.find('div', class_=re.compile(r'job.?description', re.I))
        )

        if desc_elem:
            # Print the raw element followed by its whitespace-stripped text
            print(desc_elem, desc_elem.get_text(strip=True))
            # return desc_elem.get_text(strip=True)

    # return None

except Exception as e:
    # Any failure (network or parsing) is only reported, never raised
    print(f"Error: {e}")
    # return None
    # return None

In [None]:
def fetch_job_description(job_url):
    """
    Fetch the description text of a LinkedIn job posting.

    Args:
        job_url: URL of the job posting.

    Returns:
        The whitespace-stripped description text, or None when the request
        fails, the status is not 200, no description element is found, or
        the element contains no text.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.linkedin.com/"
    }

    try:
        response = requests.get(job_url, headers=headers, timeout=10)

        if response.status_code != 200:
            # Callers test the result for truthiness and call len() on it, so
            # the response object must never be returned in place of text.
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try several known containers; the first lookup that matches wins
        desc_elem = (
            soup.find('div', class_='show-more-less-html__markup') or
            soup.find('div', class_='description__text') or
            soup.find('section', class_='description') or
            soup.find('div', class_='description') or
            soup.find('article', class_='jobs-description__container jobs-description__container--condensed') or
            soup.find('div', class_='jobs-description__content jobs-description-content jobs-description__content--condensed') or
            soup.find('div', class_=re.compile(r'^job_description'))
        )

        if not desc_elem:
            return None

        description = desc_elem.get_text(strip=True)
        # An element with no text is treated the same as a missing one
        return description or None

    except Exception as e:
        # Best effort: any network or parsing failure is reported as None
        print(f"Error fetching description: {e}")
        return None

In [None]:
# Point the notebook-level `url` at a single job-posting page (rebinding the earlier search URL).
url = "https://www.linkedin.com/jobs/view/data-scientist-data-analytics-%E2%80%93-customer-loyalty-marketing-at-circle-k-4313197449"

In [None]:
# Try the description fetcher on one hard-coded posting URL (the query string carries tracking parameters).
desc = fetch_job_description('https://www.linkedin.com/jobs/view/data-scientist-data-analytics-%E2%80%93-customer-loyalty-marketing-at-circle-k-4316137579/?position=3&pageNum=0&refId=r8vm%2BrG%2B0ZTbNredBK8TzQ%3D%3D&trackingId=brKmbAor271tMuHOwExs2Q%3D%3D')

In [None]:
# Inspect what kind of object the fetch above produced.
type(desc)

In [None]:
def _store_description(storage, job, value):
    """Record ``value`` as the description of ``job`` without touching other jobs."""
    job_id = job.get('job_id')

    # A job without an ID cannot be addressed through the storage lookup:
    # searching for None would match whichever ID-less job comes first.
    if job_id is not None:
        storage.update_job_description(job_id, value)

    # If the lookup was skipped, missed, or hit a different job that shares the
    # ID, write to the job dictionary we were handed so it is not left pending.
    if job.get('description') is None:
        job['description'] = value


def fetch_all_descriptions(storage, delay=3):
    """
    Fetch descriptions for all jobs without descriptions.

    Each pending job is fetched with fetch_job_description(). A non-empty
    string result is stored as the description; any other result, or any
    exception, marks the job with the string "ERROR".

    Args:
        storage: JobStorage instance
        delay: Seconds to wait between requests (default: 3); values of 0 or
            less disable the wait

    Returns:
        Count of successful fetches
    """
    jobs_to_fetch = storage.get_jobs_without_description()
    total = len(jobs_to_fetch)
    successful = 0

    print(f"Fetching descriptions for {total} jobs...\n")

    for i, job in enumerate(jobs_to_fetch, 1):
        job_title = job.get('job_title', 'Unknown')
        job_url = job.get('job_url')

        print(f"[{i}/{total}] Fetching description for: {job_title}...")

        try:
            description = fetch_job_description(job_url)

            # Only real, non-empty text counts as a description; None or any
            # other object returned by the fetcher is treated as a failure.
            if isinstance(description, str) and description:
                _store_description(storage, job, description)
                successful += 1
                print(f"  ✓ Success ({len(description)} chars)")
            else:
                _store_description(storage, job, "ERROR")
                print(f"  ✗ Failed to extract description")

        except Exception as e:
            _store_description(storage, job, "ERROR")
            print(f"  ✗ Error: {e}")

        # Sleep between requests to be polite (not needed after the last one)
        if i < total and delay > 0:
            time.sleep(delay)

    print(f"\nCompleted: {successful}/{total} descriptions fetched successfully")
    return successful

In [None]:
def scrape_multiple_pages(job_title, location, time_filter=None, limit=25):
    """
    Collect job cards page by page until ``limit`` jobs have been gathered.

    Paging stops early when a page yields no jobs, returns a non-200 status,
    or raises.

    Args:
        job_title: Job title to search
        location: Location to search
        time_filter: Accepted but not used by this function (placeholder)
        limit: Maximum number of jobs to return

    Returns:
        List of job dictionaries (without descriptions), at most ``limit`` long
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.linkedin.com/"
    }

    collected = []
    offset = 0

    while len(collected) < limit:
        page_url = build_linkedin_search_url(job_title, location, offset)

        try:
            page = requests.get(page_url, headers=request_headers, timeout=10)

            if page.status_code != 200:
                print(f"Failed to fetch page at start={offset}")
                break

            page_jobs = parse_job_cards(page.text)
            if not page_jobs:
                # An empty page means there is nothing further to collect
                break

            collected.extend(page_jobs)

            if len(collected) >= limit:
                # Trim any overshoot from the final page
                collected = collected[:limit]
                break

            offset += 25
            time.sleep(2)  # Pause between page requests

        except Exception as e:
            print(f"Error fetching page: {e}")
            break

    return collected

In [None]:
def scrape_with_storage(job_title, location, time_filter=None, limit=25):
    """
    Run the full pipeline: scrape job cards, store them, fetch descriptions.

    Args:
        job_title: Job title to search
        location: Location to search
        time_filter: Passed through to scrape_multiple_pages()
        limit: Maximum number of jobs to fetch

    Returns:
        The list of stored jobs after the description pass
    """
    divider = '=' * 80
    store = JobStorage()

    print(f"Scraping {limit} jobs for '{job_title}' in '{location}'...\n")

    # Step 1: collect the cards (no descriptions yet) and put them in storage
    store.add_jobs(scrape_multiple_pages(job_title, location, time_filter, limit))

    found = store.get_stats()['total_jobs']
    print(f"\n{divider}")
    print(f"Found {found} jobs, fetching descriptions...")
    print(f"{divider}\n")

    # Step 2: fill in descriptions, waiting 2 seconds between requests
    fetch_all_descriptions(store, delay=2)

    # Step 3: report the outcome
    summary = store.get_stats()
    print(f"\n{divider}")
    print("Final Statistics:")
    print(f"  Total jobs: {summary['total_jobs']}")
    print(f"  With descriptions: {summary['jobs_with_descriptions']}")
    print(f"  Without descriptions: {summary['jobs_without_descriptions']}")
    print(f"  Errors: {summary['jobs_with_errors']}")
    print(f"{divider}\n")

    return store.get_all_jobs()

In [None]:
# Display the notebook-level `jobs` variable.
# NOTE(review): no cell above this one assigns `jobs` at top level, so this relies on a later cell having been run first.
jobs

In [None]:
# End-to-end check: scrape 10 jobs, fetch their descriptions, summarize.
print("Testing JobStorage with 10 Data Scientist jobs...\n")

# Run the complete workflow
jobs = scrape_with_storage("Data Scientist", "remote", "past week", 10)

# Summarize what came back
print("\nVerification:")
print(f"✓ Total jobs returned: {len(jobs)}")

jobs_with_desc = len([j for j in jobs if j.get('description') and j['description'] != "ERROR"])
print(f"✓ Jobs with descriptions: {jobs_with_desc}")

jobs_with_errors = len([j for j in jobs if j.get('description') == "ERROR"])
print(f"✓ Jobs with errors: {jobs_with_errors}")

# Print the first job as a sample, when there is one
if jobs:
    sample = jobs[0]
    print(f"\n{'='*80}")
    print("Sample Job (first one):")
    print(f"{'='*80}")
    print(f"Title: {sample['job_title']}")
    print(f"Company: {sample['company']}")
    print(f"Location: {sample['location']}")
    print(f"Posted: {sample['posted_ago']}")
    print(f"Job ID: {sample['job_id']}")

    desc = sample.get('description', 'N/A')
    if not desc or desc == "ERROR":
        # Nothing usable was fetched: show the raw marker as-is
        print(f"Description: {desc}")
    else:
        # Preview at most the first 200 characters
        if len(desc) > 200:
            desc_preview = desc[:200] + "..."
        else:
            desc_preview = desc
        print(f"Description preview: {desc_preview}")

In [None]:
# Display the jobs returned by the workflow test above.
jobs 


In [None]:
# Install Selenium, which the browser-driven scrapers below import.
!pip install selenium

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options  

In [None]:
# Install or upgrade python-jobspy, which provides the `jobspy` module imported below.
pip install -U python-jobspy

In [None]:
# Show the version of the `python` executable found on the shell PATH.
!python --version

In [None]:
import csv
from jobspy import scrape_jobs

# Query several job boards in one call through jobspy's scrape_jobs().
# NOTE: this rebinds the notebook-level `jobs` name used by earlier cells.
jobs = scrape_jobs(
    site_name=["indeed", "linkedin", "zip_recruiter", "google"], # "glassdoor", "bayt", "naukri", "bdjobs"
    search_term="software engineer",
    google_search_term="software engineer jobs near Austin, TX",
    location="Austin, TX",
    results_wanted=20,
    hours_old=72,
    country_indeed='USA',
    
    # linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
    # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
)
print(f"Found {len(jobs)} jobs")
# The result is used like a DataFrame here (.head(), .to_csv())
print(jobs.head())
# Write every row to jobs.csv, quoting all non-numeric fields
jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False) # to_excel

In [None]:
# Peek at the first entry of the job_url column.
jobs["job_url"][:1]

In [None]:
# Display the full scrape_jobs() result.
jobs

In [None]:
"""
linkedin_jobs_scraper.py
Polite Selenium scraper for public LinkedIn job search pages.
WARNING: Do not use to bypass protections. Check robots.txt and ToS.
"""

import json
import time
import random
import csv
from pathlib import Path
from typing import List, Dict, Optional

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ---- CONFIG ----
HEADLESS = False  # running non-headless may reduce chances of blocking during development
MAX_PAGES = 5     # how many pages of results to fetch (be conservative)
# Lower and upper bound, in seconds, of the random pause taken by random_delay()
DELAY_MIN = 4.0
DELAY_MAX = 8.0
# File the __main__ block writes the scraped jobs to, as JSON
OUTPUT_FILE = "jobs.json"

# Example LinkedIn job search URL (replace query/location as needed)
# You can create the URL by using LinkedIn job search and copying the URL from your browser.
# Example structure: "https://www.linkedin.com/jobs/search/?keywords=data%20scientist&location=United%20States"
SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords=software%20engineer&location=United%20States"

# ---- helpers ----
def random_delay():
    """Pause for a duration drawn uniformly between DELAY_MIN and DELAY_MAX seconds."""
    pause_seconds = random.uniform(DELAY_MIN, DELAY_MAX)
    time.sleep(pause_seconds)

def build_driver(headless: bool = HEADLESS):
    """
    Create a Chrome WebDriver configured for the scraper.

    Args:
        headless: When true, Chrome is started with the headless flags.

    Returns:
        A Chrome driver with a 60-second page-load timeout.
    """
    chrome_opts = Options()

    # Command-line flags, applied in this order
    flags = []
    if headless:
        flags += ["--headless=new", "--disable-gpu"]
    flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1200,900",  # fixed window size for reproducible layouts
    ]
    for flag in flags:
        chrome_opts.add_argument(flag)

    # Exclude Chrome's "enable-automation" switch and turn off the automation extension
    chrome_opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_opts.add_experimental_option('useAutomationExtension', False)

    chrome_service = ChromeService(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=chrome_opts)
    browser.set_page_load_timeout(60)
    return browser

def _extract_from_card(card, selector, attribute=None):
    """Return the text (or ``attribute``) of the first element matching ``selector``, else None."""
    try:
        element = card.find_element(By.CSS_SELECTOR, selector)
        if attribute is not None:
            return element.get_attribute(attribute)
        return element.text.strip()
    except Exception:
        # A missing or unreadable element just means "field not available".
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # propagate so a long scrape can still be interrupted.
        return None


def parse_job_card(card) -> Dict:
    """
    Extract the visible fields of one job card element.

    Each field is looked up with a comma-separated list of CSS selectors so
    that several markup variants are covered; a field that cannot be read is
    set to None.

    Args:
        card: Element exposing Selenium's ``find_element`` method.

    Returns:
        Dict with the keys ``title``, ``company``, ``location``,
        ``date_posted``, ``job_link`` and ``snippet``.
    """
    return {
        "title": _extract_from_card(
            card, "a.job-card-list__title, a.job-card-list__link, h3"
        ),
        "company": _extract_from_card(
            card,
            "h4.job-card-container__company-name, a.job-card-container__company-name, .job-card-container__company-name",
        ),
        "location": _extract_from_card(
            card, ".job-card-container__metadata-item, .job-card-list__location"
        ),
        "date_posted": _extract_from_card(
            card, "time, .job-card-list__footer-wrapper span"
        ),
        # The link is read from the href attribute rather than the element text
        "job_link": _extract_from_card(
            card, "a.job-card-list__title, a.job-card-list__link, a", attribute="href"
        ),
        "snippet": _extract_from_card(
            card, ".job-card-list__snippet, .job-card-container__description-snippet"
        ),
    }

def _go_to_next_results_page(driver):
    """Advance to the next results page: click "Next" if possible, else bump the ``start`` offset."""
    import urllib.parse as up

    try:
        next_btn = driver.find_element(By.CSS_SELECTOR, "button[aria-label='Next'], button[aria-label='next'], a[aria-label='Next']")
        if next_btn and next_btn.is_enabled():
            driver.execute_script("arguments[0].scrollIntoView(true);", next_btn)
            random_delay()
            next_btn.click()
        else:
            raise Exception("Next button not clickable")
    except Exception:
        # Fallback: rewrite the URL with start increased by 25
        parsed = up.urlparse(driver.current_url)
        qs = up.parse_qs(parsed.query)
        start = int(qs.get("start", ["0"])[0])
        qs["start"] = [str(start + 25)]
        new_q = up.urlencode(qs, doseq=True)
        new_url = up.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, new_q, parsed.fragment))
        print("Navigating to next page URL:", new_url)
        driver.get(new_url)


def _dedupe_jobs(jobs: List[Dict]) -> List[Dict]:
    """Drop repeated jobs, keyed by link or, when there is no link, by title and company."""
    unique = []
    seen = set()
    for job in jobs:
        # parse_job_card stores None for unreadable fields, so `or ""` is
        # needed here: .get(key, "") would hand back the stored None and make
        # the string concatenation fail.
        key = job.get("job_link") or ((job.get("title") or "") + "|" + (job.get("company") or ""))
        if key not in seen:
            seen.add(key)
            unique.append(job)
    return unique


def scrape_linkedin_jobs(search_url: str, max_pages: int = MAX_PAGES) -> List[Dict]:
    """
    Scrape up to ``max_pages`` pages of job cards starting from ``search_url``.

    Args:
        search_url: LinkedIn job-search URL to open first.
        max_pages: Number of result pages to read.

    Returns:
        De-duplicated list of job dicts as produced by parse_job_card().
        The browser is always closed before returning or raising.
    """
    driver = build_driver()
    wait = WebDriverWait(driver, 20)
    results = []

    try:
        driver.get(search_url)
        random_delay()

        for page_num in range(max_pages):
            # Wait for job cards to appear; a timeout is reported but not fatal
            try:
                wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-job-id], .job-card-container, .job-card-list__entity")))
            except Exception as e:
                print("No job cards found on page:", e)

            cards = driver.find_elements(By.CSS_SELECTOR, "[data-job-id], .job-card-container, .job-card-list__entity, .result-card")
            print(f"Page {page_num+1}: found {len(cards)} cards")
            for c in cards:
                job = parse_job_card(c)
                # Keep only cards that yielded at least a link or a title
                if job.get("job_link") or job.get("title"):
                    results.append(job)

            # Nothing is read after the last requested page, so do not load
            # (and wait for) one more page.
            if page_num + 1 < max_pages:
                _go_to_next_results_page(driver)
                random_delay()

    finally:
        driver.quit()

    return _dedupe_jobs(results)

# ---- main entrypoint ----
if __name__ == "__main__":
    # Delete any existing output file before scraping (no error if it is absent)
    Path(OUTPUT_FILE).unlink(missing_ok=True)
    print("Starting scrape:", SEARCH_URL)
    scraped = scrape_linkedin_jobs(SEARCH_URL, max_pages=MAX_PAGES)
    print(f"Scraped {len(scraped)} unique job entries")
    # ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(scraped, f, indent=2, ensure_ascii=False)
    print("Saved to", OUTPUT_FILE)


In [None]:
# Install webdriver-manager, which provides ChromeDriverManager for the Selenium cells.
!pip install webdriver-manager

In [None]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def human_sleep(base_delay=3, variance=1.5):
    """Sleep for ``base_delay`` plus uniform jitter within ±``variance`` seconds, but at least one second."""
    jitter = random.uniform(-variance, variance)
    # The floor of 1.0 keeps the sleep time positive
    time.sleep(max(base_delay + jitter, 1.0))

def scrape_linkedin_jobs(job_title, location, num_jobs=25, base_delay=4):
    """
    Educational / academic version of LinkedIn job scraper.

    Opens the LinkedIn jobs page in headless Chrome, fills in the search
    form, scrolls to load results, and reads up to ``num_jobs`` job cards,
    pausing for a randomized time between steps.

    Args:
        job_title: Text typed into the title/skill/company search box.
        location: Text typed into the location search box.
        num_jobs: Maximum number of job cards to read.
        base_delay: Base number of seconds for the randomized pauses.

    Returns:
        pandas DataFrame with the columns ``title``, ``company``,
        ``location`` and ``link`` (one row per card that parsed cleanly).

    Raises:
        Whatever Selenium raises when the page or the search inputs cannot
        be found; the browser is closed before the exception propagates.
    """
    # Set up browser (headless for efficiency)
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    data = []
    try:
        driver.get("https://www.linkedin.com/jobs")
        human_sleep(base_delay)

        # Input fields
        title_box = driver.find_element(By.CSS_SELECTOR, "input[aria-label='Search by title, skill, or company']")
        location_box = driver.find_element(By.CSS_SELECTOR, "input[aria-label='City, state, or zip code']")

        # Perform search
        title_box.clear()
        title_box.send_keys(job_title)
        human_sleep(base_delay)
        location_box.clear()
        location_box.send_keys(location)
        location_box.send_keys(Keys.RETURN)
        human_sleep(base_delay * 2)

        # Scroll gradually to load more jobs
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, document.body.scrollHeight / 3);")
            human_sleep(base_delay)

        # Capture job listings
        job_cards = driver.find_elements(By.CSS_SELECTOR, ".jobs-search-results__list-item")[:num_jobs]
        human_sleep(base_delay)

        for idx, card in enumerate(job_cards, start=1):
            try:
                title = card.find_element(By.CSS_SELECTOR, "h3").text.strip()
                company = card.find_element(By.CSS_SELECTOR, ".base-search-card__subtitle").text.strip()
                loc = card.find_element(By.CSS_SELECTOR, ".job-search-card__location").text.strip()
                link = card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                data.append({
                    "title": title,
                    "company": company,
                    "location": loc,
                    "link": link
                })
                print(f"[{idx}] Scraped: {title} at {company}")
                human_sleep(base_delay + random.uniform(1, 3))
            except Exception as e:
                # One unreadable card must not abort the remaining ones
                print(f"Skipped a job card due to parsing error: {e}")
                continue
    finally:
        # Always shut the browser down, including when a lookup above raises;
        # otherwise a failed run leaves a headless Chrome process behind.
        driver.quit()

    return pd.DataFrame(data)

# Entry point: scrape a small sample and save it as CSV.
if __name__ == "__main__":
    print("Starting polite LinkedIn job scrape (for academic research)...\n")
    df = scrape_linkedin_jobs("AI Engineer", "Boston, MA", num_jobs=10)
    # index=False leaves the DataFrame's row index out of the file
    df.to_csv("linkedin_jobs_sample.csv", index=False)
    print("\nDone. Results saved to linkedin_jobs_sample.csv")

In [None]:
import os
# Show the LINKEDIN_EMAIL environment variable (prints None when it is unset).
# NOTE(review): this echoes an account e-mail into the notebook output; consider clearing the output before sharing.
print(os.getenv("LINKEDIN_EMAIL"))