In [1]:
import urllib.parse
import requests
from bs4 import BeautifulSoup
import re
import time

In [2]:
def build_linkedin_search_url(job_title, location, start=0):
    """
    Constructs a properly formatted LinkedIn job search URL.

    Args:
        job_title: Job title to search for (e.g., "Software Engineer")
        location: Location to search in (e.g., "San Francisco, CA" or "remote")
        start: Pagination offset (0, 25, 50, etc.)

    Returns:
        Properly formatted and URL-encoded LinkedIn job search URL
    """
    base_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"

    params = {
        "keywords": job_title,
        "location": location,
        "start": start
    }

    query_string = urllib.parse.urlencode(params)
    return f"{base_url}?{query_string}"


def test_linkedin_connection(url):
    """
    Tests HTTP connectivity to LinkedIn job search URL.

    Args:
        url: The URL to test

    Returns:
        Tuple of (status_code, response_length)
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.linkedin.com/"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        return response.status_code, len(response.text)
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to LinkedIn: {e}")
        return None, 0


In [3]:
url = build_linkedin_search_url("Data Scientist", "remote")
print(f"Generated URL: {url}\n")

status, length = test_linkedin_connection(url)

if status:
    print(f"Status Code: {status}")
    print(f"Response Length: {length} characters")

    # Verify expectations
    if status == 200 and length > 1000:
        print("\n✓ Connection test passed!")
    else:
        print(f"\n✗ Connection test failed - Status: {status}, Length: {length}")
else:
    print("✗ Connection failed")


Generated URL: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Scientist&location=remote&start=0

Status Code: 429
Response Length: 24633 characters

✗ Connection test failed - Status: 429, Length: 24633


In [4]:
def parse_job_cards(html_content):
    """
    Parses LinkedIn job search HTML to extract job listings.
    
    Args:
        html_content: HTML string from LinkedIn job search
    
    Returns:
        List of dictionaries containing job information
    """
    jobs = []
    
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find all job card list items
        job_cards = soup.find_all('li')
        
        for card in job_cards:
            try:
                # Extract job title
                title_elem = card.find(class_='base-search-card__title')
                job_title = title_elem.get_text(strip=True) if title_elem else "N/A"
                
                # Extract company name
                company_elem = card.find(class_='base-search-card__subtitle')
                company = company_elem.get_text(strip=True) if company_elem else "N/A"
                
                # Extract location
                location_elem = card.find(class_='job-search-card__location')
                location = location_elem.get_text(strip=True) if location_elem else "N/A"
                
                # Extract posted date
                posted_elem = card.find(class_='job-search-card__listdate')
                posted_ago = posted_elem.get_text(strip=True) if posted_elem else "N/A"
                
                # Extract job URL
                link_elem = card.find('a', class_='base-card__full-link')
                job_url = link_elem.get('href', 'N/A') if link_elem else "N/A"
                
                # Extract job ID from URL
                job_id = None
                if job_url != "N/A":
                    match = re.search(r'/jobs/view/(\d+)', job_url)
                    if match:
                        job_id = match.group(1)
                
                # Only add if we have at least a title and URL
                if job_title != "N/A" and job_url != "N/A":
                    jobs.append({
                        'job_id': job_id,
                        'job_title': job_title,
                        'company': company,
                        'location': location,
                        'posted_ago': posted_ago,
                        'job_url': job_url
                    })
                    
            except Exception as e:
                # Skip individual cards that fail to parse
                print(f"Warning: Failed to parse a job card: {e}")
                continue
        
    except Exception as e:
        print(f"Error parsing HTML content: {e}")
        return []
    
    return jobs

In [5]:
def fetch_jobs(title: str):
    # Test with real request: Fetch Software Engineer jobs in remote
    test_url = build_linkedin_search_url(title, "remote")
    print(f"Fetching jobs from: {test_url}\n")

    # Make the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.linkedin.com/"
    }

    try:
        response = requests.get(test_url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            print(f"✓ Successfully fetched data (Status: {response.status_code})\n")
            
            # Parse the jobs
            jobs = parse_job_cards(response.text)
            
            print(f"Found {len(jobs)} jobs\n")
            print("=" * 80)
            print("First 3 jobs:")
            print("=" * 80)
            
            for i, job in enumerate(jobs, 1):
                print(f"\n{i}. {job['job_title']}")
                print(f"   Company: {job['company']}")
                print(f"   Location: {job['location']}")
                print(f"   Posted: {job['posted_ago']}")
                print(f"   Job ID: {job['job_id']}")
                print(f"   URL: {job['job_url']}..." if len(job['job_url']) > 80 else f"   URL: {job['job_url']}")
                
                try:
                    # Verify job_url and job_id exist
                    assert job['job_url'] is not None and job['job_url'] != "N/A", f"Job {i} missing URL"
                    assert job['job_id'] is not None, f"Job {i} missing ID"
                except:
                    continue
            
            print("\n" + "=" * 80)
            print("✓ All tests passed! All jobs have valid URLs and IDs.")
            
        else:
            print(f"✗ Failed to fetch data (Status: {response.status_code})")
            
    except Exception as e:
        print(f"✗ Error during test: {e}")

In [6]:
fetch_jobs("Machine Learning Engineer")

Fetching jobs from: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Machine+Learning+Engineer&location=remote&start=0

✓ Successfully fetched data (Status: 200)

Found 10 jobs

First 3 jobs:

1. Machine Learning Engineer
   Company: Certara
   Location: United States
   Posted: 1 day ago
   Job ID: None
   URL: https://www.linkedin.com/jobs/view/machine-learning-engineer-at-certara-4304414415?position=1&pageNum=0&refId=grRwL11JXg2Z3NMNgrFjiw%3D%3D&trackingId=0wuWxJ1ynu7nTRrPsP0ZHQ%3D%3D...

2. Research Executive - Data Scientist
   Company: Unilever
   Location: Bengaluru, Karnataka, India
   Posted: 2 days ago
   Job ID: None
   URL: https://in.linkedin.com/jobs/view/research-executive-data-scientist-at-unilever-4315071495?position=2&pageNum=0&refId=grRwL11JXg2Z3NMNgrFjiw%3D%3D&trackingId=DGIoeFrPZgXfyTwcKu9Z7g%3D%3D...

3. Staff Data Scientist
   Company: LinkedIn
   Location: Bengaluru, Karnataka, India
   Posted: 2 days ago
   Job ID: None
   URL: ht

In [7]:
class JobStorage:
    """
    In-memory storage for job data with description management.
    """
    
    def __init__(self):
        self.jobs = []
    
    def add_jobs(self, job_list):
        """
        Store jobs in internal list.
        
        Args:
            job_list: List of job dictionaries
        """
        for job in job_list:
            # Ensure each job has a description field
            if 'description' not in job:
                job['description'] = None
            self.jobs.append(job)
    
    def get_all_jobs(self):
        """
        Return all stored jobs.
        
        Returns:
            List of all job dictionaries
        """
        return self.jobs
    
    def get_jobs_without_description(self):
        """
        Return jobs where description is None.
        
        Returns:
            List of jobs without descriptions
        """
        return [job for job in self.jobs if job.get('description') is None]
    
    def update_job_description(self, job_id, description):
        """
        Update specific job's description.
        
        Args:
            job_id: Job ID to update
            description: Description text to set
        """
        for job in self.jobs:
            if job.get('job_id') == job_id:
                job['description'] = description
                return True
        return False
    
    def get_stats(self):
        """
        Return statistics about stored jobs.
        
        Returns:
            Dictionary with job statistics
        """
        total = len(self.jobs)
        with_desc = sum(1 for job in self.jobs if job.get('description') and job.get('description') != "ERROR")
        without_desc = sum(1 for job in self.jobs if job.get('description') is None)
        errors = sum(1 for job in self.jobs if job.get('description') == "ERROR")
        
        return {
            'total_jobs': total,
            'jobs_with_descriptions': with_desc,
            'jobs_without_descriptions': without_desc,
            'jobs_with_errors': errors
        }

In [39]:
def fetch_job_description(job_url):
    """
    Fetch job description from LinkedIn job URL.
    
    Args:
        job_url: URL to the job posting
    
    Returns:
        Job description text or None if fetch fails
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.linkedin.com/"
    }
    
    try:
        response = requests.get(job_url, headers=headers, timeout=10)
        
        print(response)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            print("nigga 2")
            # Try multiple possible selectors for job description
            desc_elem = (
                soup.find('div', class_='show-more-less-html__markup') or
                soup.find('div', class_='description__text') or
                soup.find('section', class_='description') or
                soup.find('div', class_='description') or
                soup.find('article',class_='jobs-description__container jobs-description__container--condensed') or 
                soup.find('div',class_='jobs-description__content jobs-description-content jobs-description__content--condensed') or
                soup.find('div', class_=re.compile(r'^job_description'))

            )

            print("nigga")
            
            if desc_elem:
                return desc_elem
                description = desc_elem.get_text(strip=True)
                return description if description else None
        
        return None
        
    except Exception as e:
        print(f"Error fetching description: {e}")
        return None
    



In [40]:
desc = fetch_job_description('https://www.linkedin.com/jobs/view/data-scientist-data-analytics-%E2%80%93-customer-loyalty-marketing-at-circle-k-4316137579/?position=3&pageNum=0&refId=r8vm%2BrG%2B0ZTbNredBK8TzQ%3D%3D&trackingId=brKmbAor271tMuHOwExs2Q%3D%3D')

<Response [999]>


In [38]:
type(desc)

NoneType

In [9]:
def fetch_all_descriptions(storage, delay=3):
    """
    Fetch descriptions for all jobs without descriptions.
    
    Args:
        storage: JobStorage instance
        delay: Seconds to wait between requests (default: 3)
    
    Returns:
        Count of successful fetches
    """
    jobs_to_fetch = storage.get_jobs_without_description()
    total = len(jobs_to_fetch)
    successful = 0
    
    print(f"Fetching descriptions for {total} jobs...\n")
    
    for i, job in enumerate(jobs_to_fetch, 1):
        job_title = job.get('job_title', 'Unknown')
        job_id = job.get('job_id')
        job_url = job.get('job_url')
        
        print(f"[{i}/{total}] Fetching description for: {job_title}...")
        
        try:
            description = fetch_job_description(job_url)
            
            if description:
                storage.update_job_description(job_id, description)
                successful += 1
                print(f"  ✓ Success ({len(description)} chars)")
            else:
                storage.update_job_description(job_id, "ERROR")
                print(f"  ✗ Failed to extract description")
                
        except Exception as e:
            storage.update_job_description(job_id, "ERROR")
            print(f"  ✗ Error: {e}")
        
        # Sleep between requests to be polite
        if i < total:
            time.sleep(delay)
    
    print(f"\nCompleted: {successful}/{total} descriptions fetched successfully")
    return successful

In [15]:
def scrape_multiple_pages(job_title, location, time_filter=None, limit=25):
    """
    Scrape multiple pages of LinkedIn jobs.
    
    Args:
        job_title: Job title to search
        location: Location to search
        time_filter: Not implemented yet (placeholder)
        limit: Maximum number of jobs to fetch
    
    Returns:
        List of job dictionaries (without descriptions)
    """
    all_jobs = []
    start = 0
    
    while len(all_jobs) < limit:
        url = build_linkedin_search_url(job_title, location, start)
        
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": "https://www.linkedin.com/"
        }
        
        try:
            response = requests.get(url, headers=headers, timeout=10)
            
            if response.status_code == 200:
                jobs = parse_job_cards(response.text)
                
                if not jobs:
                    # No more jobs found
                    break
                
                all_jobs.extend(jobs)
                
                # Stop if we've reached the limit
                if len(all_jobs) >= limit:
                    all_jobs = all_jobs[:limit]
                    break
                
                start += 25
                time.sleep(2)  # Be polite between page requests
            else:
                print(f"Failed to fetch page at start={start}")
                break
                
        except Exception as e:
            print(f"Error fetching page: {e}")
            break
    
    return all_jobs

In [16]:
def scrape_with_storage(job_title, location, time_filter=None, limit=25):
    """
    Main workflow: Scrape jobs and fetch descriptions using storage.
    
    Args:
        job_title: Job title to search
        location: Location to search
        time_filter: Time filter (not implemented)
        limit: Maximum number of jobs to fetch
    
    Returns:
        List of all jobs with descriptions
    """
    # Initialize storage
    storage = JobStorage()
    
    print(f"Scraping {limit} jobs for '{job_title}' in '{location}'...\n")
    
    # Scrape job cards (without descriptions)
    jobs = scrape_multiple_pages(job_title, location, time_filter, limit)
    
    # Add to storage
    storage.add_jobs(jobs)
    
    # Print initial stats
    stats = storage.get_stats()
    print(f"\n{'='*80}")
    print(f"Found {stats['total_jobs']} jobs, fetching descriptions...")
    print(f"{'='*80}\n")
    
    # Fetch all descriptions
    fetch_all_descriptions(storage, delay=2)
    
    # Print final stats
    final_stats = storage.get_stats()
    print(f"\n{'='*80}")
    print("Final Statistics:")
    print(f"  Total jobs: {final_stats['total_jobs']}")
    print(f"  With descriptions: {final_stats['jobs_with_descriptions']}")
    print(f"  Without descriptions: {final_stats['jobs_without_descriptions']}")
    print(f"  Errors: {final_stats['jobs_with_errors']}")
    print(f"{'='*80}\n")
    
    return storage.get_all_jobs()

In [13]:
jobs

[{'job_id': None,
  'job_title': 'Cientista de Dados Jr.',
  'company': 'UOL',
  'location': 'São Paulo, São Paulo, Brazil',
  'posted_ago': '1 day ago',
  'job_url': 'https://br.linkedin.com/jobs/view/cientista-de-dados-jr-at-uol-4313390052?position=1&pageNum=0&refId=eL61fpBYTSOC3wKoUqnKqA%3D%3D&trackingId=fof3H7h20H28X2NouS1CMQ%3D%3D',
  'description': 'ERROR'},
 {'job_id': None,
  'job_title': 'Data Scientists - All Levels',
  'company': 'Lensa',
  'location': 'Boca Raton, FL',
  'posted_ago': 'N/A',
  'job_url': 'https://www.linkedin.com/jobs/view/data-scientists-all-levels-at-lensa-4305439528?position=2&pageNum=0&refId=eL61fpBYTSOC3wKoUqnKqA%3D%3D&trackingId=lDuRzruehWCxUFgNxyoKxw%3D%3D',
  'description': None},
 {'job_id': None,
  'job_title': 'Data Scientist, Data & Analytics – Customer, Loyalty & Marketing',
  'company': 'Circle K',
  'location': 'Charlotte, NC',
  'posted_ago': 'N/A',
  'job_url': 'https://www.linkedin.com/jobs/view/data-scientist-data-analytics-%E2%80%93-cust

In [17]:
# Test: Scrape 10 jobs with descriptions
print("Testing JobStorage with 10 Data Scientist jobs...\n")

# Test the complete workflow
jobs = scrape_with_storage("Data Scientist", "remote", "past week", 10)

# Verify results
print("\nVerification:")
print(f"✓ Total jobs returned: {len(jobs)}")

jobs_with_desc = sum(1 for j in jobs if j.get('description') and j['description'] != "ERROR")
print(f"✓ Jobs with descriptions: {jobs_with_desc}")

jobs_with_errors = sum(1 for j in jobs if j.get('description') == "ERROR")
print(f"✓ Jobs with errors: {jobs_with_errors}")

# Show sample job
if jobs:
    print(f"\n{'='*80}")
    print("Sample Job (first one):")
    print(f"{'='*80}")
    sample = jobs[0]
    print(f"Title: {sample['job_title']}")
    print(f"Company: {sample['company']}")
    print(f"Location: {sample['location']}")
    print(f"Posted: {sample['posted_ago']}")
    print(f"Job ID: {sample['job_id']}")
    
    desc = sample.get('description', 'N/A')
    if desc and desc != "ERROR":
        desc_preview = desc[:200] + "..." if len(desc) > 200 else desc
        print(f"Description preview: {desc_preview}")
    else:
        print(f"Description: {desc}")

Testing JobStorage with 10 Data Scientist jobs...

Scraping 10 jobs for 'Data Scientist' in 'remote'...


Found 10 jobs, fetching descriptions...

Fetching descriptions for 10 jobs...

[1/10] Fetching description for: Cientista de Dados Jr....
  ✗ Failed to extract description
[2/10] Fetching description for: Data Scientists - All Levels...
  ✗ Failed to extract description
[3/10] Fetching description for: Data Scientist, Data & Analytics – Customer, Loyalty & Marketing...
  ✗ Failed to extract description
[4/10] Fetching description for: Data Scientist (L5) - Ads (Forecasting)...
  ✗ Failed to extract description
[5/10] Fetching description for: Research Scientist, Mathematical Sciences...
  ✗ Failed to extract description
[6/10] Fetching description for: CIENTISTA DE DADOS PLENO (100% REMOTO)...
  ✗ Failed to extract description
[7/10] Fetching description for: Junior Data Scientist – New Grad...
  ✗ Failed to extract description
[8/10] Fetching description for: Associate Software En

In [18]:
jobs 


[{'job_id': None,
  'job_title': 'Cientista de Dados Jr.',
  'company': 'UOL',
  'location': 'São Paulo, São Paulo, Brazil',
  'posted_ago': '1 day ago',
  'job_url': 'https://br.linkedin.com/jobs/view/cientista-de-dados-jr-at-uol-4313390052?position=1&pageNum=0&refId=r8vm%2BrG%2B0ZTbNredBK8TzQ%3D%3D&trackingId=uIGPqRmKL2Zp%2BrRfYZhLww%3D%3D',
  'description': 'ERROR'},
 {'job_id': None,
  'job_title': 'Data Scientists - All Levels',
  'company': 'Lensa',
  'location': 'Boca Raton, FL',
  'posted_ago': 'N/A',
  'job_url': 'https://www.linkedin.com/jobs/view/data-scientists-all-levels-at-lensa-4305439528?position=2&pageNum=0&refId=r8vm%2BrG%2B0ZTbNredBK8TzQ%3D%3D&trackingId=j1JAMe1oKfIazQ%2Fz%2FohwGg%3D%3D',
  'description': None},
 {'job_id': None,
  'job_title': 'Data Scientist, Data & Analytics – Customer, Loyalty & Marketing',
  'company': 'Circle K',
  'location': 'Charlotte, NC',
  'posted_ago': 'N/A',
  'job_url': 'https://www.linkedin.com/jobs/view/data-scientist-data-analytics-