In [2]:
!pip install fake-useragent

Collecting fake-useragent
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Downloading fake_useragent-2.2.0-py3-none-any.whl (161 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/161.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m153.6/161.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fake-useragent
Successfully installed fake-useragent-2.2.0


In [3]:
import argparse
import csv
import logging
import os
import random
import re
import time
from datetime import datetime
from urllib.parse import urlencode, urljoin

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [11]:
# Configure basic logging.
# basicConfig() does nothing when the root logger already has handlers, in
# which case neither the INFO level nor the format below takes effect.
# force=True (Python 3.8+) removes any existing root handlers first so this
# configuration is always applied.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

class IndeedScraper:
    """Scrape job listings from Indeed search pages and filter them by company rating and review count.

    The scraper keeps one ``requests.Session`` for all page fetches. Call
    ``close()`` when done, or use the instance as a context manager, to
    release the session.
    """

    # Offset step applied to the ``start`` query parameter for each page.
    RESULTS_PER_PAGE = 10

    # First decimal number in a rating label, e.g. "4.2" in "4.2 out of 5 stars".
    _RATING_RE = re.compile(r'(\d+\.\d+)')
    # First integer in a review label, allowing thousands separators ("1,234").
    _REVIEW_COUNT_RE = re.compile(r'(\d[\d,]*)')

    def __init__(self, base_url="https://www.indeed.com"):
        """Store the base URL and create the user-agent generator and HTTP session.

        Args:
            base_url: Scheme and host that search and job links are built against.
        """
        self.base_url = base_url
        self.ua = UserAgent()
        self.session = requests.Session()

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the session on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def get_random_headers(self):
        """Return request headers with a freshly randomized User-Agent.

        Returns:
            dict: Header names mapped to values; only ``User-Agent`` varies
            between calls.
        """
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
        }

    def build_search_url(self, job_title, location, page=0):
        """Build the search URL for one results page.

        The query string is percent-encoded with ``urlencode`` so characters
        such as ``+``, ``&`` or ``#`` in the job title or location (e.g.
        "C++ Developer") cannot corrupt the URL.

        Args:
            job_title: Free-text job query.
            location: Free-text location.
            page: Zero-based page index.

        Returns:
            str: Absolute search URL.
        """
        query = urlencode({
            'q': job_title,
            'l': location,
            'start': page * self.RESULTS_PER_PAGE,
        })
        return f"{self.base_url}/jobs?{query}"

    @staticmethod
    def _parse_rating(text):
        """Return the first decimal number found in ``text`` as a float, or 0.0 if none."""
        match = IndeedScraper._RATING_RE.search(text or "")
        return float(match.group(1)) if match else 0.0

    @staticmethod
    def _parse_review_count(text):
        """Return the first integer found in ``text`` (commas ignored), or 0 if none."""
        match = IndeedScraper._REVIEW_COUNT_RE.search(text or "")
        if not match:
            return 0
        # "1,234" -> 1234; the pattern guarantees at least one digit remains.
        return int(match.group(1).replace(',', ''))

    def _parse_job_card(self, card):
        """Extract the job fields from one result card.

        Args:
            card: Parsed HTML element for a single job card.

        Returns:
            dict | None: The job fields, or ``None`` when the card has no
            title link with an ``href``.
        """
        title_elem = card.select_one('a.jcs-JobTitle') or \
                     card.select_one('a.jobtitle') or \
                     card.select_one('h2 a')
        if not title_elem or not title_elem.get('href'):
            return None

        company_elem = card.select_one('.companyName') or \
                       card.select_one('.company')
        company_link_elem = card.select_one('a[href*="/cmp/"]')
        rating_elem = card.select_one('.ratingsDisplay') or \
                      card.select_one('[class*="rating"]')
        reviews_elem = card.select_one('a[href*="/reviews"]')

        rating = 0.0
        if rating_elem:
            # Prefer the aria-label; fall back to the element's visible text.
            rating = self._parse_rating(rating_elem.get('aria-label') or rating_elem.text)

        review_count = 0
        if reviews_elem:
            review_count = self._parse_review_count(reviews_elem.text.strip())

        return {
            'Job Title': title_elem.text.strip(),
            'Company Name': company_elem.text.strip() if company_elem else "Unknown Company",
            'Company Overview Link': urljoin(self.base_url, company_link_elem.get('href')) if company_link_elem else "",
            'Job Post Link': urljoin(self.base_url, title_elem.get('href')),
            'Company Rating': rating,
            'Company Review Count': review_count,
        }

    def scrape_jobs(self, job_title, location, min_rating=0.0, max_rating=5.0,
                   min_reviews=0, max_reviews=999999, max_pages=10, delay_range=(2, 5)):
        """Fetch search result pages and collect the jobs that pass the filters.

        Scraping stops early on a request error, an unexpected error, a page
        containing the word "captcha", or a page with no job cards. Jobs
        collected before the stop are still returned.

        Args:
            job_title: Free-text job query.
            location: Free-text location.
            min_rating, max_rating: Inclusive bounds on the company rating.
                Cards without a parsable rating count as 0.0.
            min_reviews, max_reviews: Inclusive bounds on the review count.
                Cards without a parsable count count as 0.
            max_pages: Maximum number of result pages to request.
            delay_range: ``(low, high)`` seconds; a random delay in this range
                is slept between consecutive page requests.

        Returns:
            list[dict]: One dict per matching job, with the keys written by
            ``save_to_csv`` as CSV columns.
        """
        jobs = []
        page = 0

        logger.info(f"Starting job search for '{job_title}' in '{location}'")

        while page < max_pages:
            url = self.build_search_url(job_title, location, page)
            headers = self.get_random_headers()

            try:
                logger.info(f"Fetching page {page+1}: {url}")
                response = self.session.get(url, headers=headers, timeout=30)
                response.raise_for_status()

                # Check for a captcha before spending time parsing the page.
                if "captcha" in response.text.lower():
                    logger.error("Captcha detected. Stopping scrape.")
                    break

                soup = BeautifulSoup(response.text, 'html.parser')

                # Try several selectors; the first that yields cards wins.
                job_cards = soup.select('.jobsearch-ResultsList .job_seen_beacon') or \
                            soup.select('.jobsearch-SerpJobCard') or \
                            soup.select('[data-testid="job-card"]')

                if not job_cards:
                    logger.info("No more job cards found.")
                    break

                for card in job_cards:
                    job = self._parse_job_card(card)
                    if job is None:
                        continue

                    rating = job['Company Rating']
                    review_count = job['Company Review Count']
                    if (min_rating <= rating <= max_rating) and (min_reviews <= review_count <= max_reviews):
                        job['Date Scraped'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        jobs.append(job)

                logger.info(f"Page {page+1}: Found {len(jobs)} matching jobs so far")

            except requests.RequestException as e:
                logger.error(f"Error fetching page {page+1}: {str(e)}")
                break

            except Exception as e:
                # logger.exception keeps the traceback so parsing bugs are diagnosable.
                logger.exception("Unexpected error: %s", e)
                break

            page += 1
            # Only wait when another request will actually follow.
            if page < max_pages:
                delay = random.uniform(delay_range[0], delay_range[1])
                logger.info(f"Waiting {delay:.2f} seconds before next request...")
                time.sleep(delay)

        logger.info(f"Scraping completed. Found {len(jobs)} matching jobs.")
        return jobs

    def save_to_csv(self, jobs, filename=None):
        """Write the scraped jobs to a UTF-8 CSV file.

        Args:
            jobs: List of job dicts; the keys of the first dict become the
                CSV header.
            filename: Output path. When omitted, a timestamped
                ``indeed_jobs_YYYYMMDD_HHMMSS.csv`` name is used.

        Returns:
            str | None: The filename written, or ``None`` when ``jobs`` is
            empty and nothing was written.
        """
        if not jobs:
            logger.warning("No jobs found matching the criteria.")
            return None

        if not filename:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"indeed_jobs_{timestamp}.csv"

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=list(jobs[0].keys()))
            writer.writeheader()
            writer.writerows(jobs)

        logger.info(f"Saved {len(jobs)} jobs to {filename}")
        return filename

# Example run: build a scraper, search with the settings below, export any hits.
scraper = IndeedScraper()

# Search settings - edit these to change what is scraped.
job_title, location = "Software Developer", "New York, NY"
min_rating, max_rating = 3.5, 5.0
min_reviews, max_reviews = 10, 500
max_pages = 3  # kept small so the demonstration finishes quickly

# Fetch the listings that satisfy the rating/review bounds.
jobs = scraper.scrape_jobs(
    job_title,
    location,
    min_rating=min_rating,
    max_rating=max_rating,
    min_reviews=min_reviews,
    max_reviews=max_reviews,
    max_pages=max_pages,
)

# Report the outcome; a CSV is written only when something matched.
if not jobs:
    print("No jobs found matching your criteria.")
else:
    output_file = scraper.save_to_csv(jobs)
    print(f"Saved {len(jobs)} jobs to {output_file}")

ERROR:__main__:Error fetching page 1: 403 Client Error: Forbidden for url: https://www.indeed.com/jobs?q=Software+Developer&l=New+York%2C+NY&start=0


No jobs found matching your criteria.
