In [12]:
!pip install --upgrade selenium

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, NoSuchElementException

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

In [3]:
#!/usr/bin/env python
from webdriver_manager.chrome import ChromeDriverManager

def initialize_driver():
    """Start a Chrome WebDriver session with reduced automation fingerprints.

    No ``--user-data-dir`` or ``--incognito`` argument is set here, so Chrome
    runs with whatever profile chromedriver starts by default. The chromedriver
    binary path is obtained from ``ChromeDriverManager().install()``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()

    options.add_argument("--disable-blink-features=AutomationControlled")
    # Experimental options are stored by name, so calling
    # add_experimental_option("excludeSwitches", ...) twice keeps only the last
    # list. Both switches are therefore excluded in a single call; previously
    # "enable-automation" was silently overwritten by "enable-logging".
    options.add_experimental_option(
        "excludeSwitches", ["enable-automation", "enable-logging"]
    )
    options.add_experimental_option("useAutomationExtension", False)

    # One user agent is picked at random per session.
    # NOTE(review): the last entry is a mobile (iPhone) user agent while the
    # window is sized for desktop; if the site serves a different layout to
    # mobile agents, the desktop CSS selectors used elsewhere may not match.
    # Confirm whether it should stay in the pool.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
    ]
    options.add_argument(f"user-agent={random.choice(user_agents)}")

    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("start-maximized")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def get_indeed_url(job_keyword, location):
    """Construct the Indeed (pk.indeed.com) job-search URL.

    Args:
        job_keyword: Search phrase placed in the ``q`` query parameter.
        location: Location phrase placed in the ``l`` query parameter.

    Returns:
        str: The full search URL. Both values are form-encoded, so spaces
        become ``+`` and reserved characters such as ``&``, ``+`` or ``#`` are
        percent-escaped instead of corrupting the query string.
    """
    # Local import keeps this function self-contained within its cell.
    from urllib.parse import urlencode

    base_url = "https://pk.indeed.com/jobs"
    # urlencode() uses quote_plus by default: ' ' -> '+', everything unsafe -> %XX.
    query = urlencode({"q": job_keyword, "l": location})
    return f"{base_url}?{query}"

def human_like_scroll(driver):
    """Scroll the current page towards the bottom in a few timed increments.

    A random number of steps (2-5) and a random per-step delay (100-200 ms)
    are substituted into a small JavaScript snippet, which is handed to
    ``driver.execute_script``. The snippet schedules its own follow-up steps
    with ``setTimeout``; afterwards this function sleeps for 0.5-1.5 seconds.

    Args:
        driver: Object exposing ``execute_script(script)``.
    """
    step_count = random.randint(2, 5)
    step_delay_ms = random.randint(100, 200)

    # Placeholders, in order: number of steps, delay between steps (ms).
    js_template = """
    var scrollHeight = document.body.scrollHeight;
    var currentScroll = 0;
    var scrollSteps = %d;
    var scrollStep = scrollHeight / scrollSteps;
    function smoothScroll() {
        if (currentScroll < scrollHeight) {
            currentScroll += scrollStep;
            window.scrollTo(0, currentScroll);
            setTimeout(smoothScroll, %d);
        }
    }
    smoothScroll();
    """
    rendered_js = js_template % (step_count, step_delay_ms)
    driver.execute_script(rendered_js)

    pause_seconds = random.uniform(0.5, 1.5)
    time.sleep(pause_seconds)

def random_mouse_movement(driver):
    """Move the mouse pointer by one or two small random offsets.

    Each offset is within +/-30 pixels on both axes and is followed by a
    0.1-0.3 second pause. The whole chain is performed in one go.

    The movement is cosmetic, so a pointer that would leave the viewport is
    reported and ignored rather than raised: callers wrap this in broad
    error handling and would otherwise discard an entire scrape.

    Args:
        driver: Selenium WebDriver whose pointer should be moved.
    """
    # Local import: the exception class is not part of the notebook's import cell.
    from selenium.common.exceptions import MoveTargetOutOfBoundsException

    actions = ActionChains(driver)
    for _ in range(random.randint(1, 2)):
        x_offset = random.randint(-30, 30)
        y_offset = random.randint(-30, 30)
        # Offsets are relative to the current pointer position and may be negative.
        actions.move_by_offset(x_offset, y_offset)
        actions.pause(random.uniform(0.1, 0.3))

    try:
        actions.perform()
    except MoveTargetOutOfBoundsException:
        # Happens when the accumulated offsets push the pointer outside the
        # viewport (e.g. a negative offset while the pointer sits near the origin).
        print("Skipped random mouse movement: target outside the viewport.")

def scrape_job_details(driver, job_url):
    """Open ``job_url`` in a temporary tab and scrape the job's details.

    A blank tab is opened with ``window.open``, the job page is loaded in it,
    and the title, company, location and description are read via CSS/ID
    selectors. The temporary tab is always closed afterwards and focus is
    returned to the window that was active when the function was called.

    Args:
        driver: Selenium WebDriver; its current window is the one returned to.
        job_url: URL of the individual job page.

    Returns:
        dict | None: A dict with keys ``job_url``, ``job_role``, ``company``,
        ``job_location`` and ``job_description`` on success. ``None`` if any
        step fails; the error is printed rather than raised.
    """
    original_handle = None
    new_tab = None
    job_details = None

    try:
        original_handle = driver.current_window_handle
        handles_before = set(driver.window_handles)

        driver.execute_script("window.open('');")
        time.sleep(random.uniform(0.5, 1.2))

        # Find the tab that was just opened by diffing the handle list rather
        # than assuming it is index 1: other tabs may already exist.
        opened = [h for h in driver.window_handles if h not in handles_before]
        if not opened:
            raise RuntimeError("New browser tab did not open")
        new_tab = opened[-1]
        driver.switch_to.window(new_tab)

        print(f"Scraping job URL: {job_url}")
        driver.get(job_url)

        # The title header is used as the signal that the job page has rendered.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.jobsearch-JobInfoHeader-title'))
        )
        time.sleep(random.uniform(1.0, 2.0))
        random_mouse_movement(driver)

        job_title = driver.find_element(By.CSS_SELECTOR, 'h1.jobsearch-JobInfoHeader-title').text
        company_name = driver.find_element(By.CSS_SELECTOR, 'div[data-company-name="true"]').text
        job_location = driver.find_element(By.CSS_SELECTOR, 'div.jobsearch-JobInfoHeader-subtitle div:last-child').text
        job_description = driver.find_element(By.ID, 'jobDescriptionText').text

        job_details = {
            'job_url': job_url,
            'job_role': job_title,
            'company': company_name,
            'job_location': job_location,
            'job_description': job_description
        }

    except Exception as e:
        print(f"Error in scrape_job_details: {str(e)}")
    finally:
        # Close ONLY the tab this function opened. If opening the tab failed,
        # the active window is still the caller's and must not be closed.
        if new_tab is not None:
            try:
                driver.switch_to.window(new_tab)
                driver.close()
            except Exception as cleanup_error:
                print(f"Error closing job tab: {cleanup_error}")
        if original_handle is not None:
            driver.switch_to.window(original_handle)
        time.sleep(random.uniform(1.0, 2.0))

    return job_details

def main(job_keyword="Data Scientist", location="Pakistan", output_path='single_job_data.csv'):
    """Scrape only the first job listing from the search results page.

    Starts a browser, loads the search results for ``job_keyword`` and
    ``location``, scrapes the first listing that matches ``a.jcs-JobTitle``
    and writes it as a one-row CSV. The browser is always quit, whether the
    run succeeds, times out or raises.

    Args:
        job_keyword: Search phrase passed to ``get_indeed_url``.
        location: Location passed to ``get_indeed_url``.
        output_path: Destination CSV file for the scraped row.
    """
    driver = initialize_driver()
    url = get_indeed_url(job_keyword, location)
    job_details = None

    try:
        print(f"Target URL: {url}")
        driver.get(url)
        time.sleep(random.uniform(2.0, 4.0))
        human_like_scroll(driver)

        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a.jcs-JobTitle'))
            )
        except TimeoutException:
            print("Timeout: Job listing did not load.")
            # The enclosing ``finally`` quits the driver; no explicit quit here,
            # which previously resulted in quit() being called twice.
            return

        first_job = driver.find_element(By.CSS_SELECTOR, 'a.jcs-JobTitle')
        # Read the link BEFORE clicking: the click can re-render or navigate
        # the page, after which the element reference may be stale.
        job_url = first_job.get_attribute('href')

        ActionChains(driver).move_to_element(first_job).pause(random.uniform(0.2, 0.7)).click().perform()
        time.sleep(random.uniform(1.0, 2.0))

        if job_url:
            job_details = scrape_job_details(driver, job_url)
        else:
            print("First job listing has no link; nothing to scrape.")

    finally:
        driver.quit()

    if job_details:
        df = pd.DataFrame([job_details])
        df.to_csv(output_path, index=False)
        print(f"Saved 1 job to {output_path}")
    else:
        print("No data to save")

# Entry point: run the scraper with its default arguments when this code is
# executed as the main module (not when it is imported).
if __name__ == "__main__":
    main()


Target URL: https://pk.indeed.com/jobs?q=Data+Scientist&l=Pakistan
Timeout: Job listing did not load.
