In [85]:
# # Install if you have never used these: unblock the lines below to install if needed
# !pip install webdriver-manager
# !pip3 install lxml
# !pip3 install selenium
# !pip3 install webdriver_manager
# !pip install --upgrade pip
# !pip install -U selenium
# !pip install msedge-selenium-tools


In [86]:
# --------- import necessary modules -------

# For webscraping
from bs4 import BeautifulSoup

# Parsing and creating xml data
from lxml import etree as et

# Store data as a csv file written out
from csv import writer

# In general to use with timing our function calls to Indeed
import time

# Assist with creating incremental timing for our scraping to seem more human
from time import sleep

# Dataframe stuff
import pandas as pd

# Random integer for more realistic timing for clicks, buttons and searches during scraping
from random import randint

# Multi Threading
import threading

# Threading:
from concurrent.futures import ThreadPoolExecutor, wait

In [87]:
import selenium

# Check version I am running
selenium.__version__

'4.15.2'

In [88]:
# Selenium 4:

from selenium import webdriver

# Starting/Stopping Driver: can specify ports or location but not remote access
from selenium.webdriver.chrome.service import Service as ChromeService

# Manages Binaries needed for WebDriver without installing anything directly
from webdriver_manager.chrome import ChromeDriverManager

In [89]:
# Allows searchs similar to beautiful soup: find_all
from selenium.webdriver.common.by import By

# Try to establish wait times for the page to load
from selenium.webdriver.support.ui import WebDriverWait

# Wait for specific condition based on defined task: web elements, boolean are examples
from selenium.webdriver.support import expected_conditions as EC

# Used for keyboard movements, up/down, left/right,delete, etc
from selenium.webdriver.common.keys import Keys

# Locate elements on page and throw error if they do not exist
from selenium.common.exceptions import NoSuchElementException

In [90]:
# Allows you to cusotmize: ingonito mode, maximize window size, headless browser, disable certain features, etc
option= webdriver.ChromeOptions()

# Going undercover:
option.add_argument("--incognito")


# # Consider this if the application works and you know how it works for speed ups and rendering!

# option.add_argument('--headless=chrome')

In [91]:
    # Define job search keyword for Data Analyst
    job_search_keyword = ['Python+developer']

    # Finding position, radius=35 miles, sort by date and starting page
    # pagination_url = 'https://www.indeed.com/jobs?q={}&l=' 
    pagination_url = 'https://in.indeed.com/jobs?q={}&l=&from=searchOnHP&vjk=71d709239fecfc67'  

    # Print the pagination URL for Data Analyst
    print(pagination_url.format(job_search_keyword[0], 0))


https://in.indeed.com/jobs?q=Python+developer&l=&from=searchOnHP&vjk=71d709239fecfc67


In [92]:
start = time.time()

job = 'Python+developer'

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)

driver.get(pagination_url.format(job_search_keyword[0], 0))

# Add a sleep to simulate human-like interactions
sleep(randint(2, 6))

# Find the job count element
job_count_element = driver.find_element(By.CLASS_NAME, 'jobsearch-JobCountAndSortPane-jobCount')

# Extract the job count text
job_count_text = job_count_element.text

# Extract the numerical job count
# job_count = int(job_count_text.split(' ')[0])
# Extract the numerical job count, removing commas
job_count = int(job_count_text.split(' ')[0].replace(',', ''))



# Calculate the maximum iterable pages
max_iter_pgs = job_count // 15

# Continue with the rest of your code if needed

driver.quit()  # Closing the browser session

end = time.time()

print(end - start, 'seconds to complete action!')
print('-----------------------')
print('Max Iterable Pages for this search:', max_iter_pgs)
 


21.27311062812805 seconds to complete action!
-----------------------
Max Iterable Pages for this search: 619


In [93]:
print(job_count)

9286


In [94]:
def try_except(func, default_value):
    try:
        return func()
    except NoSuchElementException:
        return default_value

start = time.time()

job = 'Python+developer'

job_lst = []
salary_list = []

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
sleep(randint(2, 6))

# Set a limit for the number of pages to print debug messages
debug_pages_limit = 13
pages_processed = 0

for i in range(0, max_iter_pgs):
    driver.get(pagination_url.format(job, '', i * 10))

    print(f"Page {i+1} loaded successfully.")

    sleep(randint(2, 4))

    job_page = driver.find_element(By.ID, "mosaic-jobResults")
    jobs = job_page.find_elements(By.CLASS_NAME, "job_seen_beacon")

    for jj in jobs:
        job_title = jj.find_element(By.CLASS_NAME, "jobTitle")

        print(f"Processing job: {job_title.text}")

        job_lst.append([
            job_title.text,
            job_title.find_element(By.CSS_SELECTOR, "a").get_attribute("href"),
            job_title.find_element(By.CSS_SELECTOR, "a").get_attribute("id"),
            try_except(lambda: jj.find_element(By.CLASS_NAME, "companyName").text, None),
            job_title.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
        ])

        try:
            salary_list.append(jj.find_element(By.CLASS_NAME, "salary-snippet-container").text)
        except NoSuchElementException:
            try:
                salary_list.append(jj.find_element(By.CLASS_NAME, "estimated-salary").text)
            except NoSuchElementException:
                salary_list.append(None)

    # Increment the pages processed counter
    pages_processed += 1

    # Check if we have reached the debug pages limit
    if pages_processed >= debug_pages_limit:
        print(f"Debug messages truncated after processing {debug_pages_limit} pages.")
        break

driver.quit()

end = time.time()

print(end - start, 'seconds to complete Query!')


Page 1 loaded successfully.
Processing job: Senior Python Developer (Fast Api/Micro services)
Processing job: Python Developer
Processing job: Python Developer , Associate
Processing job: Python Developer
Processing job: Python Flask Developer
Processing job: Python Developer
Processing job: Python Developer
Processing job: Python Developer
Processing job: Python Developer
Processing job: Python Developer & Data
Processing job: Python Developer
Processing job: Python Developer
Processing job: Python Developer
Processing job: Devops Engineer
Processing job: Python Developer (Ahmedabad)
Page 2 loaded successfully.
Processing job: Senior Python Developer (Fast Api/Micro services)
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Processing job: 
Page 3 loaded successfully.
Processing job: Senior Python Developer (Fast Ap

In [95]:
job_lst[0:2]
salary_list[0:3]

['₹3,00,000 - ₹8,00,000 a year', '₹45,000 - ₹65,000 a month', None]

In [76]:
salary_list[0:3]

[None, None, None]

In [None]:
start = time.time()

job = 'Data+Analyst'

job_lst = []
salary_list = []

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
sleep(randint(2, 6))

for i in range(0, max_iter_pgs):
    driver.get(paginaton_url.format(job, '', i * 10))

    sleep(randint(2, 8))

    job_page = driver.find_element(By.ID, "mosaic-jobResults")
    jobs = job_page.find_elements(By.CLASS_NAME, "job_seen_beacon")

    for jj in jobs:
        job_title = jj.find_element(By.CLASS_NAME, "jobTitle")

        job_lst.append([
            job_title.text,
            job_title.find_element(By.CSS_SELECTOR, "a").get_attribute("href"),
            job_title.find_element(By.CSS_SELECTOR, "a").get_attribute("id"),
            jj.find_element(By.CLASS_NAME, "companyName").text,
            jj.find_element(By.CLASS_NAME, "companyLocation").text,
            jj.find_element(By.CLASS_NAME, "date").text,
            job_title.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
        ])

        try:
            salary_list.append(jj.find_element(By.CLASS_NAME, "salary-snippet-container").text)
        except NoSuchElementException:
            try:
                salary_list.append(jj.find_element(By.CLASS_NAME, "estimated-salary").text)
            except NoSuchElementException:
                salary_list.append(None)

driver.quit()

end = time.time()

print(end - start, 'seconds to complete Query!')


In [None]:
start = time.time()

job = 'Data+Analyst'

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)

driver.get(pagination_url.format(job, 0))

# Add a sleep to simulate human-like interactions
sleep(randint(2, 6))

# Find the job count element
job_count_element = driver.find_element(By.CLASS_NAME, 'jobsearch-JobCountAndSortPane-jobCount')

# Extract the job count text
job_count_text = job_count_element.text

# Extract the numerical job count, removing commas
job_count = int(job_count_text.split(' ')[0].replace(',', ''))

# Calculate the maximum iterable pages
max_iter_pgs = job_count // 15

# Initialize lists to store job information
job_titles = []
job_links = []
company_names = []
locations = []
posting_dates = []

# Iterate through multiple pages of job listings
for i in range(0, max_iter_pgs):
    driver.get(pagination_url.format(job, i * 10))

    # Add a sleep to simulate human-like interactions
    sleep(randint(2, 4))

    job_page = driver.find_element(By.ID, "mosaic-jobResults")
    jobs = job_page.find_elements(By.CLASS_NAME, "job_seen_beacon")

    # Extract job details and store them in lists
    for jj in jobs:
        job_title = jj.find_element(By.CLASS_NAME, "jobTitle")
        job_titles.append(job_title.text)
        job_links.append(job_title.find_element(By.CSS_SELECTOR, "a").get_attribute("href"))
        company_names.append(jj.find_element(By.CLASS_NAME, "companyName").text)
        locations.append(jj.find_element(By.CLASS_NAME, "companyLocation").text)
        posting_dates.append(jj.find_element(By.CLASS_NAME, "date").text)

# Close the WebDriver session
driver.quit()

end = time.time()

print(end - start, 'seconds to complete action!')
print('-----------------------')
print('Job Titles:', job_titles)
print('Job Links:', job_links)
print('Company Names:', company_names)
print('Locations:', locations)
print('Posting Dates:', posting_dates)


In [None]:
# Pagination: PRACTICE

start = time.time()


job_='Data+Engineer'
location='Washington'


job_lst=[]
job_description_list_href=[]

# job_description_list = []
salary_list=[]


driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
                         options=option)
sleep(randint(2, 6))

# driver.get("https://www.indeed.com/q-USA-jobs.html")

for i in range(0,max_iter_pgs):
    driver.get(paginaton_url.format(job_,location,i*10))
    
    
    sleep(randint(2, 4))

    job_page = driver.find_element(By.ID,"mosaic-jobResults")
    jobs = job_page.find_elements(By.CLASS_NAME,"job_seen_beacon") # return a list

    for jj in jobs:
        job_title = jj.find_element(By.CLASS_NAME,"jobTitle")
#         print(job_title.text)
        
# Href's to get full job description (need to re-terate to get full info)
# Reference ID for each job used by indeed         
# Finding the company name        
# Location
# Posting date
# Job description

        job_lst.append([job_title.text,
        job_title.find_element(By.CSS_SELECTOR,"a").get_attribute("href"),
        job_title.find_element(By.CSS_SELECTOR,"a").get_attribute("id"),      
        jj.find_element(By.CLASS_NAME,"companyName").text,       
        jj.find_element(By.CLASS_NAME,"companyLocation").text,
        jj.find_element(By.CLASS_NAME,"date").text,
        job_title.find_element(By.CSS_SELECTOR,"a").get_attribute("href")])
        

        try: # I removed the metadata attached to this class name to work!
            salary_list.append(jj.find_element(By.CLASS_NAME,"salary-snippet-container").text)

        except NoSuchElementException: 
            try: 
                salary_list.append(jj.find_element(By.CLASS_NAME,"estimated-salary").text)
                
            except NoSuchElementException:
                salary_list.append(None)
      
                
#         # Click the job element to get the description
#         job_title.click()
        
#         # Help to load page so we can find and extract data
#         sleep(randint(3, 5))

#         try: 
#             job_description_list.append(driver.find_element(By.ID,"jobDescriptionText").text)
            
#         except: 
            
#             job_description_list.append(None)

driver.quit() 


end = time.time()

print(end - start,'seconds to complete Query!')

# alternate way to grab the info for job description to make it faster:


In [None]:
# Pagination: PRACTICE

start = time.time()


job_='Data+Engineer'
location='Washington'


job_lst=[]
job_description_list_href=[]

# job_description_list = []
salary_list=[]


driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),
                         options=option)
sleep(randint(2, 6))

# driver.get("https://www.indeed.com/q-USA-jobs.html")

for i in range(0,max_iter_pgs):
    driver.get(paginaton_url.format(job_,location,i*10))
    
    
    sleep(randint(2, 4))

    job_page = driver.find_element(By.ID,"mosaic-jobResults")
    jobs = job_page.find_elements(By.CLASS_NAME,"job_seen_beacon") # return a list

    for jj in jobs:
        job_title = jj.find_element(By.CLASS_NAME,"jobTitle")
#         print(job_title.text)
        
# Href's to get full job description (need to re-terate to get full info)
# Reference ID for each job used by indeed         
# Finding the company name        
# Location
# Posting date
# Job description

        job_lst.append([job_title.text,
        job_title.find_element(By.CSS_SELECTOR,"a").get_attribute("href"),
        job_title.find_element(By.CSS_SELECTOR,"a").get_attribute("id"),      
        jj.find_element(By.CLASS_NAME,"companyName").text,       
        jj.find_element(By.CLASS_NAME,"companyLocation").text,
        jj.find_element(By.CLASS_NAME,"date").text,
        job_title.find_element(By.CSS_SELECTOR,"a").get_attribute("href")])
        

        try: # I removed the metadata attached to this class name to work!
            salary_list.append(jj.find_element(By.CLASS_NAME,"salary-snippet-container").text)

        except NoSuchElementException: 
            try: 
                salary_list.append(jj.find_element(By.CLASS_NAME,"estimated-salary").text)
                
            except NoSuchElementException:
                salary_list.append(None)
      
                
#         # Click the job element to get the description
#         job_title.click()
        
#         # Help to load page so we can find and extract data
#         sleep(randint(3, 5))

#         try: 
#             job_description_list.append(driver.find_element(By.ID,"jobDescriptionText").text)
            
#         except: 
            
#             job_description_list.append(None)

driver.quit() 


end = time.time()

print(end - start,'seconds to complete Query!')

# alternate way to grab the info for job description to make it faster:


In [16]:
job_lst[0:2]

[]

[]

In [81]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

# Specify the path to the ChromeDriver executable
chrome_driver_path = 'C:\Users\Aviral Tanwar\Downloads\chrome-win64\chrome-win64'  # Update with the actual path

# Set up the ChromeService with the specified path
chrome_service = ChromeService(executable_path=chrome_driver_path)

# Set up the webdriver using the ChromeService
driver = webdriver.Chrome(service=chrome_service)

# Open the webpage
driver.get('https://in.indeed.com/jobs?q=Python+developer&l=&from=searchOnHP&vjk=71d709239fecfc67')

# Wait for the page to load (you may need to adjust the wait time)
driver.implicitly_wait(10)

# Use Selenium to interact with the page and extract salaries
salaries = driver.find_elements_by_css_selector('your_salary_selector')

for salary in salaries:
    print(salary.text)

# Close the browser
driver.quit()


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1196221903.py, line 5)