# Libraries and Requirements

In [None]:
!pip install pandas selenium webdriver_manager --quiet --upgrade

In [None]:
# standard library
import os
import re
from datetime import datetime
from getpass import getpass
from time import sleep

# dataframe manipulation
import numpy as np
import pandas as pd

# selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException

# webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Scraping Data using Selenium

Naukri's URL...

In [None]:
# Page that the test browser opens first.
URL = 'https://www.naukri.com/'

Install the Chrome webdriver and open the URL in a test browser...

In [None]:
chrome_options = webdriver.ChromeOptions()
# Resolve the chromedriver binary through webdriver_manager and wrap it in a Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options = chrome_options, service = chrome_service)

# Open the landing page, then pause for a second before interacting with it.
driver.get(URL)
sleep(1)

Click the login button to open the login form...

In [None]:
# Locate the login entry first, then click it and pause for a second.
login_button = driver.find_element(By.XPATH, '//*[@id="login_Layer"]')
login_button.click()
sleep(1)

Locate the input fields for the credentials...

In [None]:
# Both inputs are addressed through their placeholder text.
username_selector = 'input[placeholder="Enter your active Email ID / Username"]'
password_selector = 'input[placeholder="Enter your password"]'

username = driver.find_element(By.CSS_SELECTOR, username_selector)
password = driver.find_element(By.CSS_SELECTOR, password_selector)

In [None]:
# Empty both inputs before typing into them.
for field in (username, password):
    field.clear()

Enter the credentials...

In [None]:
# Credentials must not be hardcoded in the notebook: read them from the
# environment and fall back to an interactive prompt (getpass does not echo
# the password, so it never ends up in the cell output).
naukri_username = os.environ.get('NAUKRI_USERNAME') or input('Naukri email / username: ')
naukri_password = os.environ.get('NAUKRI_PASSWORD') or getpass('Naukri password: ')

username.send_keys(naukri_username)
password.send_keys(naukri_password)

Clicking the `Login` button...

In [None]:
# Submit the login form and pause for two seconds afterwards.
submit_button = driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]')
submit_button.click()
sleep(2)

Click on the `Search jobs here` search bar...

In [None]:
# Click the search icon button in the page header.
search_icon = driver.find_element(By.CSS_SELECTOR, 'button[class="nI-gNb-sb__icon-wrapper"]')
search_icon.click()

Locate the search input field...

In [None]:
# The search input is addressed through its placeholder text.
search_field_selector = 'input[placeholder="Enter keyword / designation / companies"]'
placeholder = driver.find_element(By.CSS_SELECTOR, search_field_selector)

Take the search term as input from the user...

In [None]:
# Ask interactively for the search term.
prompt_text = 'Enter Keyword / Designation / Companies:'
user_inputs = input(prompt_text)

Type the user's input into the search field...

In [None]:
# Type the search term captured from the user into the search field.
placeholder.send_keys(user_inputs)
# Fixed two-second pause after typing.
sleep(2)

Clicking the search button...

In [None]:
# Click the search icon button again to run the search, then pause for three seconds.
search_button = driver.find_element(By.CSS_SELECTOR, 'button[class="nI-gNb-sb__icon-wrapper"]')
search_button.click()
sleep(3)

Scrape the jobs from the first N result pages...

In [None]:
jobs = []
N = 3  # maximum number of result pages to scrape

for page in range(N):
    # Read each card's text right after locating it, so the elements have no
    # time to go stale between lookup and access.
    job_cards = driver.find_elements(By.CLASS_NAME, 'srp-jobtuple-wrapper')
    jobs.extend(card.text for card in job_cards)

    # The last requested page has been scraped; do not navigate any further.
    if page == N - 1:
        break

    try:
        driver.find_element(By.XPATH, '//*[@id="lastCompMark"]/a[2]').click()
    except (NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException):
        # No usable link to a further page (e.g. fewer than N pages of
        # results): keep what has been collected instead of failing the cell.
        print(f'Stopped after page {page + 1}: no further results page could be opened.')
        break
    sleep(3)

Parsing the required details...

In [None]:
# parsing the required data
# parsing the required data
designation_name = []
company_name = []
experience = []
salary = []
location = []

# The patterns are compiled once, not on every loop iteration.
experience_pattern = re.compile(r'(\d+)-(\d+) Yrs')
# Captures the (min, max) of a salary range. The unit alternatives are grouped,
# so the range prefix applies to "Lac", "Lacs" and "LPA" alike.
salary_range_pattern = re.compile(r'([\d.]+)-([\d.]+) (?:Lacs?|LPA)')
# Marks the line of a card that carries the salary, whether or not a range is disclosed.
salary_line_pattern = re.compile(r'[\d.]+-[\d.]+ Lac|Lacs|LPA|Not disclosed')

for job in jobs:
    content = job.split('\n')
    # appending parsed data into unique lists
    designation_name.append(content[0])
    # a card with a single line of text has no company line
    company_name.append(content[1] if len(content) > 1 else '')

    # first "<min>-<max> Yrs" occurrence anywhere in the card, as a (min, max) tuple
    experience_match = experience_pattern.search(job)
    experience.append(experience_match.groups() if experience_match else '')

    # index of the first line that carries salary information, if any
    salary_index = next(
        (i for i, line in enumerate(content) if salary_line_pattern.search(line)),
        None,
    )

    if salary_index is None:
        # no salary line: neither salary nor location can be determined
        salary.append('')
        location.append('')
    else:
        # (min, max) tuple when a range is given, '' otherwise (e.g. "Not disclosed")
        salary_match = salary_range_pattern.search(content[salary_index])
        salary.append(salary_match.groups() if salary_match else '')
        # the location is taken from the line right after the salary line, when there is one
        location.append(content[salary_index + 1] if salary_index + 1 < len(content) else '')

    # Print the extracted information
    print('____________________________________________________________')
    print(f'Designation: {designation_name[-1]}')
    print(f'Company: {company_name[-1]}')
    print(f'Experience: {experience[-1]}')
    print(f'Salary: {salary[-1]}')
    print(f'Location: {location[-1]}')
    print('____________________________________________________________\n')

# Converting the entire process into a function

In [None]:
# The patterns are compiled once at definition time instead of once per job card.
_EXPERIENCE_PATTERN = re.compile(r'(\d+)-(\d+) Yrs')
# Captures the (min, max) of a salary range. The unit alternatives are grouped,
# so the range prefix applies to "Lac", "Lacs" and "LPA" alike.
_SALARY_RANGE_PATTERN = re.compile(r'([\d.]+)-([\d.]+) (?:Lacs?|LPA)')
# Marks the card line that carries the salary, whether or not a range is disclosed.
_SALARY_LINE_PATTERN = re.compile(r'[\d.]+-[\d.]+ Lac|Lacs|LPA|Not disclosed')
# Column order of the frame returned by scrape_naukri.
_COLUMNS = ['Designation Name', 'Company Name', 'Experience', 'Salary', 'Location']


def scrape_naukri(designation, input_username, input_password, N=10):
    """Log in to Naukri, search for `designation` and scrape up to `N` result pages.

    Parameters
    ----------
    designation : str
        Keyword / designation / company typed into the search field.
    input_username : str
        Email ID or username typed into the login form.
    input_password : str
        Password typed into the login form.
    N : int, default 10
        Maximum number of result pages to scrape. Scraping stops earlier when
        no further page can be opened.

    Returns
    -------
    pandas.DataFrame
        One row per scraped job card with the columns 'Designation Name',
        'Company Name', 'Experience', 'Salary' and 'Location'. 'Experience'
        and 'Salary' hold (min, max) tuples of strings, ('', '') when no range
        was found. The other columns hold strings, '' when unavailable.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If an element of the login or search flow cannot be located. The
        browser is closed before the exception propagates.
    """
    chrome_options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    try:
        _login(driver, input_username, input_password)
        _search(driver, designation)
        jobs = _collect_job_texts(driver, N)
    finally:
        # The driver is local to this function, so nobody else can close it:
        # always release the browser, even when a step above fails.
        driver.quit()

    # Passing the columns explicitly keeps the frame's shape stable for zero jobs.
    return pd.DataFrame([_parse_job_card(job) for job in jobs], columns=_COLUMNS)


def _login(driver, input_username, input_password):
    """Open the landing page and submit the login form with the given credentials."""
    driver.get('https://www.naukri.com/')
    sleep(1)
    driver.find_element(By.XPATH, '//*[@id="login_Layer"]').click()
    sleep(1)
    username = driver.find_element(By.CSS_SELECTOR, 'input[placeholder="Enter your active Email ID / Username"]')
    password = driver.find_element(By.CSS_SELECTOR, 'input[placeholder="Enter your password"]')
    username.clear()
    password.clear()
    username.send_keys(input_username)
    password.send_keys(input_password)
    driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]').click()
    sleep(5)


def _search(driver, designation):
    """Type `designation` into the search field and run the search."""
    driver.find_element(By.CSS_SELECTOR, 'button[class="nI-gNb-sb__icon-wrapper"]').click()
    placeholder = driver.find_element(By.CSS_SELECTOR, 'input[placeholder="Enter keyword / designation / companies"]')
    placeholder.send_keys(designation)
    driver.find_element(By.CSS_SELECTOR, 'button[class="nI-gNb-sb__icon-wrapper"]').click()
    sleep(3)


def _collect_job_texts(driver, N):
    """Return the visible text of every job card on up to `N` consecutive result pages."""
    next_page_errors = (NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException)
    jobs = []
    for page in range(N):
        # Read each card's text right after locating it, so the elements have
        # no time to go stale between lookup and access.
        job_cards = driver.find_elements(By.CLASS_NAME, 'srp-jobtuple-wrapper')
        jobs.extend(card.text for card in job_cards)

        # The last requested page has been scraped; do not navigate any further.
        if page == N - 1:
            break
        try:
            driver.find_element(By.XPATH, '//*[@id="lastCompMark"]/a[2]').click()
        except next_page_errors:
            # No usable link to a further page (e.g. fewer than N pages of
            # results): return what has been collected so far.
            break
        sleep(3)
    return jobs


def _parse_job_card(text):
    """Split the visible text of one job card into a record keyed by the output columns."""
    lines = text.split('\n')

    # first "<min>-<max> Yrs" occurrence anywhere in the card
    experience_match = _EXPERIENCE_PATTERN.search(text)

    # index of the first line that carries salary information, if any
    salary_index = next(
        (i for i, line in enumerate(lines) if _SALARY_LINE_PATTERN.search(line)),
        None,
    )

    salary = ('', '')
    location = ''
    if salary_index is not None:
        salary_match = _SALARY_RANGE_PATTERN.search(lines[salary_index])
        if salary_match:
            salary = salary_match.groups()
        # the location is taken from the line right after the salary line, when there is one
        if salary_index + 1 < len(lines):
            location = lines[salary_index + 1]

    return {
        'Designation Name': lines[0],
        # a card with a single line of text has no company line
        'Company Name': lines[1] if len(lines) > 1 else '',
        'Experience': experience_match.groups() if experience_match else ('', ''),
        'Salary': salary,
        'Location': location,
    }

Create a `DataFrame` for the Data Scientist role...

In [None]:
# Placeholder credentials: replace them with a real account before running.
DataFrame = scrape_naukri(
    designation='Data Scientist',
    input_username='dummy_username',
    input_password='dummy_password',
)

In [None]:
# Bare last expression: the notebook renders the scraped frame as the cell output.
DataFrame

In [None]:
# Date-stamped file name (naukri_<YYYY-MM-DD>.csv) for today's scrape.
date_stamp = datetime.now().strftime('%Y-%m-%d')
file_name = f"naukri_{date_stamp}.csv"

# Write the scraped frame to that file in the current working directory.
DataFrame.to_csv(path_or_buf=file_name)