## Web Scraping Indeed for Data Analysts: Mastering the Data Collection Process

In [1]:
# Import necessary libraries
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

In [2]:
# Set up Chrome options to customize the browser behavior
options = webdriver.ChromeOptions() 

# Set user-agent to mimic a browser behavior
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36')

# Initialize a Chrome WebDriver instance with customized options
driver = webdriver.Chrome(options=options) 

# URL of the webpage to scrape
url = "https://ng.indeed.com/jobs?q=data+analyst&l=Nigeria&from=searchOnHP&vjk=6df200c744f62577"

# Open the URL in the Chrome WebDriver instance
driver.get(url)

# Sleep for 5 seconds to ensure the page loads completely before scraping
sleep(5)

# Get the HTML source code of the page after it has fully loaded
html = driver.page_source

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find all job listings on the page
job_listings = soup.find_all('div', class_='job_seen_beacon')

# Initialize a list to store scraped records
records = []

In [3]:
def get_data(job_listing):
    # Extract job title
    title = job_listing.find("a").find("span").text.strip()
    
    # Extract company name if available, otherwise assign an empty string
    try:
        company = job_listing.find('span', class_='css-92r8pb eu4oa1w0').text.strip()
    except AttributeError:
        company = ''
    
    # Extract job location if available, otherwise assign an empty string
    try:
        location  = job_listing.find('div', class_='css-1p0sjhy eu4oa1w0').text.strip()
    except AttributeError:
        location = ''
        
    # Extract salary information if available, otherwise assign an empty string
    try:
        salary  = job_listing.find('div', class_='metadata salary-snippet-container css-5zy3wz eu4oa1w0').text.strip()
    except AttributeError:
        salary = ''
    
    # Extract job type if available, otherwise assign an empty string
    try:
        job_type = job_listing.find('div', class_='metadata css-5zy3wz eu4oa1w0').text.strip()
    except AttributeError:
        job_type = ''
    
    # Extract date posted
    date_posted = job_listing.find('span', class_='css-qvloho eu4oa1w0').text.strip()
    
    # Extract job summary
    summary = job_listing.find('div', class_='css-9446fg eu4oa1w0').text.strip()
    
    # Return a tuple containing all the extracted information
    return (title, company, location, salary, job_type, date_posted, summary)

In [4]:
# Loop to scrape data from multiple pages until there are no more pages available
while True:
    try:
        # Extract the URL of the next page if available
        url = 'https://ng.indeed.com/' + soup.find('a', {'aria-label':'Next Page'}).get('href')
    except AttributeError:
        # If there are no more pages available, break the loop
        break
    
    # Open the next page in the browser
    driver.get(url)
    
    # Get the HTML source code of the next page
    html = driver.page_source
    
    # Parse the HTML of the next page using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Find all job listings on the next page
    job_listings = soup.find_all('div', class_='job_seen_beacon') 

    # Iterate through each job listing on the page
    for job_listing in job_listings:
        # Extract data from the current job listing
        record = get_data(job_listing)
        
        # Append the extracted data to the records list
        records.append(record)

# Close the Chrome WebDriver instance
driver.quit()

In [5]:
# Convert list of records into a DataFrame
df = pd.DataFrame(records, columns=['Title', 'Company', 'Location', 'Salary', 'Job Type', 'Date Posted', 'Summary'])

# Save DataFrame to a CSV file
df.to_csv('indeed_job_data.csv', index=False)

print("Data saved to job_data.csv")

Data saved to job_data.csv


In [6]:
df

Unnamed: 0,Title,Company,Location,Salary,Job Type,Date Posted,Summary
0,"PO, Research & Data Analyst",The Elevation Church,Nigeria,,Full-time,PostedJust posted,Train volunteers on the use of M&E frameworks ...
1,QA Tester - Intern,mDoc Healthcare,Lagos,,,EmployerActive 6 days ago,Advanced knowledge of data security and encryp...
2,Data Analyst - Healthcare,eMedicStore,Lagos,,,EmployerActive 5 days ago,Implement new data analysis methodologies and ...
3,HR Analyst,Jobrole Consulting Limited,Ikeja,"₦150,000 - ₦200,000 a month",Full-time,PostedPosted 2 days ago,Analyze and interpret data to generate actiona...
4,Business Intelligence Analyst,TSL Metroline Limited,Lagos,,,PostedPosted 1 day ago,Analyse business information/ data to identify...
...,...,...,...,...,...,...,...
198,"Senior Product Manager - Nerve (Lagos, Nigeria)",Kuda Technologies Ltd,Lagos,,Full-time,EmployerActive 9 days ago,"You will collaborate with engineers, designers..."
199,Robotic Process Automation (RPA),Oscar Temple,Lagos,,Full-time,EmployerActive 5 days ago,∙Collaborate with business analysts and stakeh...
200,Accounting Operations Manager,Deloitte Human Capital Consulting West Africa,Ibadan,,,PostedPosted 30+ days ago,Manage a team of analysts and officers to achi...
201,SAP Support Officer (Human Resources Management),Dangote Group,Lagos,,Full-time,PostedPosted 30+ days ago,Detailed knowledge of the SAP HCM master data....
